diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,50936 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999784343325426, + "eval_steps": 25, + "global_step": 6954, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00043131334914815614, + "grad_norm": 0.4778429865837097, + "learning_rate": 0.0, + "loss": 2.4396, + "step": 1 + }, + { + "epoch": 0.0008626266982963123, + "grad_norm": 0.6504954695701599, + "learning_rate": 2.9999999999999997e-06, + "loss": 2.5284, + "step": 2 + }, + { + "epoch": 0.0012939400474444684, + "grad_norm": 0.4514806568622589, + "learning_rate": 5.999999999999999e-06, + "loss": 2.3895, + "step": 3 + }, + { + "epoch": 0.0017252533965926246, + "grad_norm": 0.4726278483867645, + "learning_rate": 8.999999999999999e-06, + "loss": 2.5381, + "step": 4 + }, + { + "epoch": 0.0021565667457407807, + "grad_norm": 0.46150511503219604, + "learning_rate": 1.1999999999999999e-05, + "loss": 2.401, + "step": 5 + }, + { + "epoch": 0.002587880094888937, + "grad_norm": 0.47438162565231323, + "learning_rate": 1.4999999999999999e-05, + "loss": 2.5229, + "step": 6 + }, + { + "epoch": 0.003019193444037093, + "grad_norm": 0.5883695483207703, + "learning_rate": 1.7999999999999997e-05, + "loss": 2.4581, + "step": 7 + }, + { + "epoch": 0.003450506793185249, + "grad_norm": 0.6027829647064209, + "learning_rate": 2.1e-05, + "loss": 2.4734, + "step": 8 + }, + { + "epoch": 0.0038818201423334053, + "grad_norm": 289971.125, + "learning_rate": 2.3999999999999997e-05, + "loss": 2.4135, + "step": 9 + }, + { + "epoch": 0.004313133491481561, + "grad_norm": 0.46024876832962036, + "learning_rate": 2.6999999999999996e-05, + "loss": 2.5216, + "step": 10 + }, + { + "epoch": 0.004744446840629717, + "grad_norm": 0.3895394206047058, + "learning_rate": 2.9999999999999997e-05, + "loss": 2.4507, + "step": 11 + }, + { + "epoch": 0.005175760189777874, + "grad_norm": 0.47679391503334045, + "learning_rate": 3.2999999999999996e-05, + "loss": 2.631, + "step": 12 + }, + { + "epoch": 0.005607073538926029, + "grad_norm": 0.37820500135421753, + "learning_rate": 3.5999999999999994e-05, + "loss": 2.2789, + "step": 13 + }, + { + "epoch": 0.006038386888074186, + "grad_norm": 0.30506932735443115, + "learning_rate": 3.9e-05, + "loss": 2.3152, + "step": 14 + }, + { + "epoch": 0.006469700237222342, + "grad_norm": 0.23102450370788574, + "learning_rate": 4.2e-05, + "loss": 2.485, + "step": 15 + }, + { + "epoch": 0.006901013586370498, + "grad_norm": 19.553178787231445, + "learning_rate": 4.4999999999999996e-05, + "loss": 2.4329, + "step": 16 + }, + { + "epoch": 0.007332326935518654, + "grad_norm": 0.17712759971618652, + "learning_rate": 4.7999999999999994e-05, + "loss": 2.3715, + "step": 17 + }, + { + "epoch": 0.0077636402846668106, + "grad_norm": 0.14594106376171112, + "learning_rate": 5.1e-05, + "loss": 2.1465, + "step": 18 + }, + { + "epoch": 0.008194953633814967, + "grad_norm": 0.1296325922012329, + "learning_rate": 5.399999999999999e-05, + "loss": 2.3163, + "step": 19 + }, + { + "epoch": 0.008626266982963123, + "grad_norm": 0.13177795708179474, + "learning_rate": 5.6999999999999996e-05, + "loss": 2.4841, + "step": 20 + }, + { + "epoch": 0.009057580332111279, + "grad_norm": 0.11691755056381226, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.2407, + "step": 21 + }, + { + "epoch": 0.009488893681259434, + "grad_norm": 0.10889497399330139, + "learning_rate": 6.299999999999999e-05, + "loss": 2.3919, + "step": 22 + }, + { + "epoch": 0.009920207030407592, + "grad_norm": 0.1357669234275818, + "learning_rate": 6.599999999999999e-05, + "loss": 2.6173, + "step": 23 + }, + { + "epoch": 0.010351520379555747, + "grad_norm": 216.71868896484375, + "learning_rate": 6.9e-05, + "loss": 2.3811, + "step": 24 + }, + { + "epoch": 0.010782833728703903, + "grad_norm": 0.12138977646827698, + "learning_rate": 7.199999999999999e-05, + "loss": 2.2775, + "step": 25 + }, + { + "epoch": 0.010782833728703903, + "eval_loss": 2.243220806121826, + "eval_runtime": 199.0847, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 25 + }, + { + "epoch": 0.011214147077852059, + "grad_norm": 0.13480618596076965, + "learning_rate": 7.5e-05, + "loss": 2.422, + "step": 26 + }, + { + "epoch": 0.011645460427000216, + "grad_norm": 0.11292761564254761, + "learning_rate": 7.8e-05, + "loss": 2.3631, + "step": 27 + }, + { + "epoch": 0.012076773776148372, + "grad_norm": 0.10692216455936432, + "learning_rate": 8.1e-05, + "loss": 2.4068, + "step": 28 + }, + { + "epoch": 0.012508087125296528, + "grad_norm": 0.10275225341320038, + "learning_rate": 8.4e-05, + "loss": 2.1603, + "step": 29 + }, + { + "epoch": 0.012939400474444683, + "grad_norm": 0.10391239821910858, + "learning_rate": 8.699999999999999e-05, + "loss": 2.2908, + "step": 30 + }, + { + "epoch": 0.01337071382359284, + "grad_norm": 0.10926996171474457, + "learning_rate": 8.999999999999999e-05, + "loss": 2.4315, + "step": 31 + }, + { + "epoch": 0.013802027172740997, + "grad_norm": 0.11844368278980255, + "learning_rate": 9.3e-05, + "loss": 2.4053, + "step": 32 + }, + { + "epoch": 0.014233340521889152, + "grad_norm": 0.11695211380720139, + "learning_rate": 9.599999999999999e-05, + "loss": 2.3072, + "step": 33 + }, + { + "epoch": 0.014664653871037308, + "grad_norm": 0.13541661202907562, + "learning_rate": 9.9e-05, + "loss": 2.106, + "step": 34 + }, + { + "epoch": 0.015095967220185465, + "grad_norm": 107.78008270263672, + "learning_rate": 0.000102, + "loss": 2.1436, + "step": 35 + }, + { + "epoch": 0.015527280569333621, + "grad_norm": 0.11143426597118378, + "learning_rate": 0.00010499999999999999, + "loss": 2.1563, + "step": 36 + }, + { + "epoch": 0.015958593918481777, + "grad_norm": 0.12790513038635254, + "learning_rate": 0.00010799999999999998, + "loss": 2.3599, + "step": 37 + }, + { + "epoch": 0.016389907267629934, + "grad_norm": 0.24229316413402557, + "learning_rate": 0.00011099999999999999, + "loss": 2.2941, + "step": 38 + }, + { + "epoch": 0.016821220616778088, + "grad_norm": 0.12360761314630508, + "learning_rate": 0.00011399999999999999, + "loss": 2.3284, + "step": 39 + }, + { + "epoch": 0.017252533965926246, + "grad_norm": 0.10814772546291351, + "learning_rate": 0.000117, + "loss": 2.3827, + "step": 40 + }, + { + "epoch": 0.017683847315074403, + "grad_norm": 0.12808896601200104, + "learning_rate": 0.00011999999999999999, + "loss": 2.2024, + "step": 41 + }, + { + "epoch": 0.018115160664222557, + "grad_norm": 0.11168445646762848, + "learning_rate": 0.00012299999999999998, + "loss": 2.3587, + "step": 42 + }, + { + "epoch": 0.018546474013370715, + "grad_norm": 0.1182243749499321, + "learning_rate": 0.00012599999999999997, + "loss": 2.3013, + "step": 43 + }, + { + "epoch": 0.01897778736251887, + "grad_norm": 0.1579306572675705, + "learning_rate": 0.000129, + "loss": 2.4032, + "step": 44 + }, + { + "epoch": 0.019409100711667026, + "grad_norm": 0.10591604560613632, + "learning_rate": 0.00013199999999999998, + "loss": 2.2099, + "step": 45 + }, + { + "epoch": 0.019840414060815183, + "grad_norm": 0.10893277078866959, + "learning_rate": 0.000135, + "loss": 2.2341, + "step": 46 + }, + { + "epoch": 0.020271727409963337, + "grad_norm": 0.10329752415418625, + "learning_rate": 0.000138, + "loss": 2.2163, + "step": 47 + }, + { + "epoch": 0.020703040759111495, + "grad_norm": 0.09824061393737793, + "learning_rate": 0.00014099999999999998, + "loss": 2.3557, + "step": 48 + }, + { + "epoch": 0.021134354108259652, + "grad_norm": 0.12618935108184814, + "learning_rate": 0.00014399999999999998, + "loss": 2.3195, + "step": 49 + }, + { + "epoch": 0.021565667457407806, + "grad_norm": 0.11831948906183243, + "learning_rate": 0.000147, + "loss": 2.624, + "step": 50 + }, + { + "epoch": 0.021565667457407806, + "eval_loss": 2.210568428039551, + "eval_runtime": 197.8736, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 50 + }, + { + "epoch": 0.021996980806555964, + "grad_norm": 0.33164137601852417, + "learning_rate": 0.00015, + "loss": 2.3776, + "step": 51 + }, + { + "epoch": 0.022428294155704118, + "grad_norm": 0.13195934891700745, + "learning_rate": 0.00014999999223521492, + "loss": 2.3575, + "step": 52 + }, + { + "epoch": 0.022859607504852275, + "grad_norm": 0.11495663225650787, + "learning_rate": 0.00014999996894086122, + "loss": 2.2736, + "step": 53 + }, + { + "epoch": 0.023290920854000433, + "grad_norm": 0.13961608707904816, + "learning_rate": 0.0001499999301169438, + "loss": 2.2999, + "step": 54 + }, + { + "epoch": 0.023722234203148587, + "grad_norm": 0.14567121863365173, + "learning_rate": 0.00014999987576347067, + "loss": 1.983, + "step": 55 + }, + { + "epoch": 0.024153547552296744, + "grad_norm": 0.14206400513648987, + "learning_rate": 0.00014999980588045305, + "loss": 2.5026, + "step": 56 + }, + { + "epoch": 0.0245848609014449, + "grad_norm": 0.17789866030216217, + "learning_rate": 0.00014999972046790544, + "loss": 2.2247, + "step": 57 + }, + { + "epoch": 0.025016174250593055, + "grad_norm": 0.13413818180561066, + "learning_rate": 0.00014999961952584555, + "loss": 2.3253, + "step": 58 + }, + { + "epoch": 0.025447487599741213, + "grad_norm": 0.13895240426063538, + "learning_rate": 0.00014999950305429424, + "loss": 2.2533, + "step": 59 + }, + { + "epoch": 0.025878800948889367, + "grad_norm": 0.11061283200979233, + "learning_rate": 0.00014999937105327564, + "loss": 2.1234, + "step": 60 + }, + { + "epoch": 0.026310114298037524, + "grad_norm": 0.14688056707382202, + "learning_rate": 0.00014999922352281712, + "loss": 2.4367, + "step": 61 + }, + { + "epoch": 0.02674142764718568, + "grad_norm": 0.16806663572788239, + "learning_rate": 0.00014999906046294915, + "loss": 2.0712, + "step": 62 + }, + { + "epoch": 0.027172740996333836, + "grad_norm": 0.14308182895183563, + "learning_rate": 0.00014999888187370553, + "loss": 2.4362, + "step": 63 + }, + { + "epoch": 0.027604054345481993, + "grad_norm": 0.1595005989074707, + "learning_rate": 0.00014999868775512326, + "loss": 2.3226, + "step": 64 + }, + { + "epoch": 0.028035367694630147, + "grad_norm": 0.12989497184753418, + "learning_rate": 0.00014999847810724253, + "loss": 2.2128, + "step": 65 + }, + { + "epoch": 0.028466681043778305, + "grad_norm": 0.13269419968128204, + "learning_rate": 0.00014999825293010675, + "loss": 2.1651, + "step": 66 + }, + { + "epoch": 0.028897994392926462, + "grad_norm": 0.12201163172721863, + "learning_rate": 0.00014999801222376248, + "loss": 2.0443, + "step": 67 + }, + { + "epoch": 0.029329307742074616, + "grad_norm": 0.14177387952804565, + "learning_rate": 0.00014999775598825967, + "loss": 2.2544, + "step": 68 + }, + { + "epoch": 0.029760621091222773, + "grad_norm": 0.1548364758491516, + "learning_rate": 0.00014999748422365127, + "loss": 2.2396, + "step": 69 + }, + { + "epoch": 0.03019193444037093, + "grad_norm": 10.743372917175293, + "learning_rate": 0.00014999719692999365, + "loss": 2.3391, + "step": 70 + }, + { + "epoch": 0.030623247789519085, + "grad_norm": 0.13508769869804382, + "learning_rate": 0.00014999689410734622, + "loss": 2.3851, + "step": 71 + }, + { + "epoch": 0.031054561138667242, + "grad_norm": 1.441625952720642, + "learning_rate": 0.00014999657575577173, + "loss": 2.2796, + "step": 72 + }, + { + "epoch": 0.031485874487815396, + "grad_norm": 0.17903247475624084, + "learning_rate": 0.00014999624187533606, + "loss": 2.4809, + "step": 73 + }, + { + "epoch": 0.031917187836963554, + "grad_norm": 0.23883047699928284, + "learning_rate": 0.0001499958924661084, + "loss": 2.3531, + "step": 74 + }, + { + "epoch": 0.03234850118611171, + "grad_norm": 0.14161138236522675, + "learning_rate": 0.00014999552752816104, + "loss": 2.2301, + "step": 75 + }, + { + "epoch": 0.03234850118611171, + "eval_loss": 2.1953680515289307, + "eval_runtime": 197.8389, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 75 + }, + { + "epoch": 0.03277981453525987, + "grad_norm": 0.18088607490062714, + "learning_rate": 0.00014999514706156956, + "loss": 2.4097, + "step": 76 + }, + { + "epoch": 0.03321112788440802, + "grad_norm": 0.16779634356498718, + "learning_rate": 0.00014999475106641276, + "loss": 2.4324, + "step": 77 + }, + { + "epoch": 0.033642441233556176, + "grad_norm": 0.16245946288108826, + "learning_rate": 0.00014999433954277264, + "loss": 2.3828, + "step": 78 + }, + { + "epoch": 0.034073754582704334, + "grad_norm": 0.16727153956890106, + "learning_rate": 0.00014999391249073435, + "loss": 2.3144, + "step": 79 + }, + { + "epoch": 0.03450506793185249, + "grad_norm": 0.15430496633052826, + "learning_rate": 0.00014999346991038639, + "loss": 2.2893, + "step": 80 + }, + { + "epoch": 0.03493638128100065, + "grad_norm": 0.13965576887130737, + "learning_rate": 0.00014999301180182036, + "loss": 2.3668, + "step": 81 + }, + { + "epoch": 0.035367694630148806, + "grad_norm": 0.14725452661514282, + "learning_rate": 0.00014999253816513113, + "loss": 2.2724, + "step": 82 + }, + { + "epoch": 0.03579900797929696, + "grad_norm": 0.1368856281042099, + "learning_rate": 0.00014999204900041678, + "loss": 2.2663, + "step": 83 + }, + { + "epoch": 0.036230321328445114, + "grad_norm": 0.134723961353302, + "learning_rate": 0.00014999154430777857, + "loss": 2.4259, + "step": 84 + }, + { + "epoch": 0.03666163467759327, + "grad_norm": 0.1524089127779007, + "learning_rate": 0.00014999102408732104, + "loss": 2.4348, + "step": 85 + }, + { + "epoch": 0.03709294802674143, + "grad_norm": 0.1420392543077469, + "learning_rate": 0.00014999048833915185, + "loss": 2.3566, + "step": 86 + }, + { + "epoch": 0.03752426137588959, + "grad_norm": 0.13361221551895142, + "learning_rate": 0.000149989937063382, + "loss": 2.3743, + "step": 87 + }, + { + "epoch": 0.03795557472503774, + "grad_norm": 0.14212311804294586, + "learning_rate": 0.00014998937026012557, + "loss": 2.2588, + "step": 88 + }, + { + "epoch": 0.038386888074185894, + "grad_norm": 0.12589286267757416, + "learning_rate": 0.00014998878792949998, + "loss": 2.2884, + "step": 89 + }, + { + "epoch": 0.03881820142333405, + "grad_norm": 1.3771302700042725, + "learning_rate": 0.00014998819007162577, + "loss": 2.2237, + "step": 90 + }, + { + "epoch": 0.03924951477248221, + "grad_norm": 0.14674894511699677, + "learning_rate": 0.00014998757668662678, + "loss": 2.4529, + "step": 91 + }, + { + "epoch": 0.03968082812163037, + "grad_norm": 0.15564079582691193, + "learning_rate": 0.00014998694777462997, + "loss": 2.2877, + "step": 92 + }, + { + "epoch": 0.04011214147077852, + "grad_norm": 0.15917402505874634, + "learning_rate": 0.0001499863033357656, + "loss": 2.5187, + "step": 93 + }, + { + "epoch": 0.040543454819926675, + "grad_norm": 0.15689204633235931, + "learning_rate": 0.00014998564337016706, + "loss": 2.3903, + "step": 94 + }, + { + "epoch": 0.04097476816907483, + "grad_norm": 0.1784036010503769, + "learning_rate": 0.00014998496787797103, + "loss": 2.3838, + "step": 95 + }, + { + "epoch": 0.04140608151822299, + "grad_norm": 0.15815366804599762, + "learning_rate": 0.00014998427685931738, + "loss": 2.3389, + "step": 96 + }, + { + "epoch": 0.04183739486737115, + "grad_norm": 0.14294639229774475, + "learning_rate": 0.00014998357031434918, + "loss": 2.1199, + "step": 97 + }, + { + "epoch": 0.042268708216519305, + "grad_norm": 0.13665415346622467, + "learning_rate": 0.00014998284824321276, + "loss": 2.0823, + "step": 98 + }, + { + "epoch": 0.042700021565667455, + "grad_norm": 0.1319860965013504, + "learning_rate": 0.0001499821106460576, + "loss": 2.2892, + "step": 99 + }, + { + "epoch": 0.04313133491481561, + "grad_norm": 0.14062654972076416, + "learning_rate": 0.00014998135752303647, + "loss": 2.1406, + "step": 100 + }, + { + "epoch": 0.04313133491481561, + "eval_loss": 2.180278778076172, + "eval_runtime": 194.9967, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 100 + }, + { + "epoch": 0.04356264826396377, + "grad_norm": 0.15747618675231934, + "learning_rate": 0.00014998058887430527, + "loss": 2.329, + "step": 101 + }, + { + "epoch": 0.04399396161311193, + "grad_norm": 0.14773201942443848, + "learning_rate": 0.00014997980470002318, + "loss": 2.3304, + "step": 102 + }, + { + "epoch": 0.044425274962260085, + "grad_norm": 0.2709938883781433, + "learning_rate": 0.00014997900500035254, + "loss": 2.268, + "step": 103 + }, + { + "epoch": 0.044856588311408235, + "grad_norm": 0.31209737062454224, + "learning_rate": 0.00014997818977545896, + "loss": 1.9698, + "step": 104 + }, + { + "epoch": 0.04528790166055639, + "grad_norm": 0.20316815376281738, + "learning_rate": 0.00014997735902551123, + "loss": 2.3328, + "step": 105 + }, + { + "epoch": 0.04571921500970455, + "grad_norm": 0.16161882877349854, + "learning_rate": 0.0001499765127506814, + "loss": 2.401, + "step": 106 + }, + { + "epoch": 0.04615052835885271, + "grad_norm": 0.15170305967330933, + "learning_rate": 0.00014997565095114466, + "loss": 2.4336, + "step": 107 + }, + { + "epoch": 0.046581841708000865, + "grad_norm": 0.1538029909133911, + "learning_rate": 0.00014997477362707948, + "loss": 2.0162, + "step": 108 + }, + { + "epoch": 0.047013155057149016, + "grad_norm": 0.1348794847726822, + "learning_rate": 0.0001499738807786675, + "loss": 2.3923, + "step": 109 + }, + { + "epoch": 0.04744446840629717, + "grad_norm": 0.13398659229278564, + "learning_rate": 0.00014997297240609361, + "loss": 2.2099, + "step": 110 + }, + { + "epoch": 0.04787578175544533, + "grad_norm": 0.13340270519256592, + "learning_rate": 0.0001499720485095459, + "loss": 2.4006, + "step": 111 + }, + { + "epoch": 0.04830709510459349, + "grad_norm": 0.15017318725585938, + "learning_rate": 0.00014997110908921565, + "loss": 2.3831, + "step": 112 + }, + { + "epoch": 0.048738408453741645, + "grad_norm": 0.14307159185409546, + "learning_rate": 0.0001499701541452974, + "loss": 2.3606, + "step": 113 + }, + { + "epoch": 0.0491697218028898, + "grad_norm": 0.16115598380565643, + "learning_rate": 0.00014996918367798888, + "loss": 2.2783, + "step": 114 + }, + { + "epoch": 0.04960103515203795, + "grad_norm": 0.1477428674697876, + "learning_rate": 0.00014996819768749102, + "loss": 2.3053, + "step": 115 + }, + { + "epoch": 0.05003234850118611, + "grad_norm": 0.13940371572971344, + "learning_rate": 0.00014996719617400798, + "loss": 2.4399, + "step": 116 + }, + { + "epoch": 0.05046366185033427, + "grad_norm": 10.187359809875488, + "learning_rate": 0.00014996617913774716, + "loss": 2.1557, + "step": 117 + }, + { + "epoch": 0.050894975199482426, + "grad_norm": 0.16093291342258453, + "learning_rate": 0.00014996514657891915, + "loss": 2.3654, + "step": 118 + }, + { + "epoch": 0.05132628854863058, + "grad_norm": 0.39099806547164917, + "learning_rate": 0.0001499640984977377, + "loss": 2.2144, + "step": 119 + }, + { + "epoch": 0.051757601897778734, + "grad_norm": 0.154411181807518, + "learning_rate": 0.0001499630348944199, + "loss": 2.2602, + "step": 120 + }, + { + "epoch": 0.05218891524692689, + "grad_norm": 0.15475720167160034, + "learning_rate": 0.00014996195576918593, + "loss": 2.492, + "step": 121 + }, + { + "epoch": 0.05262022859607505, + "grad_norm": 0.20788845419883728, + "learning_rate": 0.00014996086112225923, + "loss": 2.1501, + "step": 122 + }, + { + "epoch": 0.053051541945223206, + "grad_norm": 0.18076249957084656, + "learning_rate": 0.00014995975095386652, + "loss": 2.2561, + "step": 123 + }, + { + "epoch": 0.05348285529437136, + "grad_norm": 0.14736950397491455, + "learning_rate": 0.00014995862526423755, + "loss": 2.0482, + "step": 124 + }, + { + "epoch": 0.053914168643519514, + "grad_norm": 0.5525259375572205, + "learning_rate": 0.00014995748405360556, + "loss": 2.1995, + "step": 125 + }, + { + "epoch": 0.053914168643519514, + "eval_loss": 2.171874761581421, + "eval_runtime": 196.3857, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 125 + }, + { + "epoch": 0.05434548199266767, + "grad_norm": 0.14461691677570343, + "learning_rate": 0.00014995632732220675, + "loss": 2.2688, + "step": 126 + }, + { + "epoch": 0.05477679534181583, + "grad_norm": 0.1665572226047516, + "learning_rate": 0.00014995515507028066, + "loss": 2.4054, + "step": 127 + }, + { + "epoch": 0.055208108690963986, + "grad_norm": 0.1451123058795929, + "learning_rate": 0.00014995396729807002, + "loss": 2.2555, + "step": 128 + }, + { + "epoch": 0.055639422040112144, + "grad_norm": 0.1509980857372284, + "learning_rate": 0.00014995276400582076, + "loss": 2.0205, + "step": 129 + }, + { + "epoch": 0.056070735389260294, + "grad_norm": 0.19234485924243927, + "learning_rate": 0.00014995154519378205, + "loss": 2.2185, + "step": 130 + }, + { + "epoch": 0.05650204873840845, + "grad_norm": 0.16396138072013855, + "learning_rate": 0.00014995031086220624, + "loss": 2.3354, + "step": 131 + }, + { + "epoch": 0.05693336208755661, + "grad_norm": 0.15060104429721832, + "learning_rate": 0.00014994906101134894, + "loss": 2.4476, + "step": 132 + }, + { + "epoch": 0.057364675436704766, + "grad_norm": 0.149602472782135, + "learning_rate": 0.0001499477956414689, + "loss": 2.2837, + "step": 133 + }, + { + "epoch": 0.057795988785852924, + "grad_norm": 0.17236609756946564, + "learning_rate": 0.0001499465147528282, + "loss": 2.2653, + "step": 134 + }, + { + "epoch": 0.05822730213500108, + "grad_norm": 0.18239477276802063, + "learning_rate": 0.000149945218345692, + "loss": 2.3734, + "step": 135 + }, + { + "epoch": 0.05865861548414923, + "grad_norm": 0.17764970660209656, + "learning_rate": 0.00014994390642032875, + "loss": 2.2622, + "step": 136 + }, + { + "epoch": 0.05908992883329739, + "grad_norm": 0.148588165640831, + "learning_rate": 0.0001499425789770101, + "loss": 2.3332, + "step": 137 + }, + { + "epoch": 0.05952124218244555, + "grad_norm": 0.1784641295671463, + "learning_rate": 0.00014994123601601091, + "loss": 2.4611, + "step": 138 + }, + { + "epoch": 0.059952555531593704, + "grad_norm": 0.170943483710289, + "learning_rate": 0.00014993987753760926, + "loss": 2.2352, + "step": 139 + }, + { + "epoch": 0.06038386888074186, + "grad_norm": 0.13959744572639465, + "learning_rate": 0.00014993850354208646, + "loss": 2.1047, + "step": 140 + }, + { + "epoch": 0.06081518222989001, + "grad_norm": 0.14827445149421692, + "learning_rate": 0.00014993711402972696, + "loss": 2.3221, + "step": 141 + }, + { + "epoch": 0.06124649557903817, + "grad_norm": 0.1614200323820114, + "learning_rate": 0.00014993570900081852, + "loss": 2.2172, + "step": 142 + }, + { + "epoch": 0.06167780892818633, + "grad_norm": 0.14604084193706512, + "learning_rate": 0.00014993428845565206, + "loss": 2.3326, + "step": 143 + }, + { + "epoch": 0.062109122277334484, + "grad_norm": 0.15171091258525848, + "learning_rate": 0.00014993285239452167, + "loss": 2.1391, + "step": 144 + }, + { + "epoch": 0.06254043562648263, + "grad_norm": 0.13508319854736328, + "learning_rate": 0.00014993140081772478, + "loss": 2.2099, + "step": 145 + }, + { + "epoch": 0.06297174897563079, + "grad_norm": 0.16098541021347046, + "learning_rate": 0.00014992993372556188, + "loss": 2.2943, + "step": 146 + }, + { + "epoch": 0.06340306232477895, + "grad_norm": 0.1379564106464386, + "learning_rate": 0.0001499284511183368, + "loss": 2.3795, + "step": 147 + }, + { + "epoch": 0.06383437567392711, + "grad_norm": 0.16001056134700775, + "learning_rate": 0.0001499269529963565, + "loss": 2.0379, + "step": 148 + }, + { + "epoch": 0.06426568902307526, + "grad_norm": 0.14426477253437042, + "learning_rate": 0.00014992543935993122, + "loss": 2.402, + "step": 149 + }, + { + "epoch": 0.06469700237222342, + "grad_norm": 0.1733705997467041, + "learning_rate": 0.00014992391020937434, + "loss": 2.5628, + "step": 150 + }, + { + "epoch": 0.06469700237222342, + "eval_loss": 2.165189266204834, + "eval_runtime": 195.8058, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 150 + }, + { + "epoch": 0.06512831572137158, + "grad_norm": 0.15047837793827057, + "learning_rate": 0.00014992236554500248, + "loss": 2.3213, + "step": 151 + }, + { + "epoch": 0.06555962907051974, + "grad_norm": 0.15518510341644287, + "learning_rate": 0.0001499208053671355, + "loss": 2.3589, + "step": 152 + }, + { + "epoch": 0.0659909424196679, + "grad_norm": 0.14743168652057648, + "learning_rate": 0.00014991922967609646, + "loss": 2.393, + "step": 153 + }, + { + "epoch": 0.06642225576881604, + "grad_norm": 0.14655107259750366, + "learning_rate": 0.00014991763847221163, + "loss": 2.2903, + "step": 154 + }, + { + "epoch": 0.0668535691179642, + "grad_norm": 0.1595122069120407, + "learning_rate": 0.00014991603175581046, + "loss": 2.4246, + "step": 155 + }, + { + "epoch": 0.06728488246711235, + "grad_norm": 0.14971016347408295, + "learning_rate": 0.00014991440952722562, + "loss": 2.2262, + "step": 156 + }, + { + "epoch": 0.06771619581626051, + "grad_norm": 2.5569145679473877, + "learning_rate": 0.00014991277178679304, + "loss": 2.1999, + "step": 157 + }, + { + "epoch": 0.06814750916540867, + "grad_norm": 0.14273586869239807, + "learning_rate": 0.00014991111853485185, + "loss": 2.2801, + "step": 158 + }, + { + "epoch": 0.06857882251455683, + "grad_norm": 0.15529994666576385, + "learning_rate": 0.00014990944977174433, + "loss": 2.3245, + "step": 159 + }, + { + "epoch": 0.06901013586370498, + "grad_norm": 0.1808357834815979, + "learning_rate": 0.00014990776549781608, + "loss": 2.3552, + "step": 160 + }, + { + "epoch": 0.06944144921285314, + "grad_norm": 0.15095044672489166, + "learning_rate": 0.00014990606571341577, + "loss": 2.1439, + "step": 161 + }, + { + "epoch": 0.0698727625620013, + "grad_norm": 0.1695532202720642, + "learning_rate": 0.00014990435041889542, + "loss": 2.331, + "step": 162 + }, + { + "epoch": 0.07030407591114946, + "grad_norm": 0.17096754908561707, + "learning_rate": 0.00014990261961461015, + "loss": 2.2373, + "step": 163 + }, + { + "epoch": 0.07073538926029761, + "grad_norm": 0.7461947798728943, + "learning_rate": 0.00014990087330091838, + "loss": 2.1116, + "step": 164 + }, + { + "epoch": 0.07116670260944576, + "grad_norm": 0.16269400715827942, + "learning_rate": 0.0001498991114781817, + "loss": 2.29, + "step": 165 + }, + { + "epoch": 0.07159801595859391, + "grad_norm": 0.1433313637971878, + "learning_rate": 0.00014989733414676488, + "loss": 2.3007, + "step": 166 + }, + { + "epoch": 0.07202932930774207, + "grad_norm": 0.15983504056930542, + "learning_rate": 0.000149895541307036, + "loss": 2.1564, + "step": 167 + }, + { + "epoch": 0.07246064265689023, + "grad_norm": 0.15153203904628754, + "learning_rate": 0.00014989373295936622, + "loss": 2.1013, + "step": 168 + }, + { + "epoch": 0.07289195600603839, + "grad_norm": 0.19010236859321594, + "learning_rate": 0.00014989190910413, + "loss": 2.3796, + "step": 169 + }, + { + "epoch": 0.07332326935518654, + "grad_norm": 0.15856477618217468, + "learning_rate": 0.00014989006974170502, + "loss": 2.2968, + "step": 170 + }, + { + "epoch": 0.0737545827043347, + "grad_norm": 0.17133678495883942, + "learning_rate": 0.0001498882148724721, + "loss": 2.1509, + "step": 171 + }, + { + "epoch": 0.07418589605348286, + "grad_norm": 0.15978550910949707, + "learning_rate": 0.00014988634449681533, + "loss": 2.2124, + "step": 172 + }, + { + "epoch": 0.07461720940263102, + "grad_norm": 0.17379288375377655, + "learning_rate": 0.00014988445861512198, + "loss": 2.4121, + "step": 173 + }, + { + "epoch": 0.07504852275177917, + "grad_norm": 0.1456250548362732, + "learning_rate": 0.00014988255722778255, + "loss": 2.0439, + "step": 174 + }, + { + "epoch": 0.07547983610092733, + "grad_norm": 0.14056707918643951, + "learning_rate": 0.00014988064033519076, + "loss": 2.07, + "step": 175 + }, + { + "epoch": 0.07547983610092733, + "eval_loss": 2.163438320159912, + "eval_runtime": 199.1167, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 175 + }, + { + "epoch": 0.07591114945007547, + "grad_norm": 0.16266904771327972, + "learning_rate": 0.00014987870793774352, + "loss": 2.332, + "step": 176 + }, + { + "epoch": 0.07634246279922363, + "grad_norm": 0.14274229109287262, + "learning_rate": 0.00014987676003584094, + "loss": 2.4217, + "step": 177 + }, + { + "epoch": 0.07677377614837179, + "grad_norm": 0.19253656268119812, + "learning_rate": 0.0001498747966298863, + "loss": 2.265, + "step": 178 + }, + { + "epoch": 0.07720508949751995, + "grad_norm": 0.15794239938259125, + "learning_rate": 0.00014987281772028626, + "loss": 2.1961, + "step": 179 + }, + { + "epoch": 0.0776364028466681, + "grad_norm": 0.19452308118343353, + "learning_rate": 0.0001498708233074505, + "loss": 2.1581, + "step": 180 + }, + { + "epoch": 0.07806771619581626, + "grad_norm": 0.17844100296497345, + "learning_rate": 0.000149868813391792, + "loss": 2.2836, + "step": 181 + }, + { + "epoch": 0.07849902954496442, + "grad_norm": 0.13737443089485168, + "learning_rate": 0.00014986678797372695, + "loss": 2.0338, + "step": 182 + }, + { + "epoch": 0.07893034289411258, + "grad_norm": 0.2049536406993866, + "learning_rate": 0.0001498647470536747, + "loss": 2.3868, + "step": 183 + }, + { + "epoch": 0.07936165624326073, + "grad_norm": 0.1511652022600174, + "learning_rate": 0.00014986269063205785, + "loss": 2.3927, + "step": 184 + }, + { + "epoch": 0.07979296959240889, + "grad_norm": 0.15115490555763245, + "learning_rate": 0.00014986061870930225, + "loss": 2.2891, + "step": 185 + }, + { + "epoch": 0.08022428294155703, + "grad_norm": 0.2328520268201828, + "learning_rate": 0.00014985853128583689, + "loss": 1.9137, + "step": 186 + }, + { + "epoch": 0.08065559629070519, + "grad_norm": 0.14687751233577728, + "learning_rate": 0.00014985642836209397, + "loss": 2.3559, + "step": 187 + }, + { + "epoch": 0.08108690963985335, + "grad_norm": 0.1597253829240799, + "learning_rate": 0.00014985430993850893, + "loss": 2.4656, + "step": 188 + }, + { + "epoch": 0.08151822298900151, + "grad_norm": 0.1330839842557907, + "learning_rate": 0.00014985217601552045, + "loss": 2.069, + "step": 189 + }, + { + "epoch": 0.08194953633814966, + "grad_norm": 4.166354656219482, + "learning_rate": 0.00014985002659357035, + "loss": 2.1938, + "step": 190 + }, + { + "epoch": 0.08238084968729782, + "grad_norm": 0.14522095024585724, + "learning_rate": 0.00014984786167310368, + "loss": 2.1597, + "step": 191 + }, + { + "epoch": 0.08281216303644598, + "grad_norm": 0.17430412769317627, + "learning_rate": 0.00014984568125456875, + "loss": 2.39, + "step": 192 + }, + { + "epoch": 0.08324347638559414, + "grad_norm": 0.18026231229305267, + "learning_rate": 0.000149843485338417, + "loss": 2.2799, + "step": 193 + }, + { + "epoch": 0.0836747897347423, + "grad_norm": 0.18112143874168396, + "learning_rate": 0.00014984127392510314, + "loss": 2.122, + "step": 194 + }, + { + "epoch": 0.08410610308389045, + "grad_norm": 0.18193262815475464, + "learning_rate": 0.0001498390470150851, + "loss": 2.3805, + "step": 195 + }, + { + "epoch": 0.08453741643303861, + "grad_norm": 0.1746852993965149, + "learning_rate": 0.0001498368046088239, + "loss": 2.24, + "step": 196 + }, + { + "epoch": 0.08496872978218675, + "grad_norm": 0.1676950603723526, + "learning_rate": 0.00014983454670678393, + "loss": 2.1863, + "step": 197 + }, + { + "epoch": 0.08540004313133491, + "grad_norm": 0.21171925961971283, + "learning_rate": 0.00014983227330943267, + "loss": 2.2388, + "step": 198 + }, + { + "epoch": 0.08583135648048307, + "grad_norm": 10.072641372680664, + "learning_rate": 0.0001498299844172409, + "loss": 2.5266, + "step": 199 + }, + { + "epoch": 0.08626266982963122, + "grad_norm": 0.1808120310306549, + "learning_rate": 0.00014982768003068248, + "loss": 2.2083, + "step": 200 + }, + { + "epoch": 0.08626266982963122, + "eval_loss": 2.1606836318969727, + "eval_runtime": 197.9014, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 200 + }, + { + "epoch": 0.08669398317877938, + "grad_norm": 0.1853431612253189, + "learning_rate": 0.00014982536015023464, + "loss": 2.3255, + "step": 201 + }, + { + "epoch": 0.08712529652792754, + "grad_norm": 0.16815230250358582, + "learning_rate": 0.00014982302477637772, + "loss": 2.2362, + "step": 202 + }, + { + "epoch": 0.0875566098770757, + "grad_norm": 0.16719475388526917, + "learning_rate": 0.00014982067390959523, + "loss": 2.2921, + "step": 203 + }, + { + "epoch": 0.08798792322622385, + "grad_norm": 0.14663250744342804, + "learning_rate": 0.00014981830755037402, + "loss": 2.2575, + "step": 204 + }, + { + "epoch": 0.08841923657537201, + "grad_norm": 0.17926819622516632, + "learning_rate": 0.000149815925699204, + "loss": 2.2459, + "step": 205 + }, + { + "epoch": 0.08885054992452017, + "grad_norm": 0.20000754296779633, + "learning_rate": 0.00014981352835657843, + "loss": 2.4623, + "step": 206 + }, + { + "epoch": 0.08928186327366831, + "grad_norm": 0.252912700176239, + "learning_rate": 0.00014981111552299364, + "loss": 2.1574, + "step": 207 + }, + { + "epoch": 0.08971317662281647, + "grad_norm": 0.16110049188137054, + "learning_rate": 0.00014980868719894925, + "loss": 2.5126, + "step": 208 + }, + { + "epoch": 0.09014448997196463, + "grad_norm": 0.14292703568935394, + "learning_rate": 0.00014980624338494812, + "loss": 2.2023, + "step": 209 + }, + { + "epoch": 0.09057580332111279, + "grad_norm": 0.14326538145542145, + "learning_rate": 0.00014980378408149618, + "loss": 2.2351, + "step": 210 + }, + { + "epoch": 0.09100711667026094, + "grad_norm": 0.1685512661933899, + "learning_rate": 0.00014980130928910274, + "loss": 2.2906, + "step": 211 + }, + { + "epoch": 0.0914384300194091, + "grad_norm": 0.15357233583927155, + "learning_rate": 0.0001497988190082802, + "loss": 2.1548, + "step": 212 + }, + { + "epoch": 0.09186974336855726, + "grad_norm": 0.14555716514587402, + "learning_rate": 0.0001497963132395442, + "loss": 2.3084, + "step": 213 + }, + { + "epoch": 0.09230105671770542, + "grad_norm": 0.1568971574306488, + "learning_rate": 0.00014979379198341354, + "loss": 2.4084, + "step": 214 + }, + { + "epoch": 0.09273237006685357, + "grad_norm": 0.15939919650554657, + "learning_rate": 0.00014979125524041035, + "loss": 2.0452, + "step": 215 + }, + { + "epoch": 0.09316368341600173, + "grad_norm": 0.17139932513237, + "learning_rate": 0.00014978870301105986, + "loss": 2.3126, + "step": 216 + }, + { + "epoch": 0.09359499676514989, + "grad_norm": 0.16442374885082245, + "learning_rate": 0.00014978613529589054, + "loss": 2.2247, + "step": 217 + }, + { + "epoch": 0.09402631011429803, + "grad_norm": 0.15870434045791626, + "learning_rate": 0.00014978355209543408, + "loss": 2.2533, + "step": 218 + }, + { + "epoch": 0.09445762346344619, + "grad_norm": 0.14200221002101898, + "learning_rate": 0.0001497809534102253, + "loss": 2.1155, + "step": 219 + }, + { + "epoch": 0.09488893681259435, + "grad_norm": 0.15516963601112366, + "learning_rate": 0.00014977833924080235, + "loss": 2.1345, + "step": 220 + }, + { + "epoch": 0.0953202501617425, + "grad_norm": 0.16696950793266296, + "learning_rate": 0.00014977570958770646, + "loss": 2.2495, + "step": 221 + }, + { + "epoch": 0.09575156351089066, + "grad_norm": 0.14794184267520905, + "learning_rate": 0.00014977306445148223, + "loss": 2.2028, + "step": 222 + }, + { + "epoch": 0.09618287686003882, + "grad_norm": 0.15544624626636505, + "learning_rate": 0.00014977040383267726, + "loss": 2.3332, + "step": 223 + }, + { + "epoch": 0.09661419020918698, + "grad_norm": 0.1553119421005249, + "learning_rate": 0.0001497677277318425, + "loss": 2.2463, + "step": 224 + }, + { + "epoch": 0.09704550355833513, + "grad_norm": 0.1486513763666153, + "learning_rate": 0.00014976503614953207, + "loss": 2.2552, + "step": 225 + }, + { + "epoch": 0.09704550355833513, + "eval_loss": 2.1567177772521973, + "eval_runtime": 206.3196, + "eval_samples_per_second": 0.155, + "eval_steps_per_second": 0.155, + "step": 225 + }, + { + "epoch": 0.09747681690748329, + "grad_norm": 0.8157986402511597, + "learning_rate": 0.0001497623290863033, + "loss": 2.3277, + "step": 226 + }, + { + "epoch": 0.09790813025663145, + "grad_norm": 0.14612847566604614, + "learning_rate": 0.0001497596065427167, + "loss": 2.2677, + "step": 227 + }, + { + "epoch": 0.0983394436057796, + "grad_norm": 0.16154086589813232, + "learning_rate": 0.000149756868519336, + "loss": 2.3785, + "step": 228 + }, + { + "epoch": 0.09877075695492775, + "grad_norm": 0.14477179944515228, + "learning_rate": 0.00014975411501672815, + "loss": 2.1928, + "step": 229 + }, + { + "epoch": 0.0992020703040759, + "grad_norm": 0.1605735719203949, + "learning_rate": 0.0001497513460354633, + "loss": 2.4549, + "step": 230 + }, + { + "epoch": 0.09963338365322406, + "grad_norm": 0.17547203600406647, + "learning_rate": 0.00014974856157611477, + "loss": 2.4296, + "step": 231 + }, + { + "epoch": 0.10006469700237222, + "grad_norm": 0.15618549287319183, + "learning_rate": 0.00014974576163925913, + "loss": 2.2927, + "step": 232 + }, + { + "epoch": 0.10049601035152038, + "grad_norm": 0.14896760880947113, + "learning_rate": 0.00014974294622547616, + "loss": 2.1635, + "step": 233 + }, + { + "epoch": 0.10092732370066854, + "grad_norm": 0.20270375907421112, + "learning_rate": 0.0001497401153353488, + "loss": 2.3012, + "step": 234 + }, + { + "epoch": 0.1013586370498167, + "grad_norm": 0.16877515614032745, + "learning_rate": 0.0001497372689694632, + "loss": 1.9421, + "step": 235 + }, + { + "epoch": 0.10178995039896485, + "grad_norm": 0.1533048003911972, + "learning_rate": 0.00014973440712840873, + "loss": 2.4441, + "step": 236 + }, + { + "epoch": 0.10222126374811301, + "grad_norm": 0.17349474132061005, + "learning_rate": 0.000149731529812778, + "loss": 2.3928, + "step": 237 + }, + { + "epoch": 0.10265257709726117, + "grad_norm": 0.1848156601190567, + "learning_rate": 0.00014972863702316676, + "loss": 2.1148, + "step": 238 + }, + { + "epoch": 0.10308389044640931, + "grad_norm": 0.15170809626579285, + "learning_rate": 0.00014972572876017403, + "loss": 2.3089, + "step": 239 + }, + { + "epoch": 0.10351520379555747, + "grad_norm": 0.14373555779457092, + "learning_rate": 0.00014972280502440196, + "loss": 2.2718, + "step": 240 + }, + { + "epoch": 0.10394651714470562, + "grad_norm": 0.15860696136951447, + "learning_rate": 0.00014971986581645595, + "loss": 2.3588, + "step": 241 + }, + { + "epoch": 0.10437783049385378, + "grad_norm": 0.2343427538871765, + "learning_rate": 0.0001497169111369446, + "loss": 2.2485, + "step": 242 + }, + { + "epoch": 0.10480914384300194, + "grad_norm": 0.14802317321300507, + "learning_rate": 0.00014971394098647973, + "loss": 2.2126, + "step": 243 + }, + { + "epoch": 0.1052404571921501, + "grad_norm": 0.15597142279148102, + "learning_rate": 0.0001497109553656763, + "loss": 2.5404, + "step": 244 + }, + { + "epoch": 0.10567177054129825, + "grad_norm": 0.15421420335769653, + "learning_rate": 0.00014970795427515253, + "loss": 2.2657, + "step": 245 + }, + { + "epoch": 0.10610308389044641, + "grad_norm": 0.18327467143535614, + "learning_rate": 0.00014970493771552986, + "loss": 2.2102, + "step": 246 + }, + { + "epoch": 0.10653439723959457, + "grad_norm": 0.15270157158374786, + "learning_rate": 0.00014970190568743284, + "loss": 2.3935, + "step": 247 + }, + { + "epoch": 0.10696571058874273, + "grad_norm": 0.15383994579315186, + "learning_rate": 0.00014969885819148934, + "loss": 2.347, + "step": 248 + }, + { + "epoch": 0.10739702393789088, + "grad_norm": 0.16574591398239136, + "learning_rate": 0.00014969579522833038, + "loss": 2.3328, + "step": 249 + }, + { + "epoch": 0.10782833728703903, + "grad_norm": 0.1711103320121765, + "learning_rate": 0.00014969271679859014, + "loss": 2.1721, + "step": 250 + }, + { + "epoch": 0.10782833728703903, + "eval_loss": 2.1529393196105957, + "eval_runtime": 199.1581, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 250 + }, + { + "epoch": 0.10825965063618719, + "grad_norm": 0.1534319669008255, + "learning_rate": 0.00014968962290290606, + "loss": 2.2902, + "step": 251 + }, + { + "epoch": 0.10869096398533534, + "grad_norm": 0.16753673553466797, + "learning_rate": 0.00014968651354191876, + "loss": 2.3074, + "step": 252 + }, + { + "epoch": 0.1091222773344835, + "grad_norm": 0.1768234521150589, + "learning_rate": 0.00014968338871627207, + "loss": 2.036, + "step": 253 + }, + { + "epoch": 0.10955359068363166, + "grad_norm": 0.1682131290435791, + "learning_rate": 0.00014968024842661307, + "loss": 2.1679, + "step": 254 + }, + { + "epoch": 0.10998490403277981, + "grad_norm": 0.15855084359645844, + "learning_rate": 0.00014967709267359191, + "loss": 2.2204, + "step": 255 + }, + { + "epoch": 0.11041621738192797, + "grad_norm": 0.20829591155052185, + "learning_rate": 0.00014967392145786204, + "loss": 2.29, + "step": 256 + }, + { + "epoch": 0.11084753073107613, + "grad_norm": 0.17669428884983063, + "learning_rate": 0.00014967073478008012, + "loss": 2.1349, + "step": 257 + }, + { + "epoch": 0.11127884408022429, + "grad_norm": 0.1606130748987198, + "learning_rate": 0.000149667532640906, + "loss": 1.9809, + "step": 258 + }, + { + "epoch": 0.11171015742937244, + "grad_norm": 0.16479335725307465, + "learning_rate": 0.00014966431504100268, + "loss": 2.3708, + "step": 259 + }, + { + "epoch": 0.11214147077852059, + "grad_norm": 0.15960106253623962, + "learning_rate": 0.0001496610819810364, + "loss": 2.1645, + "step": 260 + }, + { + "epoch": 0.11257278412766875, + "grad_norm": 0.1663437783718109, + "learning_rate": 0.00014965783346167664, + "loss": 2.4815, + "step": 261 + }, + { + "epoch": 0.1130040974768169, + "grad_norm": 0.2073906511068344, + "learning_rate": 0.000149654569483596, + "loss": 2.3176, + "step": 262 + }, + { + "epoch": 0.11343541082596506, + "grad_norm": 0.16055135428905487, + "learning_rate": 0.00014965129004747036, + "loss": 2.2771, + "step": 263 + }, + { + "epoch": 0.11386672417511322, + "grad_norm": 0.1713322103023529, + "learning_rate": 0.00014964799515397874, + "loss": 2.1259, + "step": 264 + }, + { + "epoch": 0.11429803752426138, + "grad_norm": 0.17094393074512482, + "learning_rate": 0.0001496446848038034, + "loss": 2.308, + "step": 265 + }, + { + "epoch": 0.11472935087340953, + "grad_norm": 0.18342117965221405, + "learning_rate": 0.00014964135899762972, + "loss": 2.3167, + "step": 266 + }, + { + "epoch": 0.11516066422255769, + "grad_norm": 0.17521141469478607, + "learning_rate": 0.00014963801773614643, + "loss": 2.2702, + "step": 267 + }, + { + "epoch": 0.11559197757170585, + "grad_norm": 0.18539130687713623, + "learning_rate": 0.00014963466102004534, + "loss": 2.2543, + "step": 268 + }, + { + "epoch": 0.116023290920854, + "grad_norm": 0.16789484024047852, + "learning_rate": 0.0001496312888500215, + "loss": 2.2379, + "step": 269 + }, + { + "epoch": 0.11645460427000216, + "grad_norm": 0.18903698027133942, + "learning_rate": 0.00014962790122677314, + "loss": 2.1078, + "step": 270 + }, + { + "epoch": 0.1168859176191503, + "grad_norm": 0.15733234584331512, + "learning_rate": 0.0001496244981510017, + "loss": 2.3288, + "step": 271 + }, + { + "epoch": 0.11731723096829846, + "grad_norm": 0.14740827679634094, + "learning_rate": 0.00014962107962341188, + "loss": 2.2901, + "step": 272 + }, + { + "epoch": 0.11774854431744662, + "grad_norm": 0.16103410720825195, + "learning_rate": 0.00014961764564471144, + "loss": 2.2015, + "step": 273 + }, + { + "epoch": 0.11817985766659478, + "grad_norm": 0.18729737401008606, + "learning_rate": 0.00014961419621561147, + "loss": 2.3337, + "step": 274 + }, + { + "epoch": 0.11861117101574294, + "grad_norm": 0.14679095149040222, + "learning_rate": 0.00014961073133682622, + "loss": 2.2346, + "step": 275 + }, + { + "epoch": 0.11861117101574294, + "eval_loss": 2.150817632675171, + "eval_runtime": 198.5771, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 275 + }, + { + "epoch": 0.1190424843648911, + "grad_norm": 0.18481458723545074, + "learning_rate": 0.0001496072510090731, + "loss": 2.3638, + "step": 276 + }, + { + "epoch": 0.11947379771403925, + "grad_norm": 0.1526854783296585, + "learning_rate": 0.0001496037552330728, + "loss": 2.4251, + "step": 277 + }, + { + "epoch": 0.11990511106318741, + "grad_norm": 0.16679935157299042, + "learning_rate": 0.0001496002440095491, + "loss": 2.2725, + "step": 278 + }, + { + "epoch": 0.12033642441233557, + "grad_norm": 0.1559242308139801, + "learning_rate": 0.00014959671733922908, + "loss": 2.2053, + "step": 279 + }, + { + "epoch": 0.12076773776148372, + "grad_norm": 0.15493714809417725, + "learning_rate": 0.00014959317522284294, + "loss": 2.3275, + "step": 280 + }, + { + "epoch": 0.12119905111063188, + "grad_norm": 0.17583470046520233, + "learning_rate": 0.00014958961766112415, + "loss": 2.2412, + "step": 281 + }, + { + "epoch": 0.12163036445978002, + "grad_norm": 0.18624302744865417, + "learning_rate": 0.0001495860446548093, + "loss": 2.257, + "step": 282 + }, + { + "epoch": 0.12206167780892818, + "grad_norm": 0.16880694031715393, + "learning_rate": 0.00014958245620463827, + "loss": 2.3616, + "step": 283 + }, + { + "epoch": 0.12249299115807634, + "grad_norm": 0.16436102986335754, + "learning_rate": 0.00014957885231135405, + "loss": 2.2771, + "step": 284 + }, + { + "epoch": 0.1229243045072245, + "grad_norm": 0.1797478049993515, + "learning_rate": 0.0001495752329757029, + "loss": 2.2618, + "step": 285 + }, + { + "epoch": 0.12335561785637265, + "grad_norm": 0.16185274720191956, + "learning_rate": 0.00014957159819843418, + "loss": 2.326, + "step": 286 + }, + { + "epoch": 0.12378693120552081, + "grad_norm": 0.16708776354789734, + "learning_rate": 0.0001495679479803006, + "loss": 2.4173, + "step": 287 + }, + { + "epoch": 0.12421824455466897, + "grad_norm": 0.1608465015888214, + "learning_rate": 0.00014956428232205788, + "loss": 2.1053, + "step": 288 + }, + { + "epoch": 0.12464955790381713, + "grad_norm": 0.15215721726417542, + "learning_rate": 0.00014956060122446512, + "loss": 2.361, + "step": 289 + }, + { + "epoch": 0.12508087125296527, + "grad_norm": 0.18214476108551025, + "learning_rate": 0.0001495569046882845, + "loss": 2.2829, + "step": 290 + }, + { + "epoch": 0.12551218460211344, + "grad_norm": 0.1762077957391739, + "learning_rate": 0.0001495531927142814, + "loss": 2.22, + "step": 291 + }, + { + "epoch": 0.12594349795126158, + "grad_norm": 0.16295194625854492, + "learning_rate": 0.00014954946530322446, + "loss": 2.2119, + "step": 292 + }, + { + "epoch": 0.12637481130040976, + "grad_norm": 0.15086841583251953, + "learning_rate": 0.00014954572245588546, + "loss": 2.2676, + "step": 293 + }, + { + "epoch": 0.1268061246495579, + "grad_norm": 0.1604457050561905, + "learning_rate": 0.00014954196417303942, + "loss": 2.0791, + "step": 294 + }, + { + "epoch": 0.12723743799870607, + "grad_norm": 0.15493498742580414, + "learning_rate": 0.00014953819045546453, + "loss": 2.1768, + "step": 295 + }, + { + "epoch": 0.12766875134785421, + "grad_norm": 0.16815432906150818, + "learning_rate": 0.00014953440130394214, + "loss": 2.4107, + "step": 296 + }, + { + "epoch": 0.12810006469700239, + "grad_norm": 0.15603506565093994, + "learning_rate": 0.0001495305967192569, + "loss": 2.1845, + "step": 297 + }, + { + "epoch": 0.12853137804615053, + "grad_norm": 0.1547204703092575, + "learning_rate": 0.00014952677670219654, + "loss": 2.3356, + "step": 298 + }, + { + "epoch": 0.12896269139529867, + "grad_norm": 0.16853530704975128, + "learning_rate": 0.00014952294125355207, + "loss": 2.0881, + "step": 299 + }, + { + "epoch": 0.12939400474444684, + "grad_norm": 0.1595228761434555, + "learning_rate": 0.0001495190903741176, + "loss": 2.4599, + "step": 300 + }, + { + "epoch": 0.12939400474444684, + "eval_loss": 2.1485748291015625, + "eval_runtime": 199.1444, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 300 + }, + { + "epoch": 0.129825318093595, + "grad_norm": 0.22638855874538422, + "learning_rate": 0.00014951522406469057, + "loss": 2.2354, + "step": 301 + }, + { + "epoch": 0.13025663144274316, + "grad_norm": 3.426063299179077, + "learning_rate": 0.0001495113423260715, + "loss": 2.1358, + "step": 302 + }, + { + "epoch": 0.1306879447918913, + "grad_norm": 0.15848495066165924, + "learning_rate": 0.00014950744515906416, + "loss": 2.1514, + "step": 303 + }, + { + "epoch": 0.13111925814103947, + "grad_norm": 0.15022383630275726, + "learning_rate": 0.0001495035325644755, + "loss": 2.2748, + "step": 304 + }, + { + "epoch": 0.13155057149018762, + "grad_norm": 0.21391138434410095, + "learning_rate": 0.0001494996045431157, + "loss": 2.4523, + "step": 305 + }, + { + "epoch": 0.1319818848393358, + "grad_norm": 0.15094725787639618, + "learning_rate": 0.00014949566109579804, + "loss": 2.1579, + "step": 306 + }, + { + "epoch": 0.13241319818848393, + "grad_norm": 0.1510794758796692, + "learning_rate": 0.00014949170222333907, + "loss": 2.3597, + "step": 307 + }, + { + "epoch": 0.13284451153763208, + "grad_norm": 0.18151414394378662, + "learning_rate": 0.00014948772792655854, + "loss": 2.1657, + "step": 308 + }, + { + "epoch": 0.13327582488678025, + "grad_norm": 0.17762115597724915, + "learning_rate": 0.00014948373820627934, + "loss": 2.2904, + "step": 309 + }, + { + "epoch": 0.1337071382359284, + "grad_norm": 0.15702734887599945, + "learning_rate": 0.00014947973306332765, + "loss": 2.2888, + "step": 310 + }, + { + "epoch": 0.13413845158507656, + "grad_norm": 0.16596752405166626, + "learning_rate": 0.00014947571249853268, + "loss": 2.3807, + "step": 311 + }, + { + "epoch": 0.1345697649342247, + "grad_norm": 0.16935662925243378, + "learning_rate": 0.00014947167651272703, + "loss": 2.3313, + "step": 312 + }, + { + "epoch": 0.13500107828337288, + "grad_norm": 0.16634508967399597, + "learning_rate": 0.0001494676251067463, + "loss": 2.137, + "step": 313 + }, + { + "epoch": 0.13543239163252102, + "grad_norm": 4.746288299560547, + "learning_rate": 0.00014946355828142946, + "loss": 2.2238, + "step": 314 + }, + { + "epoch": 0.1358637049816692, + "grad_norm": 0.16148056089878082, + "learning_rate": 0.00014945947603761854, + "loss": 1.9769, + "step": 315 + }, + { + "epoch": 0.13629501833081734, + "grad_norm": 0.1666569709777832, + "learning_rate": 0.00014945537837615885, + "loss": 2.227, + "step": 316 + }, + { + "epoch": 0.1367263316799655, + "grad_norm": 0.16864323616027832, + "learning_rate": 0.00014945126529789885, + "loss": 2.292, + "step": 317 + }, + { + "epoch": 0.13715764502911365, + "grad_norm": 0.15381769835948944, + "learning_rate": 0.00014944713680369013, + "loss": 2.1039, + "step": 318 + }, + { + "epoch": 0.1375889583782618, + "grad_norm": 0.16596980392932892, + "learning_rate": 0.00014944299289438765, + "loss": 2.5114, + "step": 319 + }, + { + "epoch": 0.13802027172740997, + "grad_norm": 0.156742125749588, + "learning_rate": 0.0001494388335708494, + "loss": 2.4077, + "step": 320 + }, + { + "epoch": 0.1384515850765581, + "grad_norm": 0.16346225142478943, + "learning_rate": 0.00014943465883393654, + "loss": 2.4141, + "step": 321 + }, + { + "epoch": 0.13888289842570628, + "grad_norm": 0.17249517142772675, + "learning_rate": 0.00014943046868451363, + "loss": 2.4374, + "step": 322 + }, + { + "epoch": 0.13931421177485442, + "grad_norm": 0.15804658830165863, + "learning_rate": 0.00014942626312344822, + "loss": 2.3544, + "step": 323 + }, + { + "epoch": 0.1397455251240026, + "grad_norm": 0.15350793302059174, + "learning_rate": 0.00014942204215161108, + "loss": 2.3532, + "step": 324 + }, + { + "epoch": 0.14017683847315074, + "grad_norm": 0.15994253754615784, + "learning_rate": 0.00014941780576987627, + "loss": 2.1783, + "step": 325 + }, + { + "epoch": 0.14017683847315074, + "eval_loss": 2.149773597717285, + "eval_runtime": 200.0419, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 325 + }, + { + "epoch": 0.1406081518222989, + "grad_norm": 0.16531150043010712, + "learning_rate": 0.00014941355397912095, + "loss": 2.2161, + "step": 326 + }, + { + "epoch": 0.14103946517144705, + "grad_norm": 0.24105919897556305, + "learning_rate": 0.00014940928678022552, + "loss": 2.1379, + "step": 327 + }, + { + "epoch": 0.14147077852059523, + "grad_norm": 0.17379091680049896, + "learning_rate": 0.00014940500417407352, + "loss": 2.3302, + "step": 328 + }, + { + "epoch": 0.14190209186974337, + "grad_norm": 0.155650332570076, + "learning_rate": 0.00014940070616155173, + "loss": 2.1339, + "step": 329 + }, + { + "epoch": 0.1423334052188915, + "grad_norm": 0.16053761541843414, + "learning_rate": 0.0001493963927435501, + "loss": 2.232, + "step": 330 + }, + { + "epoch": 0.14276471856803968, + "grad_norm": 0.22595985233783722, + "learning_rate": 0.00014939206392096177, + "loss": 2.2803, + "step": 331 + }, + { + "epoch": 0.14319603191718783, + "grad_norm": 0.17255187034606934, + "learning_rate": 0.00014938771969468304, + "loss": 2.0613, + "step": 332 + }, + { + "epoch": 0.143627345266336, + "grad_norm": 0.18829742074012756, + "learning_rate": 0.00014938336006561348, + "loss": 2.4404, + "step": 333 + }, + { + "epoch": 0.14405865861548414, + "grad_norm": 0.7800279855728149, + "learning_rate": 0.0001493789850346558, + "loss": 2.2717, + "step": 334 + }, + { + "epoch": 0.1444899719646323, + "grad_norm": 0.1824026256799698, + "learning_rate": 0.00014937459460271583, + "loss": 2.2762, + "step": 335 + }, + { + "epoch": 0.14492128531378046, + "grad_norm": 0.1767878383398056, + "learning_rate": 0.00014937018877070272, + "loss": 2.3848, + "step": 336 + }, + { + "epoch": 0.14535259866292863, + "grad_norm": 0.1794254034757614, + "learning_rate": 0.00014936576753952872, + "loss": 2.1204, + "step": 337 + }, + { + "epoch": 0.14578391201207677, + "grad_norm": 0.17064186930656433, + "learning_rate": 0.0001493613309101093, + "loss": 2.2617, + "step": 338 + }, + { + "epoch": 0.14621522536122494, + "grad_norm": 0.1971042901277542, + "learning_rate": 0.00014935687888336313, + "loss": 2.3569, + "step": 339 + }, + { + "epoch": 0.1466465387103731, + "grad_norm": 0.16145062446594238, + "learning_rate": 0.00014935241146021201, + "loss": 2.1903, + "step": 340 + }, + { + "epoch": 0.14707785205952123, + "grad_norm": 0.18148300051689148, + "learning_rate": 0.000149347928641581, + "loss": 2.2269, + "step": 341 + }, + { + "epoch": 0.1475091654086694, + "grad_norm": 0.19072268903255463, + "learning_rate": 0.00014934343042839832, + "loss": 2.2133, + "step": 342 + }, + { + "epoch": 0.14794047875781755, + "grad_norm": 0.1789669245481491, + "learning_rate": 0.00014933891682159537, + "loss": 2.2024, + "step": 343 + }, + { + "epoch": 0.14837179210696572, + "grad_norm": 0.1523238867521286, + "learning_rate": 0.00014933438782210672, + "loss": 2.3505, + "step": 344 + }, + { + "epoch": 0.14880310545611386, + "grad_norm": 0.15985886752605438, + "learning_rate": 0.00014932984343087017, + "loss": 2.2357, + "step": 345 + }, + { + "epoch": 0.14923441880526203, + "grad_norm": 0.16432873904705048, + "learning_rate": 0.0001493252836488267, + "loss": 2.2678, + "step": 346 + }, + { + "epoch": 0.14966573215441017, + "grad_norm": 0.14971576631069183, + "learning_rate": 0.0001493207084769204, + "loss": 2.0104, + "step": 347 + }, + { + "epoch": 0.15009704550355835, + "grad_norm": 0.16536620259284973, + "learning_rate": 0.00014931611791609868, + "loss": 2.2598, + "step": 348 + }, + { + "epoch": 0.1505283588527065, + "grad_norm": 0.16472128033638, + "learning_rate": 0.00014931151196731206, + "loss": 2.2177, + "step": 349 + }, + { + "epoch": 0.15095967220185466, + "grad_norm": 0.1672239452600479, + "learning_rate": 0.0001493068906315142, + "loss": 2.5143, + "step": 350 + }, + { + "epoch": 0.15095967220185466, + "eval_loss": 2.147500991821289, + "eval_runtime": 199.3339, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 350 + }, + { + "epoch": 0.1513909855510028, + "grad_norm": 0.15225522220134735, + "learning_rate": 0.00014930225390966201, + "loss": 2.1453, + "step": 351 + }, + { + "epoch": 0.15182229890015095, + "grad_norm": 0.20649577677249908, + "learning_rate": 0.00014929760180271563, + "loss": 2.1401, + "step": 352 + }, + { + "epoch": 0.15225361224929912, + "grad_norm": 0.20784424245357513, + "learning_rate": 0.00014929293431163826, + "loss": 2.2101, + "step": 353 + }, + { + "epoch": 0.15268492559844726, + "grad_norm": 0.20000191032886505, + "learning_rate": 0.0001492882514373964, + "loss": 2.091, + "step": 354 + }, + { + "epoch": 0.15311623894759543, + "grad_norm": 0.17898254096508026, + "learning_rate": 0.0001492835531809597, + "loss": 2.1752, + "step": 355 + }, + { + "epoch": 0.15354755229674358, + "grad_norm": 1.4756970405578613, + "learning_rate": 0.00014927883954330093, + "loss": 2.2416, + "step": 356 + }, + { + "epoch": 0.15397886564589175, + "grad_norm": 0.17481818795204163, + "learning_rate": 0.00014927411052539615, + "loss": 2.3631, + "step": 357 + }, + { + "epoch": 0.1544101789950399, + "grad_norm": 0.1990717649459839, + "learning_rate": 0.00014926936612822452, + "loss": 2.4406, + "step": 358 + }, + { + "epoch": 0.15484149234418806, + "grad_norm": 0.1901736706495285, + "learning_rate": 0.00014926460635276845, + "loss": 2.2884, + "step": 359 + }, + { + "epoch": 0.1552728056933362, + "grad_norm": 0.16656960546970367, + "learning_rate": 0.00014925983120001352, + "loss": 2.3036, + "step": 360 + }, + { + "epoch": 0.15570411904248435, + "grad_norm": 0.16077920794487, + "learning_rate": 0.0001492550406709484, + "loss": 2.2038, + "step": 361 + }, + { + "epoch": 0.15613543239163252, + "grad_norm": 0.18244610726833344, + "learning_rate": 0.0001492502347665651, + "loss": 2.2564, + "step": 362 + }, + { + "epoch": 0.15656674574078067, + "grad_norm": 0.16418373584747314, + "learning_rate": 0.0001492454134878587, + "loss": 2.216, + "step": 363 + }, + { + "epoch": 0.15699805908992884, + "grad_norm": 0.16991998255252838, + "learning_rate": 0.00014924057683582748, + "loss": 2.194, + "step": 364 + }, + { + "epoch": 0.15742937243907698, + "grad_norm": 0.17290419340133667, + "learning_rate": 0.00014923572481147297, + "loss": 2.1836, + "step": 365 + }, + { + "epoch": 0.15786068578822515, + "grad_norm": 0.17075559496879578, + "learning_rate": 0.0001492308574157998, + "loss": 2.3898, + "step": 366 + }, + { + "epoch": 0.1582919991373733, + "grad_norm": 0.15228039026260376, + "learning_rate": 0.00014922597464981583, + "loss": 2.1941, + "step": 367 + }, + { + "epoch": 0.15872331248652147, + "grad_norm": 0.15793053805828094, + "learning_rate": 0.0001492210765145321, + "loss": 2.2475, + "step": 368 + }, + { + "epoch": 0.1591546258356696, + "grad_norm": 0.1701696217060089, + "learning_rate": 0.00014921616301096278, + "loss": 2.1133, + "step": 369 + }, + { + "epoch": 0.15958593918481778, + "grad_norm": 0.1666203737258911, + "learning_rate": 0.0001492112341401253, + "loss": 2.2035, + "step": 370 + }, + { + "epoch": 0.16001725253396593, + "grad_norm": 0.15982025861740112, + "learning_rate": 0.00014920628990304025, + "loss": 2.1841, + "step": 371 + }, + { + "epoch": 0.16044856588311407, + "grad_norm": 0.18124015629291534, + "learning_rate": 0.00014920133030073136, + "loss": 2.3534, + "step": 372 + }, + { + "epoch": 0.16087987923226224, + "grad_norm": 0.1591426134109497, + "learning_rate": 0.00014919635533422557, + "loss": 2.1313, + "step": 373 + }, + { + "epoch": 0.16131119258141038, + "grad_norm": 0.15558771789073944, + "learning_rate": 0.00014919136500455302, + "loss": 2.2474, + "step": 374 + }, + { + "epoch": 0.16174250593055856, + "grad_norm": 0.15386322140693665, + "learning_rate": 0.000149186359312747, + "loss": 2.0939, + "step": 375 + }, + { + "epoch": 0.16174250593055856, + "eval_loss": 2.145970344543457, + "eval_runtime": 199.181, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 375 + }, + { + "epoch": 0.1621738192797067, + "grad_norm": 0.1613401472568512, + "learning_rate": 0.000149181338259844, + "loss": 2.2186, + "step": 376 + }, + { + "epoch": 0.16260513262885487, + "grad_norm": 0.16436243057250977, + "learning_rate": 0.0001491763018468837, + "loss": 2.1255, + "step": 377 + }, + { + "epoch": 0.16303644597800301, + "grad_norm": 0.15056097507476807, + "learning_rate": 0.00014917125007490889, + "loss": 2.2909, + "step": 378 + }, + { + "epoch": 0.16346775932715119, + "grad_norm": 1.3172454833984375, + "learning_rate": 0.00014916618294496566, + "loss": 2.3205, + "step": 379 + }, + { + "epoch": 0.16389907267629933, + "grad_norm": 0.1644919365644455, + "learning_rate": 0.00014916110045810316, + "loss": 2.2956, + "step": 380 + }, + { + "epoch": 0.1643303860254475, + "grad_norm": 0.16997544467449188, + "learning_rate": 0.0001491560026153738, + "loss": 2.4651, + "step": 381 + }, + { + "epoch": 0.16476169937459564, + "grad_norm": 0.17540326714515686, + "learning_rate": 0.00014915088941783313, + "loss": 2.2358, + "step": 382 + }, + { + "epoch": 0.1651930127237438, + "grad_norm": 0.1699107587337494, + "learning_rate": 0.0001491457608665399, + "loss": 2.4559, + "step": 383 + }, + { + "epoch": 0.16562432607289196, + "grad_norm": 0.1600213497877121, + "learning_rate": 0.0001491406169625561, + "loss": 2.1716, + "step": 384 + }, + { + "epoch": 0.1660556394220401, + "grad_norm": 0.5548557639122009, + "learning_rate": 0.00014913545770694673, + "loss": 2.2096, + "step": 385 + }, + { + "epoch": 0.16648695277118827, + "grad_norm": 0.1656799465417862, + "learning_rate": 0.0001491302831007801, + "loss": 2.18, + "step": 386 + }, + { + "epoch": 0.16691826612033642, + "grad_norm": 0.18784065544605255, + "learning_rate": 0.00014912509314512768, + "loss": 2.3623, + "step": 387 + }, + { + "epoch": 0.1673495794694846, + "grad_norm": 0.17505837976932526, + "learning_rate": 0.00014911988784106412, + "loss": 2.3084, + "step": 388 + }, + { + "epoch": 0.16778089281863273, + "grad_norm": 0.1727839559316635, + "learning_rate": 0.00014911466718966722, + "loss": 2.2042, + "step": 389 + }, + { + "epoch": 0.1682122061677809, + "grad_norm": 0.16786758601665497, + "learning_rate": 0.00014910943119201796, + "loss": 1.9998, + "step": 390 + }, + { + "epoch": 0.16864351951692905, + "grad_norm": 0.18020984530448914, + "learning_rate": 0.00014910417984920056, + "loss": 2.2189, + "step": 391 + }, + { + "epoch": 0.16907483286607722, + "grad_norm": 0.16372406482696533, + "learning_rate": 0.00014909891316230227, + "loss": 2.2852, + "step": 392 + }, + { + "epoch": 0.16950614621522536, + "grad_norm": 0.15674197673797607, + "learning_rate": 0.00014909363113241374, + "loss": 2.0479, + "step": 393 + }, + { + "epoch": 0.1699374595643735, + "grad_norm": 0.17500346899032593, + "learning_rate": 0.00014908833376062858, + "loss": 2.4546, + "step": 394 + }, + { + "epoch": 0.17036877291352168, + "grad_norm": 0.20425276458263397, + "learning_rate": 0.0001490830210480437, + "loss": 2.318, + "step": 395 + }, + { + "epoch": 0.17080008626266982, + "grad_norm": 0.16255128383636475, + "learning_rate": 0.00014907769299575915, + "loss": 2.3926, + "step": 396 + }, + { + "epoch": 0.171231399611818, + "grad_norm": 0.18405508995056152, + "learning_rate": 0.00014907234960487818, + "loss": 2.2712, + "step": 397 + }, + { + "epoch": 0.17166271296096614, + "grad_norm": 0.1642206460237503, + "learning_rate": 0.00014906699087650717, + "loss": 2.1432, + "step": 398 + }, + { + "epoch": 0.1720940263101143, + "grad_norm": 0.14774279296398163, + "learning_rate": 0.00014906161681175573, + "loss": 2.1958, + "step": 399 + }, + { + "epoch": 0.17252533965926245, + "grad_norm": 0.1816944181919098, + "learning_rate": 0.00014905622741173658, + "loss": 2.3528, + "step": 400 + }, + { + "epoch": 0.17252533965926245, + "eval_loss": 2.145236015319824, + "eval_runtime": 199.4998, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 400 + }, + { + "epoch": 0.17295665300841062, + "grad_norm": 0.15424853563308716, + "learning_rate": 0.00014905082267756568, + "loss": 2.1429, + "step": 401 + }, + { + "epoch": 0.17338796635755876, + "grad_norm": 0.1658765971660614, + "learning_rate": 0.00014904540261036217, + "loss": 2.2798, + "step": 402 + }, + { + "epoch": 0.17381927970670694, + "grad_norm": 0.183359295129776, + "learning_rate": 0.0001490399672112483, + "loss": 2.3538, + "step": 403 + }, + { + "epoch": 0.17425059305585508, + "grad_norm": 0.16044598817825317, + "learning_rate": 0.0001490345164813495, + "loss": 2.2135, + "step": 404 + }, + { + "epoch": 0.17468190640500322, + "grad_norm": 0.1611524522304535, + "learning_rate": 0.00014902905042179446, + "loss": 2.2036, + "step": 405 + }, + { + "epoch": 0.1751132197541514, + "grad_norm": 0.16904768347740173, + "learning_rate": 0.00014902356903371495, + "loss": 2.3068, + "step": 406 + }, + { + "epoch": 0.17554453310329954, + "grad_norm": 0.1789490431547165, + "learning_rate": 0.00014901807231824598, + "loss": 2.2595, + "step": 407 + }, + { + "epoch": 0.1759758464524477, + "grad_norm": 0.21464680135250092, + "learning_rate": 0.00014901256027652568, + "loss": 2.2024, + "step": 408 + }, + { + "epoch": 0.17640715980159585, + "grad_norm": 0.16914889216423035, + "learning_rate": 0.0001490070329096954, + "loss": 2.1231, + "step": 409 + }, + { + "epoch": 0.17683847315074402, + "grad_norm": 0.17402493953704834, + "learning_rate": 0.0001490014902188996, + "loss": 1.9689, + "step": 410 + }, + { + "epoch": 0.17726978649989217, + "grad_norm": 0.15200650691986084, + "learning_rate": 0.000148995932205286, + "loss": 2.34, + "step": 411 + }, + { + "epoch": 0.17770109984904034, + "grad_norm": 0.16262347996234894, + "learning_rate": 0.0001489903588700054, + "loss": 2.216, + "step": 412 + }, + { + "epoch": 0.17813241319818848, + "grad_norm": 0.16141052544116974, + "learning_rate": 0.0001489847702142119, + "loss": 2.3693, + "step": 413 + }, + { + "epoch": 0.17856372654733663, + "grad_norm": 0.1716516762971878, + "learning_rate": 0.00014897916623906262, + "loss": 2.2875, + "step": 414 + }, + { + "epoch": 0.1789950398964848, + "grad_norm": 0.16397427022457123, + "learning_rate": 0.00014897354694571794, + "loss": 2.3241, + "step": 415 + }, + { + "epoch": 0.17942635324563294, + "grad_norm": 0.1554698348045349, + "learning_rate": 0.00014896791233534143, + "loss": 2.2666, + "step": 416 + }, + { + "epoch": 0.1798576665947811, + "grad_norm": 0.16670356690883636, + "learning_rate": 0.00014896226240909975, + "loss": 2.3704, + "step": 417 + }, + { + "epoch": 0.18028897994392926, + "grad_norm": 0.14467710256576538, + "learning_rate": 0.0001489565971681628, + "loss": 2.2538, + "step": 418 + }, + { + "epoch": 0.18072029329307743, + "grad_norm": 0.1782013177871704, + "learning_rate": 0.00014895091661370363, + "loss": 2.263, + "step": 419 + }, + { + "epoch": 0.18115160664222557, + "grad_norm": 0.16968831419944763, + "learning_rate": 0.00014894522074689845, + "loss": 2.4698, + "step": 420 + }, + { + "epoch": 0.18158291999137374, + "grad_norm": 0.1799907386302948, + "learning_rate": 0.0001489395095689267, + "loss": 2.2043, + "step": 421 + }, + { + "epoch": 0.18201423334052189, + "grad_norm": 0.14666646718978882, + "learning_rate": 0.00014893378308097088, + "loss": 2.1512, + "step": 422 + }, + { + "epoch": 0.18244554668967006, + "grad_norm": 0.15499548614025116, + "learning_rate": 0.00014892804128421676, + "loss": 2.0666, + "step": 423 + }, + { + "epoch": 0.1828768600388182, + "grad_norm": 0.17115478217601776, + "learning_rate": 0.00014892228417985324, + "loss": 2.4236, + "step": 424 + }, + { + "epoch": 0.18330817338796634, + "grad_norm": 0.16812491416931152, + "learning_rate": 0.00014891651176907235, + "loss": 2.2477, + "step": 425 + }, + { + "epoch": 0.18330817338796634, + "eval_loss": 2.1428017616271973, + "eval_runtime": 202.8329, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 425 + }, + { + "epoch": 0.18373948673711452, + "grad_norm": 0.16948385536670685, + "learning_rate": 0.00014891072405306938, + "loss": 2.3145, + "step": 426 + }, + { + "epoch": 0.18417080008626266, + "grad_norm": 0.14548641443252563, + "learning_rate": 0.0001489049210330427, + "loss": 2.2195, + "step": 427 + }, + { + "epoch": 0.18460211343541083, + "grad_norm": 0.14978563785552979, + "learning_rate": 0.00014889910271019392, + "loss": 2.0593, + "step": 428 + }, + { + "epoch": 0.18503342678455897, + "grad_norm": 0.16009832918643951, + "learning_rate": 0.0001488932690857278, + "loss": 2.3423, + "step": 429 + }, + { + "epoch": 0.18546474013370715, + "grad_norm": 0.17507681250572205, + "learning_rate": 0.0001488874201608522, + "loss": 2.1819, + "step": 430 + }, + { + "epoch": 0.1858960534828553, + "grad_norm": 0.18338724970817566, + "learning_rate": 0.00014888155593677823, + "loss": 2.1853, + "step": 431 + }, + { + "epoch": 0.18632736683200346, + "grad_norm": 0.15442681312561035, + "learning_rate": 0.0001488756764147202, + "loss": 2.2035, + "step": 432 + }, + { + "epoch": 0.1867586801811516, + "grad_norm": 0.5245089530944824, + "learning_rate": 0.00014886978159589541, + "loss": 2.2897, + "step": 433 + }, + { + "epoch": 0.18718999353029978, + "grad_norm": 0.13658595085144043, + "learning_rate": 0.0001488638714815246, + "loss": 2.1846, + "step": 434 + }, + { + "epoch": 0.18762130687944792, + "grad_norm": 0.17327919602394104, + "learning_rate": 0.00014885794607283136, + "loss": 2.1647, + "step": 435 + }, + { + "epoch": 0.18805262022859606, + "grad_norm": 0.17164286971092224, + "learning_rate": 0.0001488520053710427, + "loss": 2.3012, + "step": 436 + }, + { + "epoch": 0.18848393357774423, + "grad_norm": 0.15758675336837769, + "learning_rate": 0.00014884604937738874, + "loss": 2.3051, + "step": 437 + }, + { + "epoch": 0.18891524692689238, + "grad_norm": 0.1563560664653778, + "learning_rate": 0.00014884007809310264, + "loss": 2.1007, + "step": 438 + }, + { + "epoch": 0.18934656027604055, + "grad_norm": 0.17045928537845612, + "learning_rate": 0.00014883409151942087, + "loss": 2.2063, + "step": 439 + }, + { + "epoch": 0.1897778736251887, + "grad_norm": 0.1447650045156479, + "learning_rate": 0.00014882808965758305, + "loss": 2.2638, + "step": 440 + }, + { + "epoch": 0.19020918697433686, + "grad_norm": 0.1615784913301468, + "learning_rate": 0.0001488220725088319, + "loss": 2.1472, + "step": 441 + }, + { + "epoch": 0.190640500323485, + "grad_norm": 0.16287849843502045, + "learning_rate": 0.00014881604007441332, + "loss": 1.9561, + "step": 442 + }, + { + "epoch": 0.19107181367263318, + "grad_norm": 0.2232157289981842, + "learning_rate": 0.0001488099923555764, + "loss": 2.3082, + "step": 443 + }, + { + "epoch": 0.19150312702178132, + "grad_norm": 0.17425796389579773, + "learning_rate": 0.00014880392935357338, + "loss": 2.3123, + "step": 444 + }, + { + "epoch": 0.1919344403709295, + "grad_norm": 0.1674915999174118, + "learning_rate": 0.00014879785106965967, + "loss": 2.1359, + "step": 445 + }, + { + "epoch": 0.19236575372007764, + "grad_norm": 0.16911302506923676, + "learning_rate": 0.0001487917575050939, + "loss": 2.124, + "step": 446 + }, + { + "epoch": 0.19279706706922578, + "grad_norm": 0.13820582628250122, + "learning_rate": 0.00014878564866113774, + "loss": 2.1094, + "step": 447 + }, + { + "epoch": 0.19322838041837395, + "grad_norm": 0.19064851105213165, + "learning_rate": 0.0001487795245390561, + "loss": 2.1212, + "step": 448 + }, + { + "epoch": 0.1936596937675221, + "grad_norm": 10.566106796264648, + "learning_rate": 0.0001487733851401171, + "loss": 2.2606, + "step": 449 + }, + { + "epoch": 0.19409100711667027, + "grad_norm": 0.17199164628982544, + "learning_rate": 0.0001487672304655919, + "loss": 2.3024, + "step": 450 + }, + { + "epoch": 0.19409100711667027, + "eval_loss": 2.142256259918213, + "eval_runtime": 198.5198, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 450 + }, + { + "epoch": 0.1945223204658184, + "grad_norm": 0.15751536190509796, + "learning_rate": 0.00014876106051675496, + "loss": 2.3373, + "step": 451 + }, + { + "epoch": 0.19495363381496658, + "grad_norm": 0.1832340806722641, + "learning_rate": 0.0001487548752948838, + "loss": 2.2288, + "step": 452 + }, + { + "epoch": 0.19538494716411473, + "grad_norm": 0.7646205425262451, + "learning_rate": 0.00014874867480125912, + "loss": 2.4942, + "step": 453 + }, + { + "epoch": 0.1958162605132629, + "grad_norm": 0.1642490178346634, + "learning_rate": 0.00014874245903716483, + "loss": 2.3353, + "step": 454 + }, + { + "epoch": 0.19624757386241104, + "grad_norm": 0.16240373253822327, + "learning_rate": 0.00014873622800388797, + "loss": 2.1596, + "step": 455 + }, + { + "epoch": 0.1966788872115592, + "grad_norm": 0.14881034195423126, + "learning_rate": 0.00014872998170271874, + "loss": 1.8476, + "step": 456 + }, + { + "epoch": 0.19711020056070735, + "grad_norm": 0.1751919984817505, + "learning_rate": 0.0001487237201349505, + "loss": 2.3051, + "step": 457 + }, + { + "epoch": 0.1975415139098555, + "grad_norm": 0.19177457690238953, + "learning_rate": 0.00014871744330187975, + "loss": 2.3034, + "step": 458 + }, + { + "epoch": 0.19797282725900367, + "grad_norm": 0.17044596374034882, + "learning_rate": 0.0001487111512048062, + "loss": 2.1733, + "step": 459 + }, + { + "epoch": 0.1984041406081518, + "grad_norm": 0.1570066660642624, + "learning_rate": 0.00014870484384503274, + "loss": 2.2867, + "step": 460 + }, + { + "epoch": 0.19883545395729998, + "grad_norm": 0.18342995643615723, + "learning_rate": 0.00014869852122386532, + "loss": 2.3922, + "step": 461 + }, + { + "epoch": 0.19926676730644813, + "grad_norm": 0.18954606354236603, + "learning_rate": 0.00014869218334261314, + "loss": 2.2386, + "step": 462 + }, + { + "epoch": 0.1996980806555963, + "grad_norm": 0.1563069224357605, + "learning_rate": 0.0001486858302025885, + "loss": 1.9606, + "step": 463 + }, + { + "epoch": 0.20012939400474444, + "grad_norm": 0.15879355370998383, + "learning_rate": 0.0001486794618051069, + "loss": 2.3816, + "step": 464 + }, + { + "epoch": 0.20056070735389261, + "grad_norm": 0.17720089852809906, + "learning_rate": 0.000148673078151487, + "loss": 2.3701, + "step": 465 + }, + { + "epoch": 0.20099202070304076, + "grad_norm": 0.1530313342809677, + "learning_rate": 0.0001486666792430506, + "loss": 2.2818, + "step": 466 + }, + { + "epoch": 0.2014233340521889, + "grad_norm": 0.1817789524793625, + "learning_rate": 0.00014866026508112264, + "loss": 2.333, + "step": 467 + }, + { + "epoch": 0.20185464740133707, + "grad_norm": 0.16977185010910034, + "learning_rate": 0.00014865383566703126, + "loss": 2.1365, + "step": 468 + }, + { + "epoch": 0.20228596075048522, + "grad_norm": 0.15945596992969513, + "learning_rate": 0.00014864739100210774, + "loss": 2.2966, + "step": 469 + }, + { + "epoch": 0.2027172740996334, + "grad_norm": 0.1694299727678299, + "learning_rate": 0.00014864093108768654, + "loss": 2.1316, + "step": 470 + }, + { + "epoch": 0.20314858744878153, + "grad_norm": 0.15608814358711243, + "learning_rate": 0.0001486344559251052, + "loss": 2.1924, + "step": 471 + }, + { + "epoch": 0.2035799007979297, + "grad_norm": 0.15335038304328918, + "learning_rate": 0.00014862796551570455, + "loss": 2.1299, + "step": 472 + }, + { + "epoch": 0.20401121414707785, + "grad_norm": 0.1774802803993225, + "learning_rate": 0.00014862145986082844, + "loss": 2.0524, + "step": 473 + }, + { + "epoch": 0.20444252749622602, + "grad_norm": 0.1639878749847412, + "learning_rate": 0.00014861493896182394, + "loss": 2.1435, + "step": 474 + }, + { + "epoch": 0.20487384084537416, + "grad_norm": 0.17186541855335236, + "learning_rate": 0.0001486084028200413, + "loss": 2.2342, + "step": 475 + }, + { + "epoch": 0.20487384084537416, + "eval_loss": 2.142070770263672, + "eval_runtime": 198.599, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 475 + }, + { + "epoch": 0.20530515419452233, + "grad_norm": 0.17359185218811035, + "learning_rate": 0.00014860185143683387, + "loss": 2.23, + "step": 476 + }, + { + "epoch": 0.20573646754367048, + "grad_norm": 0.16410641372203827, + "learning_rate": 0.00014859528481355825, + "loss": 1.7696, + "step": 477 + }, + { + "epoch": 0.20616778089281862, + "grad_norm": 0.19071485102176666, + "learning_rate": 0.00014858870295157405, + "loss": 2.1906, + "step": 478 + }, + { + "epoch": 0.2065990942419668, + "grad_norm": 0.1821126639842987, + "learning_rate": 0.00014858210585224415, + "loss": 2.1561, + "step": 479 + }, + { + "epoch": 0.20703040759111493, + "grad_norm": 0.20578120648860931, + "learning_rate": 0.00014857549351693456, + "loss": 2.0733, + "step": 480 + }, + { + "epoch": 0.2074617209402631, + "grad_norm": 0.15326827764511108, + "learning_rate": 0.00014856886594701442, + "loss": 2.2958, + "step": 481 + }, + { + "epoch": 0.20789303428941125, + "grad_norm": 0.2035706788301468, + "learning_rate": 0.00014856222314385607, + "loss": 2.2846, + "step": 482 + }, + { + "epoch": 0.20832434763855942, + "grad_norm": 0.17650489509105682, + "learning_rate": 0.00014855556510883495, + "loss": 2.2103, + "step": 483 + }, + { + "epoch": 0.20875566098770756, + "grad_norm": 0.199966162443161, + "learning_rate": 0.00014854889184332968, + "loss": 1.9659, + "step": 484 + }, + { + "epoch": 0.20918697433685574, + "grad_norm": 0.17148557305335999, + "learning_rate": 0.00014854220334872205, + "loss": 2.2062, + "step": 485 + }, + { + "epoch": 0.20961828768600388, + "grad_norm": 0.5571771264076233, + "learning_rate": 0.00014853549962639694, + "loss": 2.191, + "step": 486 + }, + { + "epoch": 0.21004960103515205, + "grad_norm": 0.16572660207748413, + "learning_rate": 0.0001485287806777425, + "loss": 2.1958, + "step": 487 + }, + { + "epoch": 0.2104809143843002, + "grad_norm": 0.16806870698928833, + "learning_rate": 0.0001485220465041499, + "loss": 2.3831, + "step": 488 + }, + { + "epoch": 0.21091222773344834, + "grad_norm": 0.1517253816127777, + "learning_rate": 0.00014851529710701355, + "loss": 1.9694, + "step": 489 + }, + { + "epoch": 0.2113435410825965, + "grad_norm": 1.1633365154266357, + "learning_rate": 0.000148508532487731, + "loss": 2.1299, + "step": 490 + }, + { + "epoch": 0.21177485443174465, + "grad_norm": 0.16392192244529724, + "learning_rate": 0.0001485017526477029, + "loss": 2.1838, + "step": 491 + }, + { + "epoch": 0.21220616778089282, + "grad_norm": 0.16261541843414307, + "learning_rate": 0.0001484949575883331, + "loss": 2.3216, + "step": 492 + }, + { + "epoch": 0.21263748113004097, + "grad_norm": 0.18670570850372314, + "learning_rate": 0.00014848814731102864, + "loss": 2.195, + "step": 493 + }, + { + "epoch": 0.21306879447918914, + "grad_norm": 0.18141891062259674, + "learning_rate": 0.00014848132181719963, + "loss": 2.3045, + "step": 494 + }, + { + "epoch": 0.21350010782833728, + "grad_norm": 0.19345717132091522, + "learning_rate": 0.00014847448110825933, + "loss": 2.238, + "step": 495 + }, + { + "epoch": 0.21393142117748545, + "grad_norm": 0.1645248383283615, + "learning_rate": 0.00014846762518562422, + "loss": 1.9521, + "step": 496 + }, + { + "epoch": 0.2143627345266336, + "grad_norm": 0.18492425978183746, + "learning_rate": 0.0001484607540507139, + "loss": 2.3596, + "step": 497 + }, + { + "epoch": 0.21479404787578177, + "grad_norm": 0.2229699045419693, + "learning_rate": 0.00014845386770495107, + "loss": 2.2964, + "step": 498 + }, + { + "epoch": 0.2152253612249299, + "grad_norm": 0.161407932639122, + "learning_rate": 0.0001484469661497617, + "loss": 2.1899, + "step": 499 + }, + { + "epoch": 0.21565667457407806, + "grad_norm": 0.1566958725452423, + "learning_rate": 0.00014844004938657475, + "loss": 2.1988, + "step": 500 + }, + { + "epoch": 0.21565667457407806, + "eval_loss": 2.1414709091186523, + "eval_runtime": 195.9589, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 500 + }, + { + "epoch": 0.21608798792322623, + "grad_norm": 0.17611227929592133, + "learning_rate": 0.00014843311741682247, + "loss": 2.1853, + "step": 501 + }, + { + "epoch": 0.21651930127237437, + "grad_norm": 0.16318093240261078, + "learning_rate": 0.00014842617024194015, + "loss": 1.9843, + "step": 502 + }, + { + "epoch": 0.21695061462152254, + "grad_norm": 0.18686994910240173, + "learning_rate": 0.00014841920786336633, + "loss": 2.1425, + "step": 503 + }, + { + "epoch": 0.21738192797067069, + "grad_norm": 0.21011656522750854, + "learning_rate": 0.00014841223028254263, + "loss": 2.2464, + "step": 504 + }, + { + "epoch": 0.21781324131981886, + "grad_norm": 0.1605004221200943, + "learning_rate": 0.00014840523750091382, + "loss": 2.4065, + "step": 505 + }, + { + "epoch": 0.218244554668967, + "grad_norm": 0.16983674466609955, + "learning_rate": 0.00014839822951992783, + "loss": 2.25, + "step": 506 + }, + { + "epoch": 0.21867586801811517, + "grad_norm": 0.18663226068019867, + "learning_rate": 0.00014839120634103575, + "loss": 2.432, + "step": 507 + }, + { + "epoch": 0.21910718136726332, + "grad_norm": 0.17589536309242249, + "learning_rate": 0.0001483841679656918, + "loss": 2.3051, + "step": 508 + }, + { + "epoch": 0.2195384947164115, + "grad_norm": 0.162202388048172, + "learning_rate": 0.00014837711439535337, + "loss": 2.2593, + "step": 509 + }, + { + "epoch": 0.21996980806555963, + "grad_norm": 0.17092770338058472, + "learning_rate": 0.00014837004563148095, + "loss": 2.1711, + "step": 510 + }, + { + "epoch": 0.22040112141470777, + "grad_norm": 0.17335964739322662, + "learning_rate": 0.00014836296167553825, + "loss": 2.2748, + "step": 511 + }, + { + "epoch": 0.22083243476385594, + "grad_norm": 0.15895693004131317, + "learning_rate": 0.00014835586252899202, + "loss": 2.2698, + "step": 512 + }, + { + "epoch": 0.2212637481130041, + "grad_norm": 0.17679838836193085, + "learning_rate": 0.00014834874819331226, + "loss": 2.3605, + "step": 513 + }, + { + "epoch": 0.22169506146215226, + "grad_norm": 13.117202758789062, + "learning_rate": 0.00014834161866997207, + "loss": 2.3758, + "step": 514 + }, + { + "epoch": 0.2221263748113004, + "grad_norm": 0.15782105922698975, + "learning_rate": 0.00014833447396044768, + "loss": 2.0502, + "step": 515 + }, + { + "epoch": 0.22255768816044857, + "grad_norm": 0.19260673224925995, + "learning_rate": 0.00014832731406621844, + "loss": 2.3573, + "step": 516 + }, + { + "epoch": 0.22298900150959672, + "grad_norm": 0.16335803270339966, + "learning_rate": 0.000148320138988767, + "loss": 2.4363, + "step": 517 + }, + { + "epoch": 0.2234203148587449, + "grad_norm": 0.16421473026275635, + "learning_rate": 0.00014831294872957892, + "loss": 2.3026, + "step": 518 + }, + { + "epoch": 0.22385162820789303, + "grad_norm": 0.1660144180059433, + "learning_rate": 0.00014830574329014308, + "loss": 2.2679, + "step": 519 + }, + { + "epoch": 0.22428294155704118, + "grad_norm": 0.17254631221294403, + "learning_rate": 0.00014829852267195142, + "loss": 2.1773, + "step": 520 + }, + { + "epoch": 0.22471425490618935, + "grad_norm": 0.19264696538448334, + "learning_rate": 0.00014829128687649908, + "loss": 1.8575, + "step": 521 + }, + { + "epoch": 0.2251455682553375, + "grad_norm": 0.16468121111392975, + "learning_rate": 0.0001482840359052843, + "loss": 2.2782, + "step": 522 + }, + { + "epoch": 0.22557688160448566, + "grad_norm": 0.17862504720687866, + "learning_rate": 0.00014827676975980844, + "loss": 2.2626, + "step": 523 + }, + { + "epoch": 0.2260081949536338, + "grad_norm": 0.16050106287002563, + "learning_rate": 0.00014826948844157608, + "loss": 2.1099, + "step": 524 + }, + { + "epoch": 0.22643950830278198, + "grad_norm": 0.17345581948757172, + "learning_rate": 0.00014826219195209487, + "loss": 2.2409, + "step": 525 + }, + { + "epoch": 0.22643950830278198, + "eval_loss": 2.1415786743164062, + "eval_runtime": 195.7665, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 525 + }, + { + "epoch": 0.22687082165193012, + "grad_norm": 0.15715128183364868, + "learning_rate": 0.00014825488029287563, + "loss": 2.2605, + "step": 526 + }, + { + "epoch": 0.2273021350010783, + "grad_norm": 0.16525809466838837, + "learning_rate": 0.00014824755346543234, + "loss": 2.3405, + "step": 527 + }, + { + "epoch": 0.22773344835022644, + "grad_norm": 0.17459775507450104, + "learning_rate": 0.00014824021147128206, + "loss": 2.3439, + "step": 528 + }, + { + "epoch": 0.2281647616993746, + "grad_norm": 0.14913460612297058, + "learning_rate": 0.0001482328543119451, + "loss": 2.2562, + "step": 529 + }, + { + "epoch": 0.22859607504852275, + "grad_norm": 0.15568678081035614, + "learning_rate": 0.00014822548198894473, + "loss": 2.3318, + "step": 530 + }, + { + "epoch": 0.2290273883976709, + "grad_norm": 0.21238815784454346, + "learning_rate": 0.00014821809450380754, + "loss": 2.4029, + "step": 531 + }, + { + "epoch": 0.22945870174681907, + "grad_norm": 0.16378119587898254, + "learning_rate": 0.00014821069185806324, + "loss": 2.2611, + "step": 532 + }, + { + "epoch": 0.2298900150959672, + "grad_norm": 0.1750461906194687, + "learning_rate": 0.00014820327405324452, + "loss": 2.3605, + "step": 533 + }, + { + "epoch": 0.23032132844511538, + "grad_norm": 0.17745572328567505, + "learning_rate": 0.0001481958410908874, + "loss": 2.3389, + "step": 534 + }, + { + "epoch": 0.23075264179426352, + "grad_norm": 0.18830518424510956, + "learning_rate": 0.0001481883929725309, + "loss": 2.4119, + "step": 535 + }, + { + "epoch": 0.2311839551434117, + "grad_norm": 0.18070408701896667, + "learning_rate": 0.00014818092969971728, + "loss": 2.1729, + "step": 536 + }, + { + "epoch": 0.23161526849255984, + "grad_norm": 0.15680450201034546, + "learning_rate": 0.00014817345127399187, + "loss": 2.3044, + "step": 537 + }, + { + "epoch": 0.232046581841708, + "grad_norm": 0.1724395602941513, + "learning_rate": 0.00014816595769690316, + "loss": 2.202, + "step": 538 + }, + { + "epoch": 0.23247789519085615, + "grad_norm": 0.14011593163013458, + "learning_rate": 0.0001481584489700028, + "loss": 2.1531, + "step": 539 + }, + { + "epoch": 0.23290920854000433, + "grad_norm": 0.2459859997034073, + "learning_rate": 0.00014815092509484548, + "loss": 2.2046, + "step": 540 + }, + { + "epoch": 0.23334052188915247, + "grad_norm": 0.17075932025909424, + "learning_rate": 0.0001481433860729892, + "loss": 2.1311, + "step": 541 + }, + { + "epoch": 0.2337718352383006, + "grad_norm": 0.1609731912612915, + "learning_rate": 0.00014813583190599491, + "loss": 2.2081, + "step": 542 + }, + { + "epoch": 0.23420314858744878, + "grad_norm": 0.16049246490001678, + "learning_rate": 0.00014812826259542688, + "loss": 2.256, + "step": 543 + }, + { + "epoch": 0.23463446193659693, + "grad_norm": 0.1557115763425827, + "learning_rate": 0.00014812067814285232, + "loss": 2.2185, + "step": 544 + }, + { + "epoch": 0.2350657752857451, + "grad_norm": 0.15727244317531586, + "learning_rate": 0.00014811307854984172, + "loss": 2.3666, + "step": 545 + }, + { + "epoch": 0.23549708863489324, + "grad_norm": 0.7236835360527039, + "learning_rate": 0.00014810546381796867, + "loss": 2.2764, + "step": 546 + }, + { + "epoch": 0.2359284019840414, + "grad_norm": 0.16988235712051392, + "learning_rate": 0.00014809783394880986, + "loss": 2.138, + "step": 547 + }, + { + "epoch": 0.23635971533318956, + "grad_norm": 0.1607985943555832, + "learning_rate": 0.00014809018894394515, + "loss": 2.0603, + "step": 548 + }, + { + "epoch": 0.23679102868233773, + "grad_norm": 0.15483079850673676, + "learning_rate": 0.00014808252880495754, + "loss": 2.2166, + "step": 549 + }, + { + "epoch": 0.23722234203148587, + "grad_norm": 0.16665418446063995, + "learning_rate": 0.00014807485353343308, + "loss": 2.4235, + "step": 550 + }, + { + "epoch": 0.23722234203148587, + "eval_loss": 2.139251708984375, + "eval_runtime": 195.574, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 550 + }, + { + "epoch": 0.23765365538063404, + "grad_norm": 0.15788012742996216, + "learning_rate": 0.0001480671631309611, + "loss": 2.2751, + "step": 551 + }, + { + "epoch": 0.2380849687297822, + "grad_norm": 0.16924196481704712, + "learning_rate": 0.00014805945759913393, + "loss": 2.152, + "step": 552 + }, + { + "epoch": 0.23851628207893033, + "grad_norm": 0.16306330263614655, + "learning_rate": 0.0001480517369395471, + "loss": 2.2638, + "step": 553 + }, + { + "epoch": 0.2389475954280785, + "grad_norm": 0.20340776443481445, + "learning_rate": 0.00014804400115379926, + "loss": 2.4568, + "step": 554 + }, + { + "epoch": 0.23937890877722665, + "grad_norm": 0.15684518218040466, + "learning_rate": 0.00014803625024349218, + "loss": 2.3376, + "step": 555 + }, + { + "epoch": 0.23981022212637482, + "grad_norm": 0.17012250423431396, + "learning_rate": 0.00014802848421023078, + "loss": 2.3626, + "step": 556 + }, + { + "epoch": 0.24024153547552296, + "grad_norm": 0.150764599442482, + "learning_rate": 0.0001480207030556231, + "loss": 2.2943, + "step": 557 + }, + { + "epoch": 0.24067284882467113, + "grad_norm": 0.17265529930591583, + "learning_rate": 0.0001480129067812803, + "loss": 2.2501, + "step": 558 + }, + { + "epoch": 0.24110416217381928, + "grad_norm": 0.18661445379257202, + "learning_rate": 0.00014800509538881668, + "loss": 2.4074, + "step": 559 + }, + { + "epoch": 0.24153547552296745, + "grad_norm": 0.16257712244987488, + "learning_rate": 0.00014799726887984973, + "loss": 2.2272, + "step": 560 + }, + { + "epoch": 0.2419667888721156, + "grad_norm": 0.16717654466629028, + "learning_rate": 0.00014798942725599996, + "loss": 2.2114, + "step": 561 + }, + { + "epoch": 0.24239810222126376, + "grad_norm": 0.17024226486682892, + "learning_rate": 0.00014798157051889107, + "loss": 2.2616, + "step": 562 + }, + { + "epoch": 0.2428294155704119, + "grad_norm": 0.18863873183727264, + "learning_rate": 0.0001479736986701499, + "loss": 2.3047, + "step": 563 + }, + { + "epoch": 0.24326072891956005, + "grad_norm": 0.15407370030879974, + "learning_rate": 0.00014796581171140638, + "loss": 2.3092, + "step": 564 + }, + { + "epoch": 0.24369204226870822, + "grad_norm": 0.1805611550807953, + "learning_rate": 0.0001479579096442936, + "loss": 2.4571, + "step": 565 + }, + { + "epoch": 0.24412335561785636, + "grad_norm": 0.16363826394081116, + "learning_rate": 0.0001479499924704478, + "loss": 2.3163, + "step": 566 + }, + { + "epoch": 0.24455466896700453, + "grad_norm": 0.15935292840003967, + "learning_rate": 0.00014794206019150827, + "loss": 2.0979, + "step": 567 + }, + { + "epoch": 0.24498598231615268, + "grad_norm": 0.18893010914325714, + "learning_rate": 0.0001479341128091175, + "loss": 2.1063, + "step": 568 + }, + { + "epoch": 0.24541729566530085, + "grad_norm": 0.16460704803466797, + "learning_rate": 0.0001479261503249211, + "loss": 2.2173, + "step": 569 + }, + { + "epoch": 0.245848609014449, + "grad_norm": 0.1792575865983963, + "learning_rate": 0.00014791817274056776, + "loss": 2.2617, + "step": 570 + }, + { + "epoch": 0.24627992236359716, + "grad_norm": 0.2035822719335556, + "learning_rate": 0.00014791018005770931, + "loss": 2.3372, + "step": 571 + }, + { + "epoch": 0.2467112357127453, + "grad_norm": 0.17065055668354034, + "learning_rate": 0.00014790217227800075, + "loss": 2.1626, + "step": 572 + }, + { + "epoch": 0.24714254906189345, + "grad_norm": 0.21874402463436127, + "learning_rate": 0.0001478941494031002, + "loss": 2.2004, + "step": 573 + }, + { + "epoch": 0.24757386241104162, + "grad_norm": 0.190892294049263, + "learning_rate": 0.00014788611143466885, + "loss": 2.3801, + "step": 574 + }, + { + "epoch": 0.24800517576018977, + "grad_norm": 0.17411410808563232, + "learning_rate": 0.00014787805837437103, + "loss": 2.3002, + "step": 575 + }, + { + "epoch": 0.24800517576018977, + "eval_loss": 2.138429641723633, + "eval_runtime": 196.0983, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 575 + }, + { + "epoch": 0.24843648910933794, + "grad_norm": 0.15937082469463348, + "learning_rate": 0.00014786999022387427, + "loss": 2.1931, + "step": 576 + }, + { + "epoch": 0.24886780245848608, + "grad_norm": 0.3105604648590088, + "learning_rate": 0.00014786190698484915, + "loss": 2.1752, + "step": 577 + }, + { + "epoch": 0.24929911580763425, + "grad_norm": 0.17199891805648804, + "learning_rate": 0.00014785380865896939, + "loss": 2.1575, + "step": 578 + }, + { + "epoch": 0.2497304291567824, + "grad_norm": 0.15182305872440338, + "learning_rate": 0.0001478456952479118, + "loss": 2.1933, + "step": 579 + }, + { + "epoch": 0.25016174250593054, + "grad_norm": 0.20546135306358337, + "learning_rate": 0.0001478375667533564, + "loss": 2.2331, + "step": 580 + }, + { + "epoch": 0.25059305585507874, + "grad_norm": 0.1734628528356552, + "learning_rate": 0.00014782942317698627, + "loss": 1.8555, + "step": 581 + }, + { + "epoch": 0.2510243692042269, + "grad_norm": 0.22307294607162476, + "learning_rate": 0.00014782126452048762, + "loss": 2.1859, + "step": 582 + }, + { + "epoch": 0.251455682553375, + "grad_norm": 0.1479979008436203, + "learning_rate": 0.0001478130907855498, + "loss": 2.0667, + "step": 583 + }, + { + "epoch": 0.25188699590252317, + "grad_norm": 0.15659348666667938, + "learning_rate": 0.00014780490197386522, + "loss": 2.2639, + "step": 584 + }, + { + "epoch": 0.2523183092516713, + "grad_norm": 0.16493523120880127, + "learning_rate": 0.00014779669808712953, + "loss": 2.2895, + "step": 585 + }, + { + "epoch": 0.2527496226008195, + "grad_norm": 0.14518970251083374, + "learning_rate": 0.00014778847912704142, + "loss": 2.1188, + "step": 586 + }, + { + "epoch": 0.25318093594996766, + "grad_norm": 0.15645526349544525, + "learning_rate": 0.0001477802450953027, + "loss": 2.4878, + "step": 587 + }, + { + "epoch": 0.2536122492991158, + "grad_norm": 0.1723669171333313, + "learning_rate": 0.00014777199599361833, + "loss": 2.164, + "step": 588 + }, + { + "epoch": 0.25404356264826394, + "grad_norm": 0.17445899546146393, + "learning_rate": 0.00014776373182369634, + "loss": 2.2338, + "step": 589 + }, + { + "epoch": 0.25447487599741214, + "grad_norm": 0.15474815666675568, + "learning_rate": 0.000147755452587248, + "loss": 2.2222, + "step": 590 + }, + { + "epoch": 0.2549061893465603, + "grad_norm": 0.16550877690315247, + "learning_rate": 0.0001477471582859875, + "loss": 2.199, + "step": 591 + }, + { + "epoch": 0.25533750269570843, + "grad_norm": 1.0499933958053589, + "learning_rate": 0.00014773884892163236, + "loss": 2.1105, + "step": 592 + }, + { + "epoch": 0.2557688160448566, + "grad_norm": 0.7829015254974365, + "learning_rate": 0.00014773052449590309, + "loss": 1.9241, + "step": 593 + }, + { + "epoch": 0.25620012939400477, + "grad_norm": 0.17413672804832458, + "learning_rate": 0.00014772218501052335, + "loss": 2.3395, + "step": 594 + }, + { + "epoch": 0.2566314427431529, + "grad_norm": 0.1572626531124115, + "learning_rate": 0.00014771383046721992, + "loss": 2.1767, + "step": 595 + }, + { + "epoch": 0.25706275609230106, + "grad_norm": 0.17021185159683228, + "learning_rate": 0.0001477054608677227, + "loss": 2.2199, + "step": 596 + }, + { + "epoch": 0.2574940694414492, + "grad_norm": 0.1993943303823471, + "learning_rate": 0.00014769707621376473, + "loss": 2.3534, + "step": 597 + }, + { + "epoch": 0.25792538279059735, + "grad_norm": 0.17457185685634613, + "learning_rate": 0.00014768867650708214, + "loss": 2.1193, + "step": 598 + }, + { + "epoch": 0.25835669613974555, + "grad_norm": 0.16530904173851013, + "learning_rate": 0.00014768026174941416, + "loss": 2.2143, + "step": 599 + }, + { + "epoch": 0.2587880094888937, + "grad_norm": 0.16474269330501556, + "learning_rate": 0.00014767183194250316, + "loss": 2.2017, + "step": 600 + }, + { + "epoch": 0.2587880094888937, + "eval_loss": 2.1378190517425537, + "eval_runtime": 196.4733, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 600 + }, + { + "epoch": 0.25921932283804183, + "grad_norm": 0.1671011596918106, + "learning_rate": 0.00014766338708809465, + "loss": 2.0947, + "step": 601 + }, + { + "epoch": 0.25965063618719, + "grad_norm": 0.15906308591365814, + "learning_rate": 0.00014765492718793718, + "loss": 2.128, + "step": 602 + }, + { + "epoch": 0.2600819495363382, + "grad_norm": 0.15791428089141846, + "learning_rate": 0.00014764645224378252, + "loss": 2.1973, + "step": 603 + }, + { + "epoch": 0.2605132628854863, + "grad_norm": 0.2012777179479599, + "learning_rate": 0.00014763796225738542, + "loss": 2.2386, + "step": 604 + }, + { + "epoch": 0.26094457623463446, + "grad_norm": 0.18990804255008698, + "learning_rate": 0.00014762945723050395, + "loss": 2.4138, + "step": 605 + }, + { + "epoch": 0.2613758895837826, + "grad_norm": 0.17072105407714844, + "learning_rate": 0.00014762093716489907, + "loss": 2.1979, + "step": 606 + }, + { + "epoch": 0.26180720293293075, + "grad_norm": 0.1727512776851654, + "learning_rate": 0.00014761240206233499, + "loss": 2.3602, + "step": 607 + }, + { + "epoch": 0.26223851628207895, + "grad_norm": 0.1772853136062622, + "learning_rate": 0.00014760385192457898, + "loss": 2.2454, + "step": 608 + }, + { + "epoch": 0.2626698296312271, + "grad_norm": 0.91371750831604, + "learning_rate": 0.00014759528675340145, + "loss": 2.3401, + "step": 609 + }, + { + "epoch": 0.26310114298037524, + "grad_norm": 0.1926833838224411, + "learning_rate": 0.0001475867065505759, + "loss": 2.3298, + "step": 610 + }, + { + "epoch": 0.2635324563295234, + "grad_norm": 0.17731206119060516, + "learning_rate": 0.00014757811131787895, + "loss": 2.3576, + "step": 611 + }, + { + "epoch": 0.2639637696786716, + "grad_norm": 0.1678379476070404, + "learning_rate": 0.00014756950105709036, + "loss": 2.1995, + "step": 612 + }, + { + "epoch": 0.2643950830278197, + "grad_norm": 0.1588241308927536, + "learning_rate": 0.000147560875769993, + "loss": 2.3786, + "step": 613 + }, + { + "epoch": 0.26482639637696787, + "grad_norm": 0.16079789400100708, + "learning_rate": 0.00014755223545837278, + "loss": 2.3308, + "step": 614 + }, + { + "epoch": 0.265257709726116, + "grad_norm": 0.18283496797084808, + "learning_rate": 0.0001475435801240188, + "loss": 2.2975, + "step": 615 + }, + { + "epoch": 0.26568902307526415, + "grad_norm": 0.18275459110736847, + "learning_rate": 0.00014753490976872322, + "loss": 2.4674, + "step": 616 + }, + { + "epoch": 0.26612033642441235, + "grad_norm": 0.18903575837612152, + "learning_rate": 0.00014752622439428132, + "loss": 2.2237, + "step": 617 + }, + { + "epoch": 0.2665516497735605, + "grad_norm": 0.1476614773273468, + "learning_rate": 0.00014751752400249155, + "loss": 2.0617, + "step": 618 + }, + { + "epoch": 0.26698296312270864, + "grad_norm": 0.18297837674617767, + "learning_rate": 0.0001475088085951554, + "loss": 2.163, + "step": 619 + }, + { + "epoch": 0.2674142764718568, + "grad_norm": 0.18463227152824402, + "learning_rate": 0.0001475000781740775, + "loss": 2.4564, + "step": 620 + }, + { + "epoch": 0.267845589821005, + "grad_norm": 0.18771764636039734, + "learning_rate": 0.00014749133274106555, + "loss": 2.13, + "step": 621 + }, + { + "epoch": 0.2682769031701531, + "grad_norm": 0.15326857566833496, + "learning_rate": 0.0001474825722979304, + "loss": 2.3268, + "step": 622 + }, + { + "epoch": 0.26870821651930127, + "grad_norm": 0.16682656109333038, + "learning_rate": 0.000147473796846486, + "loss": 2.396, + "step": 623 + }, + { + "epoch": 0.2691395298684494, + "grad_norm": 0.21941545605659485, + "learning_rate": 0.0001474650063885494, + "loss": 2.206, + "step": 624 + }, + { + "epoch": 0.2695708432175976, + "grad_norm": 0.15367084741592407, + "learning_rate": 0.00014745620092594078, + "loss": 2.2849, + "step": 625 + }, + { + "epoch": 0.2695708432175976, + "eval_loss": 2.136373519897461, + "eval_runtime": 196.1623, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 625 + }, + { + "epoch": 0.27000215656674575, + "grad_norm": 0.1778152734041214, + "learning_rate": 0.0001474473804604834, + "loss": 2.2033, + "step": 626 + }, + { + "epoch": 0.2704334699158939, + "grad_norm": 0.16819840669631958, + "learning_rate": 0.00014743854499400357, + "loss": 2.1546, + "step": 627 + }, + { + "epoch": 0.27086478326504204, + "grad_norm": 0.191078320145607, + "learning_rate": 0.00014742969452833087, + "loss": 2.2636, + "step": 628 + }, + { + "epoch": 0.2712960966141902, + "grad_norm": 0.18553073704242706, + "learning_rate": 0.00014742082906529785, + "loss": 2.2205, + "step": 629 + }, + { + "epoch": 0.2717274099633384, + "grad_norm": 0.15881304442882538, + "learning_rate": 0.00014741194860674016, + "loss": 2.2768, + "step": 630 + }, + { + "epoch": 0.27215872331248653, + "grad_norm": 0.1653841882944107, + "learning_rate": 0.00014740305315449664, + "loss": 2.1803, + "step": 631 + }, + { + "epoch": 0.27259003666163467, + "grad_norm": 0.20279088616371155, + "learning_rate": 0.00014739414271040918, + "loss": 2.3493, + "step": 632 + }, + { + "epoch": 0.2730213500107828, + "grad_norm": 0.16524416208267212, + "learning_rate": 0.0001473852172763228, + "loss": 2.2841, + "step": 633 + }, + { + "epoch": 0.273452663359931, + "grad_norm": 0.1860021948814392, + "learning_rate": 0.00014737627685408556, + "loss": 2.178, + "step": 634 + }, + { + "epoch": 0.27388397670907916, + "grad_norm": 0.16222679615020752, + "learning_rate": 0.00014736732144554873, + "loss": 2.2774, + "step": 635 + }, + { + "epoch": 0.2743152900582273, + "grad_norm": 0.1624583899974823, + "learning_rate": 0.00014735835105256657, + "loss": 2.329, + "step": 636 + }, + { + "epoch": 0.27474660340737544, + "grad_norm": 0.17073509097099304, + "learning_rate": 0.00014734936567699657, + "loss": 2.1383, + "step": 637 + }, + { + "epoch": 0.2751779167565236, + "grad_norm": 0.15490107238292694, + "learning_rate": 0.00014734036532069916, + "loss": 2.1866, + "step": 638 + }, + { + "epoch": 0.2756092301056718, + "grad_norm": 0.17504987120628357, + "learning_rate": 0.00014733134998553803, + "loss": 2.3105, + "step": 639 + }, + { + "epoch": 0.27604054345481993, + "grad_norm": 0.16620095074176788, + "learning_rate": 0.00014732231967337989, + "loss": 2.1842, + "step": 640 + }, + { + "epoch": 0.2764718568039681, + "grad_norm": 0.15766702592372894, + "learning_rate": 0.00014731327438609454, + "loss": 2.2546, + "step": 641 + }, + { + "epoch": 0.2769031701531162, + "grad_norm": 0.18117624521255493, + "learning_rate": 0.00014730421412555492, + "loss": 2.2438, + "step": 642 + }, + { + "epoch": 0.2773344835022644, + "grad_norm": 0.1685836762189865, + "learning_rate": 0.00014729513889363708, + "loss": 2.1841, + "step": 643 + }, + { + "epoch": 0.27776579685141256, + "grad_norm": 0.1785241663455963, + "learning_rate": 0.00014728604869222012, + "loss": 2.3096, + "step": 644 + }, + { + "epoch": 0.2781971102005607, + "grad_norm": 0.1662873923778534, + "learning_rate": 0.00014727694352318626, + "loss": 2.3142, + "step": 645 + }, + { + "epoch": 0.27862842354970885, + "grad_norm": 0.15653882920742035, + "learning_rate": 0.00014726782338842087, + "loss": 2.1199, + "step": 646 + }, + { + "epoch": 0.27905973689885705, + "grad_norm": 0.19263307750225067, + "learning_rate": 0.00014725868828981232, + "loss": 2.3715, + "step": 647 + }, + { + "epoch": 0.2794910502480052, + "grad_norm": 0.18505576252937317, + "learning_rate": 0.00014724953822925214, + "loss": 2.3807, + "step": 648 + }, + { + "epoch": 0.27992236359715333, + "grad_norm": 0.16494807600975037, + "learning_rate": 0.00014724037320863498, + "loss": 2.3978, + "step": 649 + }, + { + "epoch": 0.2803536769463015, + "grad_norm": 0.16470667719841003, + "learning_rate": 0.00014723119322985852, + "loss": 2.4136, + "step": 650 + }, + { + "epoch": 0.2803536769463015, + "eval_loss": 2.134341239929199, + "eval_runtime": 203.163, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 650 + }, + { + "epoch": 0.2807849902954496, + "grad_norm": 0.16153836250305176, + "learning_rate": 0.0001472219982948236, + "loss": 2.1043, + "step": 651 + }, + { + "epoch": 0.2812163036445978, + "grad_norm": 0.17460724711418152, + "learning_rate": 0.00014721278840543414, + "loss": 2.0528, + "step": 652 + }, + { + "epoch": 0.28164761699374596, + "grad_norm": 0.1704031229019165, + "learning_rate": 0.00014720356356359715, + "loss": 2.5944, + "step": 653 + }, + { + "epoch": 0.2820789303428941, + "grad_norm": 0.17145408689975739, + "learning_rate": 0.00014719432377122268, + "loss": 2.4141, + "step": 654 + }, + { + "epoch": 0.28251024369204225, + "grad_norm": 0.16281622648239136, + "learning_rate": 0.00014718506903022398, + "loss": 2.4925, + "step": 655 + }, + { + "epoch": 0.28294155704119045, + "grad_norm": 0.17861981689929962, + "learning_rate": 0.00014717579934251735, + "loss": 2.1748, + "step": 656 + }, + { + "epoch": 0.2833728703903386, + "grad_norm": 0.15419550240039825, + "learning_rate": 0.00014716651471002214, + "loss": 2.0366, + "step": 657 + }, + { + "epoch": 0.28380418373948674, + "grad_norm": 0.18950116634368896, + "learning_rate": 0.0001471572151346609, + "loss": 2.0928, + "step": 658 + }, + { + "epoch": 0.2842354970886349, + "grad_norm": 0.1749858558177948, + "learning_rate": 0.00014714790061835914, + "loss": 2.164, + "step": 659 + }, + { + "epoch": 0.284666810437783, + "grad_norm": 0.18648329377174377, + "learning_rate": 0.00014713857116304554, + "loss": 2.2707, + "step": 660 + }, + { + "epoch": 0.2850981237869312, + "grad_norm": 0.15878723561763763, + "learning_rate": 0.00014712922677065192, + "loss": 2.3001, + "step": 661 + }, + { + "epoch": 0.28552943713607937, + "grad_norm": 0.16697610914707184, + "learning_rate": 0.0001471198674431131, + "loss": 2.4064, + "step": 662 + }, + { + "epoch": 0.2859607504852275, + "grad_norm": 0.16395072638988495, + "learning_rate": 0.00014711049318236706, + "loss": 2.3017, + "step": 663 + }, + { + "epoch": 0.28639206383437565, + "grad_norm": 0.16236403584480286, + "learning_rate": 0.00014710110399035478, + "loss": 2.2722, + "step": 664 + }, + { + "epoch": 0.28682337718352385, + "grad_norm": 0.16682536900043488, + "learning_rate": 0.00014709169986902042, + "loss": 2.3292, + "step": 665 + }, + { + "epoch": 0.287254690532672, + "grad_norm": 0.16841153800487518, + "learning_rate": 0.00014708228082031127, + "loss": 2.3933, + "step": 666 + }, + { + "epoch": 0.28768600388182014, + "grad_norm": 0.16072210669517517, + "learning_rate": 0.00014707284684617756, + "loss": 2.2569, + "step": 667 + }, + { + "epoch": 0.2881173172309683, + "grad_norm": 0.15559151768684387, + "learning_rate": 0.00014706339794857276, + "loss": 2.3392, + "step": 668 + }, + { + "epoch": 0.2885486305801164, + "grad_norm": 0.159805566072464, + "learning_rate": 0.00014705393412945333, + "loss": 2.2788, + "step": 669 + }, + { + "epoch": 0.2889799439292646, + "grad_norm": 0.1495869755744934, + "learning_rate": 0.0001470444553907789, + "loss": 2.0518, + "step": 670 + }, + { + "epoch": 0.28941125727841277, + "grad_norm": 0.17200689017772675, + "learning_rate": 0.00014703496173451206, + "loss": 2.3155, + "step": 671 + }, + { + "epoch": 0.2898425706275609, + "grad_norm": 0.15233959257602692, + "learning_rate": 0.00014702545316261869, + "loss": 2.1359, + "step": 672 + }, + { + "epoch": 0.29027388397670906, + "grad_norm": 0.16399161517620087, + "learning_rate": 0.00014701592967706755, + "loss": 2.1428, + "step": 673 + }, + { + "epoch": 0.29070519732585726, + "grad_norm": 0.165985107421875, + "learning_rate": 0.00014700639127983062, + "loss": 2.1885, + "step": 674 + }, + { + "epoch": 0.2911365106750054, + "grad_norm": 0.16751165688037872, + "learning_rate": 0.00014699683797288294, + "loss": 2.3933, + "step": 675 + }, + { + "epoch": 0.2911365106750054, + "eval_loss": 2.132812023162842, + "eval_runtime": 200.6349, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 675 + }, + { + "epoch": 0.29156782402415354, + "grad_norm": 0.19334676861763, + "learning_rate": 0.0001469872697582026, + "loss": 2.2254, + "step": 676 + }, + { + "epoch": 0.2919991373733017, + "grad_norm": 0.15076959133148193, + "learning_rate": 0.0001469776866377708, + "loss": 2.3313, + "step": 677 + }, + { + "epoch": 0.2924304507224499, + "grad_norm": 0.17511612176895142, + "learning_rate": 0.0001469680886135719, + "loss": 2.4024, + "step": 678 + }, + { + "epoch": 0.29286176407159803, + "grad_norm": 0.1722462773323059, + "learning_rate": 0.00014695847568759317, + "loss": 2.2334, + "step": 679 + }, + { + "epoch": 0.2932930774207462, + "grad_norm": 0.15810130536556244, + "learning_rate": 0.00014694884786182517, + "loss": 2.405, + "step": 680 + }, + { + "epoch": 0.2937243907698943, + "grad_norm": 0.19121390581130981, + "learning_rate": 0.00014693920513826137, + "loss": 2.1286, + "step": 681 + }, + { + "epoch": 0.29415570411904246, + "grad_norm": 0.15669195353984833, + "learning_rate": 0.00014692954751889843, + "loss": 2.1573, + "step": 682 + }, + { + "epoch": 0.29458701746819066, + "grad_norm": 0.16549846529960632, + "learning_rate": 0.00014691987500573607, + "loss": 2.2062, + "step": 683 + }, + { + "epoch": 0.2950183308173388, + "grad_norm": 0.1586558222770691, + "learning_rate": 0.0001469101876007771, + "loss": 2.127, + "step": 684 + }, + { + "epoch": 0.29544964416648695, + "grad_norm": 0.16695398092269897, + "learning_rate": 0.0001469004853060274, + "loss": 2.4235, + "step": 685 + }, + { + "epoch": 0.2958809575156351, + "grad_norm": 0.17500896751880646, + "learning_rate": 0.0001468907681234959, + "loss": 2.3542, + "step": 686 + }, + { + "epoch": 0.2963122708647833, + "grad_norm": 0.1749137043952942, + "learning_rate": 0.0001468810360551947, + "loss": 2.1788, + "step": 687 + }, + { + "epoch": 0.29674358421393143, + "grad_norm": 0.15084341168403625, + "learning_rate": 0.0001468712891031389, + "loss": 2.2913, + "step": 688 + }, + { + "epoch": 0.2971748975630796, + "grad_norm": 0.1857728809118271, + "learning_rate": 0.0001468615272693467, + "loss": 2.3329, + "step": 689 + }, + { + "epoch": 0.2976062109122277, + "grad_norm": 0.1701570600271225, + "learning_rate": 0.00014685175055583944, + "loss": 2.0987, + "step": 690 + }, + { + "epoch": 0.29803752426137586, + "grad_norm": 0.18066200613975525, + "learning_rate": 0.00014684195896464146, + "loss": 2.199, + "step": 691 + }, + { + "epoch": 0.29846883761052406, + "grad_norm": 0.15877336263656616, + "learning_rate": 0.00014683215249778022, + "loss": 2.1216, + "step": 692 + }, + { + "epoch": 0.2989001509596722, + "grad_norm": 0.16353558003902435, + "learning_rate": 0.00014682233115728628, + "loss": 2.1863, + "step": 693 + }, + { + "epoch": 0.29933146430882035, + "grad_norm": 0.16829341650009155, + "learning_rate": 0.00014681249494519322, + "loss": 2.2876, + "step": 694 + }, + { + "epoch": 0.2997627776579685, + "grad_norm": 0.1578504890203476, + "learning_rate": 0.00014680264386353776, + "loss": 2.3461, + "step": 695 + }, + { + "epoch": 0.3001940910071167, + "grad_norm": 0.20184138417243958, + "learning_rate": 0.00014679277791435966, + "loss": 2.2431, + "step": 696 + }, + { + "epoch": 0.30062540435626484, + "grad_norm": 0.1779334396123886, + "learning_rate": 0.0001467828970997018, + "loss": 2.3081, + "step": 697 + }, + { + "epoch": 0.301056717705413, + "grad_norm": 0.17701883614063263, + "learning_rate": 0.00014677300142161006, + "loss": 2.2477, + "step": 698 + }, + { + "epoch": 0.3014880310545611, + "grad_norm": 0.17183318734169006, + "learning_rate": 0.00014676309088213353, + "loss": 2.0537, + "step": 699 + }, + { + "epoch": 0.3019193444037093, + "grad_norm": 0.17507003247737885, + "learning_rate": 0.00014675316548332418, + "loss": 2.2578, + "step": 700 + }, + { + "epoch": 0.3019193444037093, + "eval_loss": 2.131542682647705, + "eval_runtime": 204.1156, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 700 + }, + { + "epoch": 0.30235065775285747, + "grad_norm": 0.18731343746185303, + "learning_rate": 0.0001467432252272373, + "loss": 2.3435, + "step": 701 + }, + { + "epoch": 0.3027819711020056, + "grad_norm": 0.1506258249282837, + "learning_rate": 0.00014673327011593103, + "loss": 2.1025, + "step": 702 + }, + { + "epoch": 0.30321328445115375, + "grad_norm": 0.15649913251399994, + "learning_rate": 0.00014672330015146672, + "loss": 2.0297, + "step": 703 + }, + { + "epoch": 0.3036445978003019, + "grad_norm": 0.17421182990074158, + "learning_rate": 0.00014671331533590878, + "loss": 2.1474, + "step": 704 + }, + { + "epoch": 0.3040759111494501, + "grad_norm": 0.18708555400371552, + "learning_rate": 0.00014670331567132465, + "loss": 2.0628, + "step": 705 + }, + { + "epoch": 0.30450722449859824, + "grad_norm": 0.17888276278972626, + "learning_rate": 0.00014669330115978484, + "loss": 2.2094, + "step": 706 + }, + { + "epoch": 0.3049385378477464, + "grad_norm": 0.16658446192741394, + "learning_rate": 0.00014668327180336304, + "loss": 2.2866, + "step": 707 + }, + { + "epoch": 0.3053698511968945, + "grad_norm": 0.18156693875789642, + "learning_rate": 0.0001466732276041359, + "loss": 2.3104, + "step": 708 + }, + { + "epoch": 0.3058011645460427, + "grad_norm": 0.16488181054592133, + "learning_rate": 0.00014666316856418315, + "loss": 2.1646, + "step": 709 + }, + { + "epoch": 0.30623247789519087, + "grad_norm": 0.15647055208683014, + "learning_rate": 0.0001466530946855877, + "loss": 2.3348, + "step": 710 + }, + { + "epoch": 0.306663791244339, + "grad_norm": 0.16166602075099945, + "learning_rate": 0.00014664300597043537, + "loss": 2.3414, + "step": 711 + }, + { + "epoch": 0.30709510459348716, + "grad_norm": 0.160395085811615, + "learning_rate": 0.00014663290242081519, + "loss": 2.3181, + "step": 712 + }, + { + "epoch": 0.3075264179426353, + "grad_norm": 0.21888132393360138, + "learning_rate": 0.0001466227840388192, + "loss": 2.1063, + "step": 713 + }, + { + "epoch": 0.3079577312917835, + "grad_norm": 0.17477422952651978, + "learning_rate": 0.00014661265082654255, + "loss": 2.1944, + "step": 714 + }, + { + "epoch": 0.30838904464093164, + "grad_norm": 0.15590433776378632, + "learning_rate": 0.00014660250278608335, + "loss": 2.3067, + "step": 715 + }, + { + "epoch": 0.3088203579900798, + "grad_norm": 0.17154192924499512, + "learning_rate": 0.00014659233991954295, + "loss": 2.2371, + "step": 716 + }, + { + "epoch": 0.30925167133922793, + "grad_norm": 0.18001079559326172, + "learning_rate": 0.00014658216222902562, + "loss": 2.3827, + "step": 717 + }, + { + "epoch": 0.30968298468837613, + "grad_norm": 0.19673855602741241, + "learning_rate": 0.00014657196971663882, + "loss": 2.2921, + "step": 718 + }, + { + "epoch": 0.31011429803752427, + "grad_norm": 0.2262069284915924, + "learning_rate": 0.000146561762384493, + "loss": 2.2864, + "step": 719 + }, + { + "epoch": 0.3105456113866724, + "grad_norm": 0.17984148859977722, + "learning_rate": 0.00014655154023470167, + "loss": 1.9371, + "step": 720 + }, + { + "epoch": 0.31097692473582056, + "grad_norm": 0.17629437148571014, + "learning_rate": 0.00014654130326938146, + "loss": 2.1894, + "step": 721 + }, + { + "epoch": 0.3114082380849687, + "grad_norm": 0.17931878566741943, + "learning_rate": 0.00014653105149065206, + "loss": 2.0503, + "step": 722 + }, + { + "epoch": 0.3118395514341169, + "grad_norm": 0.15631668269634247, + "learning_rate": 0.00014652078490063619, + "loss": 2.1298, + "step": 723 + }, + { + "epoch": 0.31227086478326505, + "grad_norm": 0.15321604907512665, + "learning_rate": 0.00014651050350145966, + "loss": 2.245, + "step": 724 + }, + { + "epoch": 0.3127021781324132, + "grad_norm": 0.16721682250499725, + "learning_rate": 0.00014650020729525137, + "loss": 2.2935, + "step": 725 + }, + { + "epoch": 0.3127021781324132, + "eval_loss": 2.13155460357666, + "eval_runtime": 205.1729, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 725 + }, + { + "epoch": 0.31313349148156133, + "grad_norm": 0.17930914461612701, + "learning_rate": 0.00014648989628414323, + "loss": 2.1548, + "step": 726 + }, + { + "epoch": 0.31356480483070953, + "grad_norm": 0.16474506258964539, + "learning_rate": 0.00014647957047027028, + "loss": 2.3057, + "step": 727 + }, + { + "epoch": 0.3139961181798577, + "grad_norm": 0.1486273854970932, + "learning_rate": 0.00014646922985577058, + "loss": 2.1771, + "step": 728 + }, + { + "epoch": 0.3144274315290058, + "grad_norm": 0.1768963783979416, + "learning_rate": 0.00014645887444278524, + "loss": 2.0942, + "step": 729 + }, + { + "epoch": 0.31485874487815396, + "grad_norm": 0.16324689984321594, + "learning_rate": 0.0001464485042334585, + "loss": 2.0724, + "step": 730 + }, + { + "epoch": 0.31529005822730216, + "grad_norm": 0.15467900037765503, + "learning_rate": 0.00014643811922993762, + "loss": 2.1993, + "step": 731 + }, + { + "epoch": 0.3157213715764503, + "grad_norm": 0.15890255570411682, + "learning_rate": 0.00014642771943437291, + "loss": 2.2111, + "step": 732 + }, + { + "epoch": 0.31615268492559845, + "grad_norm": 0.1673458218574524, + "learning_rate": 0.00014641730484891776, + "loss": 2.0386, + "step": 733 + }, + { + "epoch": 0.3165839982747466, + "grad_norm": 0.21124054491519928, + "learning_rate": 0.00014640687547572866, + "loss": 2.2538, + "step": 734 + }, + { + "epoch": 0.31701531162389474, + "grad_norm": 0.17169217765331268, + "learning_rate": 0.0001463964313169651, + "loss": 2.1802, + "step": 735 + }, + { + "epoch": 0.31744662497304293, + "grad_norm": 0.18922211229801178, + "learning_rate": 0.00014638597237478964, + "loss": 2.3256, + "step": 736 + }, + { + "epoch": 0.3178779383221911, + "grad_norm": 0.16592665016651154, + "learning_rate": 0.00014637549865136794, + "loss": 2.4164, + "step": 737 + }, + { + "epoch": 0.3183092516713392, + "grad_norm": 0.19938214123249054, + "learning_rate": 0.0001463650101488687, + "loss": 2.2966, + "step": 738 + }, + { + "epoch": 0.31874056502048737, + "grad_norm": 0.18568481504917145, + "learning_rate": 0.00014635450686946365, + "loss": 2.2379, + "step": 739 + }, + { + "epoch": 0.31917187836963556, + "grad_norm": 0.15624523162841797, + "learning_rate": 0.00014634398881532766, + "loss": 2.0803, + "step": 740 + }, + { + "epoch": 0.3196031917187837, + "grad_norm": 0.14758580923080444, + "learning_rate": 0.00014633345598863857, + "loss": 1.9887, + "step": 741 + }, + { + "epoch": 0.32003450506793185, + "grad_norm": 0.1929500699043274, + "learning_rate": 0.00014632290839157734, + "loss": 2.4384, + "step": 742 + }, + { + "epoch": 0.32046581841708, + "grad_norm": 0.20792317390441895, + "learning_rate": 0.00014631234602632796, + "loss": 2.1976, + "step": 743 + }, + { + "epoch": 0.32089713176622814, + "grad_norm": 0.1632717400789261, + "learning_rate": 0.00014630176889507743, + "loss": 2.3856, + "step": 744 + }, + { + "epoch": 0.32132844511537634, + "grad_norm": 0.1877809315919876, + "learning_rate": 0.00014629117700001593, + "loss": 2.2074, + "step": 745 + }, + { + "epoch": 0.3217597584645245, + "grad_norm": 0.16195634007453918, + "learning_rate": 0.00014628057034333665, + "loss": 2.1698, + "step": 746 + }, + { + "epoch": 0.3221910718136726, + "grad_norm": 0.17172454297542572, + "learning_rate": 0.0001462699489272357, + "loss": 2.2282, + "step": 747 + }, + { + "epoch": 0.32262238516282077, + "grad_norm": 0.1560070663690567, + "learning_rate": 0.0001462593127539125, + "loss": 2.0006, + "step": 748 + }, + { + "epoch": 0.32305369851196897, + "grad_norm": 0.15756119787693024, + "learning_rate": 0.00014624866182556926, + "loss": 2.27, + "step": 749 + }, + { + "epoch": 0.3234850118611171, + "grad_norm": 0.16225910186767578, + "learning_rate": 0.00014623799614441144, + "loss": 2.3256, + "step": 750 + }, + { + "epoch": 0.3234850118611171, + "eval_loss": 2.131229877471924, + "eval_runtime": 204.3866, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 750 + }, + { + "epoch": 0.32391632521026525, + "grad_norm": 0.1817995011806488, + "learning_rate": 0.00014622731571264748, + "loss": 2.3521, + "step": 751 + }, + { + "epoch": 0.3243476385594134, + "grad_norm": 0.18910230696201324, + "learning_rate": 0.00014621662053248888, + "loss": 2.4786, + "step": 752 + }, + { + "epoch": 0.3247789519085616, + "grad_norm": 0.16400721669197083, + "learning_rate": 0.00014620591060615016, + "loss": 2.3265, + "step": 753 + }, + { + "epoch": 0.32521026525770974, + "grad_norm": 0.15586580336093903, + "learning_rate": 0.000146195185935849, + "loss": 2.2058, + "step": 754 + }, + { + "epoch": 0.3256415786068579, + "grad_norm": 1.2911137342453003, + "learning_rate": 0.000146184446523806, + "loss": 2.1915, + "step": 755 + }, + { + "epoch": 0.32607289195600603, + "grad_norm": 0.18798713386058807, + "learning_rate": 0.00014617369237224486, + "loss": 2.3488, + "step": 756 + }, + { + "epoch": 0.32650420530515417, + "grad_norm": 0.16819198429584503, + "learning_rate": 0.00014616292348339238, + "loss": 2.3031, + "step": 757 + }, + { + "epoch": 0.32693551865430237, + "grad_norm": 0.16455453634262085, + "learning_rate": 0.00014615213985947836, + "loss": 2.2154, + "step": 758 + }, + { + "epoch": 0.3273668320034505, + "grad_norm": 0.18589816987514496, + "learning_rate": 0.0001461413415027357, + "loss": 2.2736, + "step": 759 + }, + { + "epoch": 0.32779814535259866, + "grad_norm": 0.17672494053840637, + "learning_rate": 0.0001461305284154003, + "loss": 2.2528, + "step": 760 + }, + { + "epoch": 0.3282294587017468, + "grad_norm": 0.15951240062713623, + "learning_rate": 0.0001461197005997111, + "loss": 1.9911, + "step": 761 + }, + { + "epoch": 0.328660772050895, + "grad_norm": 0.1629369705915451, + "learning_rate": 0.00014610885805791012, + "loss": 2.4015, + "step": 762 + }, + { + "epoch": 0.32909208540004314, + "grad_norm": 0.2176440805196762, + "learning_rate": 0.0001460980007922425, + "loss": 2.2203, + "step": 763 + }, + { + "epoch": 0.3295233987491913, + "grad_norm": 0.18058481812477112, + "learning_rate": 0.00014608712880495627, + "loss": 2.3013, + "step": 764 + }, + { + "epoch": 0.32995471209833943, + "grad_norm": 0.1542855054140091, + "learning_rate": 0.0001460762420983026, + "loss": 2.4098, + "step": 765 + }, + { + "epoch": 0.3303860254474876, + "grad_norm": 0.17161045968532562, + "learning_rate": 0.00014606534067453577, + "loss": 2.117, + "step": 766 + }, + { + "epoch": 0.3308173387966358, + "grad_norm": 0.1644153594970703, + "learning_rate": 0.00014605442453591297, + "loss": 2.084, + "step": 767 + }, + { + "epoch": 0.3312486521457839, + "grad_norm": 0.15911726653575897, + "learning_rate": 0.00014604349368469452, + "loss": 2.1107, + "step": 768 + }, + { + "epoch": 0.33167996549493206, + "grad_norm": 0.1879819631576538, + "learning_rate": 0.00014603254812314384, + "loss": 2.303, + "step": 769 + }, + { + "epoch": 0.3321112788440802, + "grad_norm": 0.1810748130083084, + "learning_rate": 0.00014602158785352723, + "loss": 2.3159, + "step": 770 + }, + { + "epoch": 0.3325425921932284, + "grad_norm": 0.16361068189144135, + "learning_rate": 0.0001460106128781142, + "loss": 2.2551, + "step": 771 + }, + { + "epoch": 0.33297390554237655, + "grad_norm": 0.17128886282444, + "learning_rate": 0.0001459996231991772, + "loss": 2.2071, + "step": 772 + }, + { + "epoch": 0.3334052188915247, + "grad_norm": 0.1600296050310135, + "learning_rate": 0.00014598861881899176, + "loss": 2.0581, + "step": 773 + }, + { + "epoch": 0.33383653224067283, + "grad_norm": 0.16864512860774994, + "learning_rate": 0.00014597759973983648, + "loss": 2.1672, + "step": 774 + }, + { + "epoch": 0.334267845589821, + "grad_norm": 0.19236478209495544, + "learning_rate": 0.000145966565963993, + "loss": 2.4652, + "step": 775 + }, + { + "epoch": 0.334267845589821, + "eval_loss": 2.1317968368530273, + "eval_runtime": 204.7723, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 775 + }, + { + "epoch": 0.3346991589389692, + "grad_norm": 0.17529065907001495, + "learning_rate": 0.00014595551749374593, + "loss": 2.4414, + "step": 776 + }, + { + "epoch": 0.3351304722881173, + "grad_norm": 0.16876325011253357, + "learning_rate": 0.00014594445433138302, + "loss": 2.1084, + "step": 777 + }, + { + "epoch": 0.33556178563726546, + "grad_norm": 0.16747407615184784, + "learning_rate": 0.00014593337647919501, + "loss": 2.2267, + "step": 778 + }, + { + "epoch": 0.3359930989864136, + "grad_norm": 0.17824633419513702, + "learning_rate": 0.00014592228393947568, + "loss": 2.2853, + "step": 779 + }, + { + "epoch": 0.3364244123355618, + "grad_norm": 0.1787227839231491, + "learning_rate": 0.00014591117671452187, + "loss": 2.2528, + "step": 780 + }, + { + "epoch": 0.33685572568470995, + "grad_norm": 0.1657239943742752, + "learning_rate": 0.00014590005480663348, + "loss": 2.2671, + "step": 781 + }, + { + "epoch": 0.3372870390338581, + "grad_norm": 0.1929490864276886, + "learning_rate": 0.00014588891821811333, + "loss": 2.2231, + "step": 782 + }, + { + "epoch": 0.33771835238300624, + "grad_norm": 0.17597073316574097, + "learning_rate": 0.00014587776695126748, + "loss": 2.2238, + "step": 783 + }, + { + "epoch": 0.33814966573215444, + "grad_norm": 0.17662476003170013, + "learning_rate": 0.00014586660100840486, + "loss": 2.1492, + "step": 784 + }, + { + "epoch": 0.3385809790813026, + "grad_norm": 0.16856659948825836, + "learning_rate": 0.00014585542039183752, + "loss": 2.2535, + "step": 785 + }, + { + "epoch": 0.3390122924304507, + "grad_norm": 0.17280738055706024, + "learning_rate": 0.00014584422510388053, + "loss": 2.178, + "step": 786 + }, + { + "epoch": 0.33944360577959887, + "grad_norm": 0.17797470092773438, + "learning_rate": 0.000145833015146852, + "loss": 2.3755, + "step": 787 + }, + { + "epoch": 0.339874919128747, + "grad_norm": 0.1805947721004486, + "learning_rate": 0.00014582179052307305, + "loss": 2.195, + "step": 788 + }, + { + "epoch": 0.3403062324778952, + "grad_norm": 0.18467366695404053, + "learning_rate": 0.0001458105512348679, + "loss": 2.2721, + "step": 789 + }, + { + "epoch": 0.34073754582704335, + "grad_norm": 0.15401802957057953, + "learning_rate": 0.0001457992972845637, + "loss": 2.1344, + "step": 790 + }, + { + "epoch": 0.3411688591761915, + "grad_norm": 0.19222162663936615, + "learning_rate": 0.0001457880286744908, + "loss": 2.2172, + "step": 791 + }, + { + "epoch": 0.34160017252533964, + "grad_norm": 0.1638176292181015, + "learning_rate": 0.00014577674540698244, + "loss": 2.0332, + "step": 792 + }, + { + "epoch": 0.34203148587448784, + "grad_norm": 0.2038831114768982, + "learning_rate": 0.0001457654474843749, + "loss": 1.9864, + "step": 793 + }, + { + "epoch": 0.342462799223636, + "grad_norm": 0.2040897160768509, + "learning_rate": 0.0001457541349090076, + "loss": 2.1794, + "step": 794 + }, + { + "epoch": 0.3428941125727841, + "grad_norm": 0.17992299795150757, + "learning_rate": 0.00014574280768322293, + "loss": 2.3073, + "step": 795 + }, + { + "epoch": 0.34332542592193227, + "grad_norm": 0.1908801645040512, + "learning_rate": 0.00014573146580936628, + "loss": 2.3596, + "step": 796 + }, + { + "epoch": 0.3437567392710804, + "grad_norm": 0.1825549155473709, + "learning_rate": 0.0001457201092897861, + "loss": 2.1582, + "step": 797 + }, + { + "epoch": 0.3441880526202286, + "grad_norm": 0.1776050329208374, + "learning_rate": 0.00014570873812683397, + "loss": 2.1406, + "step": 798 + }, + { + "epoch": 0.34461936596937676, + "grad_norm": 0.16008296608924866, + "learning_rate": 0.00014569735232286432, + "loss": 2.3458, + "step": 799 + }, + { + "epoch": 0.3450506793185249, + "grad_norm": 0.1878412365913391, + "learning_rate": 0.00014568595188023473, + "loss": 2.0734, + "step": 800 + }, + { + "epoch": 0.3450506793185249, + "eval_loss": 2.130296230316162, + "eval_runtime": 204.5115, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 800 + }, + { + "epoch": 0.34548199266767304, + "grad_norm": 0.15881088376045227, + "learning_rate": 0.00014567453680130584, + "loss": 2.2351, + "step": 801 + }, + { + "epoch": 0.34591330601682124, + "grad_norm": 0.2246032953262329, + "learning_rate": 0.00014566310708844117, + "loss": 2.4797, + "step": 802 + }, + { + "epoch": 0.3463446193659694, + "grad_norm": 0.16534580290317535, + "learning_rate": 0.00014565166274400744, + "loss": 2.3781, + "step": 803 + }, + { + "epoch": 0.34677593271511753, + "grad_norm": 0.17714694142341614, + "learning_rate": 0.00014564020377037432, + "loss": 2.2876, + "step": 804 + }, + { + "epoch": 0.3472072460642657, + "grad_norm": 0.16253145039081573, + "learning_rate": 0.00014562873016991446, + "loss": 2.3543, + "step": 805 + }, + { + "epoch": 0.3476385594134139, + "grad_norm": 0.15457387268543243, + "learning_rate": 0.00014561724194500368, + "loss": 2.0988, + "step": 806 + }, + { + "epoch": 0.348069872762562, + "grad_norm": 0.1657828837633133, + "learning_rate": 0.00014560573909802065, + "loss": 2.1691, + "step": 807 + }, + { + "epoch": 0.34850118611171016, + "grad_norm": 0.15933750569820404, + "learning_rate": 0.00014559422163134723, + "loss": 2.1606, + "step": 808 + }, + { + "epoch": 0.3489324994608583, + "grad_norm": 0.18061064183712006, + "learning_rate": 0.00014558268954736824, + "loss": 2.3919, + "step": 809 + }, + { + "epoch": 0.34936381281000645, + "grad_norm": 0.16154713928699493, + "learning_rate": 0.0001455711428484715, + "loss": 2.1546, + "step": 810 + }, + { + "epoch": 0.34979512615915465, + "grad_norm": 0.1660330444574356, + "learning_rate": 0.00014555958153704787, + "loss": 2.1342, + "step": 811 + }, + { + "epoch": 0.3502264395083028, + "grad_norm": 0.1714458018541336, + "learning_rate": 0.0001455480056154912, + "loss": 2.1792, + "step": 812 + }, + { + "epoch": 0.35065775285745093, + "grad_norm": 0.15950365364551544, + "learning_rate": 0.00014553641508619852, + "loss": 2.458, + "step": 813 + }, + { + "epoch": 0.3510890662065991, + "grad_norm": 0.15440607070922852, + "learning_rate": 0.00014552480995156975, + "loss": 2.388, + "step": 814 + }, + { + "epoch": 0.3515203795557473, + "grad_norm": 0.15359917283058167, + "learning_rate": 0.00014551319021400777, + "loss": 2.2415, + "step": 815 + }, + { + "epoch": 0.3519516929048954, + "grad_norm": 0.1715104579925537, + "learning_rate": 0.00014550155587591867, + "loss": 2.2896, + "step": 816 + }, + { + "epoch": 0.35238300625404356, + "grad_norm": 0.16861777007579803, + "learning_rate": 0.00014548990693971143, + "loss": 2.2502, + "step": 817 + }, + { + "epoch": 0.3528143196031917, + "grad_norm": 0.18368162214756012, + "learning_rate": 0.00014547824340779811, + "loss": 2.2043, + "step": 818 + }, + { + "epoch": 0.35324563295233985, + "grad_norm": 0.15260176360607147, + "learning_rate": 0.00014546656528259376, + "loss": 2.1346, + "step": 819 + }, + { + "epoch": 0.35367694630148805, + "grad_norm": 0.15488527715206146, + "learning_rate": 0.00014545487256651643, + "loss": 2.146, + "step": 820 + }, + { + "epoch": 0.3541082596506362, + "grad_norm": 0.16105197370052338, + "learning_rate": 0.00014544316526198726, + "loss": 2.2272, + "step": 821 + }, + { + "epoch": 0.35453957299978434, + "grad_norm": 0.17276594042778015, + "learning_rate": 0.00014543144337143037, + "loss": 2.1993, + "step": 822 + }, + { + "epoch": 0.3549708863489325, + "grad_norm": 0.15881919860839844, + "learning_rate": 0.0001454197068972729, + "loss": 2.2004, + "step": 823 + }, + { + "epoch": 0.3554021996980807, + "grad_norm": 0.16273415088653564, + "learning_rate": 0.00014540795584194505, + "loss": 2.0913, + "step": 824 + }, + { + "epoch": 0.3558335130472288, + "grad_norm": 0.17026178538799286, + "learning_rate": 0.00014539619020787994, + "loss": 2.2855, + "step": 825 + }, + { + "epoch": 0.3558335130472288, + "eval_loss": 2.1297106742858887, + "eval_runtime": 204.6653, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 825 + }, + { + "epoch": 0.35626482639637697, + "grad_norm": 0.1643732637166977, + "learning_rate": 0.00014538440999751383, + "loss": 2.1876, + "step": 826 + }, + { + "epoch": 0.3566961397455251, + "grad_norm": 0.16995814442634583, + "learning_rate": 0.00014537261521328592, + "loss": 2.1945, + "step": 827 + }, + { + "epoch": 0.35712745309467325, + "grad_norm": 0.19104905426502228, + "learning_rate": 0.00014536080585763844, + "loss": 2.0627, + "step": 828 + }, + { + "epoch": 0.35755876644382145, + "grad_norm": 0.18704453110694885, + "learning_rate": 0.00014534898193301665, + "loss": 2.2512, + "step": 829 + }, + { + "epoch": 0.3579900797929696, + "grad_norm": 0.17079386115074158, + "learning_rate": 0.00014533714344186884, + "loss": 2.1159, + "step": 830 + }, + { + "epoch": 0.35842139314211774, + "grad_norm": 0.1565588116645813, + "learning_rate": 0.00014532529038664626, + "loss": 2.2767, + "step": 831 + }, + { + "epoch": 0.3588527064912659, + "grad_norm": 1.9668205976486206, + "learning_rate": 0.00014531342276980327, + "loss": 2.2867, + "step": 832 + }, + { + "epoch": 0.3592840198404141, + "grad_norm": 0.14429862797260284, + "learning_rate": 0.00014530154059379713, + "loss": 2.1932, + "step": 833 + }, + { + "epoch": 0.3597153331895622, + "grad_norm": 0.1885858178138733, + "learning_rate": 0.00014528964386108825, + "loss": 2.3821, + "step": 834 + }, + { + "epoch": 0.36014664653871037, + "grad_norm": 0.16591544449329376, + "learning_rate": 0.00014527773257413988, + "loss": 2.3467, + "step": 835 + }, + { + "epoch": 0.3605779598878585, + "grad_norm": 0.17843496799468994, + "learning_rate": 0.00014526580673541848, + "loss": 2.2347, + "step": 836 + }, + { + "epoch": 0.3610092732370067, + "grad_norm": 1.2142467498779297, + "learning_rate": 0.00014525386634739335, + "loss": 2.0654, + "step": 837 + }, + { + "epoch": 0.36144058658615486, + "grad_norm": 0.18093134462833405, + "learning_rate": 0.0001452419114125369, + "loss": 2.1252, + "step": 838 + }, + { + "epoch": 0.361871899935303, + "grad_norm": 0.17511476576328278, + "learning_rate": 0.0001452299419333246, + "loss": 2.2497, + "step": 839 + }, + { + "epoch": 0.36230321328445114, + "grad_norm": 0.18539828062057495, + "learning_rate": 0.00014521795791223475, + "loss": 2.3388, + "step": 840 + }, + { + "epoch": 0.3627345266335993, + "grad_norm": 0.22852657735347748, + "learning_rate": 0.00014520595935174887, + "loss": 2.4679, + "step": 841 + }, + { + "epoch": 0.3631658399827475, + "grad_norm": 0.18394552171230316, + "learning_rate": 0.0001451939462543513, + "loss": 2.1722, + "step": 842 + }, + { + "epoch": 0.36359715333189563, + "grad_norm": 0.1762983798980713, + "learning_rate": 0.00014518191862252953, + "loss": 1.9698, + "step": 843 + }, + { + "epoch": 0.36402846668104377, + "grad_norm": 0.2538037598133087, + "learning_rate": 0.00014516987645877403, + "loss": 2.2843, + "step": 844 + }, + { + "epoch": 0.3644597800301919, + "grad_norm": 0.1793690174818039, + "learning_rate": 0.00014515781976557826, + "loss": 2.2725, + "step": 845 + }, + { + "epoch": 0.3648910933793401, + "grad_norm": 0.17635536193847656, + "learning_rate": 0.00014514574854543867, + "loss": 2.2042, + "step": 846 + }, + { + "epoch": 0.36532240672848826, + "grad_norm": 0.1704099327325821, + "learning_rate": 0.0001451336628008547, + "loss": 2.1619, + "step": 847 + }, + { + "epoch": 0.3657537200776364, + "grad_norm": 0.1625613272190094, + "learning_rate": 0.00014512156253432891, + "loss": 2.2149, + "step": 848 + }, + { + "epoch": 0.36618503342678455, + "grad_norm": 0.1626460701227188, + "learning_rate": 0.00014510944774836677, + "loss": 2.2051, + "step": 849 + }, + { + "epoch": 0.3666163467759327, + "grad_norm": 0.16857227683067322, + "learning_rate": 0.00014509731844547676, + "loss": 2.1927, + "step": 850 + }, + { + "epoch": 0.3666163467759327, + "eval_loss": 2.133108139038086, + "eval_runtime": 204.1873, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 850 + }, + { + "epoch": 0.3670476601250809, + "grad_norm": 0.18687738478183746, + "learning_rate": 0.0001450851746281704, + "loss": 2.2809, + "step": 851 + }, + { + "epoch": 0.36747897347422903, + "grad_norm": 0.19265635311603546, + "learning_rate": 0.0001450730162989622, + "loss": 2.3029, + "step": 852 + }, + { + "epoch": 0.3679102868233772, + "grad_norm": 0.16877678036689758, + "learning_rate": 0.00014506084346036963, + "loss": 2.2134, + "step": 853 + }, + { + "epoch": 0.3683416001725253, + "grad_norm": 0.17525163292884827, + "learning_rate": 0.0001450486561149133, + "loss": 2.1285, + "step": 854 + }, + { + "epoch": 0.3687729135216735, + "grad_norm": 0.1960926353931427, + "learning_rate": 0.00014503645426511665, + "loss": 2.2813, + "step": 855 + }, + { + "epoch": 0.36920422687082166, + "grad_norm": 0.1757328361272812, + "learning_rate": 0.00014502423791350625, + "loss": 2.2732, + "step": 856 + }, + { + "epoch": 0.3696355402199698, + "grad_norm": 0.1861843466758728, + "learning_rate": 0.00014501200706261162, + "loss": 2.1508, + "step": 857 + }, + { + "epoch": 0.37006685356911795, + "grad_norm": 0.19854430854320526, + "learning_rate": 0.00014499976171496528, + "loss": 2.2179, + "step": 858 + }, + { + "epoch": 0.37049816691826615, + "grad_norm": 0.16625559329986572, + "learning_rate": 0.00014498750187310277, + "loss": 1.99, + "step": 859 + }, + { + "epoch": 0.3709294802674143, + "grad_norm": 0.15606555342674255, + "learning_rate": 0.00014497522753956264, + "loss": 2.3534, + "step": 860 + }, + { + "epoch": 0.37136079361656243, + "grad_norm": 0.17310819029808044, + "learning_rate": 0.00014496293871688642, + "loss": 2.1748, + "step": 861 + }, + { + "epoch": 0.3717921069657106, + "grad_norm": 0.17876191437244415, + "learning_rate": 0.0001449506354076186, + "loss": 2.2285, + "step": 862 + }, + { + "epoch": 0.3722234203148587, + "grad_norm": 0.18471559882164001, + "learning_rate": 0.0001449383176143068, + "loss": 2.2541, + "step": 863 + }, + { + "epoch": 0.3726547336640069, + "grad_norm": 0.171518474817276, + "learning_rate": 0.00014492598533950146, + "loss": 2.357, + "step": 864 + }, + { + "epoch": 0.37308604701315506, + "grad_norm": 0.1647776961326599, + "learning_rate": 0.0001449136385857562, + "loss": 2.2007, + "step": 865 + }, + { + "epoch": 0.3735173603623032, + "grad_norm": 0.1820192039012909, + "learning_rate": 0.0001449012773556275, + "loss": 2.2049, + "step": 866 + }, + { + "epoch": 0.37394867371145135, + "grad_norm": 0.16099920868873596, + "learning_rate": 0.00014488890165167487, + "loss": 2.258, + "step": 867 + }, + { + "epoch": 0.37437998706059955, + "grad_norm": 0.1925518810749054, + "learning_rate": 0.00014487651147646088, + "loss": 2.3798, + "step": 868 + }, + { + "epoch": 0.3748113004097477, + "grad_norm": 0.17227499186992645, + "learning_rate": 0.00014486410683255103, + "loss": 2.3747, + "step": 869 + }, + { + "epoch": 0.37524261375889584, + "grad_norm": 0.16866856813430786, + "learning_rate": 0.00014485168772251382, + "loss": 2.2091, + "step": 870 + }, + { + "epoch": 0.375673927108044, + "grad_norm": 0.16172897815704346, + "learning_rate": 0.00014483925414892078, + "loss": 2.3496, + "step": 871 + }, + { + "epoch": 0.3761052404571921, + "grad_norm": 0.23066756129264832, + "learning_rate": 0.00014482680611434644, + "loss": 1.9826, + "step": 872 + }, + { + "epoch": 0.3765365538063403, + "grad_norm": 0.18224367499351501, + "learning_rate": 0.00014481434362136828, + "loss": 2.2193, + "step": 873 + }, + { + "epoch": 0.37696786715548847, + "grad_norm": 1.6786094903945923, + "learning_rate": 0.0001448018666725668, + "loss": 2.1686, + "step": 874 + }, + { + "epoch": 0.3773991805046366, + "grad_norm": 0.1985001266002655, + "learning_rate": 0.00014478937527052547, + "loss": 2.2994, + "step": 875 + }, + { + "epoch": 0.3773991805046366, + "eval_loss": 2.131563425064087, + "eval_runtime": 198.5072, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 875 + }, + { + "epoch": 0.37783049385378475, + "grad_norm": 0.19998972117900848, + "learning_rate": 0.0001447768694178308, + "loss": 2.3498, + "step": 876 + }, + { + "epoch": 0.37826180720293295, + "grad_norm": 0.3351645767688751, + "learning_rate": 0.00014476434911707225, + "loss": 2.0949, + "step": 877 + }, + { + "epoch": 0.3786931205520811, + "grad_norm": 0.19838584959506989, + "learning_rate": 0.00014475181437084228, + "loss": 2.3821, + "step": 878 + }, + { + "epoch": 0.37912443390122924, + "grad_norm": 0.17798852920532227, + "learning_rate": 0.00014473926518173636, + "loss": 2.3823, + "step": 879 + }, + { + "epoch": 0.3795557472503774, + "grad_norm": 0.26538652181625366, + "learning_rate": 0.0001447267015523529, + "loss": 2.2315, + "step": 880 + }, + { + "epoch": 0.37998706059952553, + "grad_norm": 0.20238934457302094, + "learning_rate": 0.0001447141234852934, + "loss": 2.3112, + "step": 881 + }, + { + "epoch": 0.3804183739486737, + "grad_norm": 0.16403162479400635, + "learning_rate": 0.00014470153098316226, + "loss": 1.8282, + "step": 882 + }, + { + "epoch": 0.38084968729782187, + "grad_norm": 0.17718282341957092, + "learning_rate": 0.00014468892404856685, + "loss": 2.0251, + "step": 883 + }, + { + "epoch": 0.38128100064697, + "grad_norm": 0.21754078567028046, + "learning_rate": 0.00014467630268411762, + "loss": 2.1585, + "step": 884 + }, + { + "epoch": 0.38171231399611816, + "grad_norm": 0.21408778429031372, + "learning_rate": 0.00014466366689242795, + "loss": 1.9296, + "step": 885 + }, + { + "epoch": 0.38214362734526636, + "grad_norm": 0.1843462884426117, + "learning_rate": 0.00014465101667611425, + "loss": 2.1953, + "step": 886 + }, + { + "epoch": 0.3825749406944145, + "grad_norm": 0.21462063491344452, + "learning_rate": 0.00014463835203779583, + "loss": 2.1426, + "step": 887 + }, + { + "epoch": 0.38300625404356264, + "grad_norm": 0.20921021699905396, + "learning_rate": 0.00014462567298009508, + "loss": 2.3491, + "step": 888 + }, + { + "epoch": 0.3834375673927108, + "grad_norm": 0.19191640615463257, + "learning_rate": 0.0001446129795056373, + "loss": 2.1436, + "step": 889 + }, + { + "epoch": 0.383868880741859, + "grad_norm": 0.1827997863292694, + "learning_rate": 0.00014460027161705086, + "loss": 2.2466, + "step": 890 + }, + { + "epoch": 0.38430019409100713, + "grad_norm": 0.16525106132030487, + "learning_rate": 0.00014458754931696705, + "loss": 2.2714, + "step": 891 + }, + { + "epoch": 0.3847315074401553, + "grad_norm": 0.1852799654006958, + "learning_rate": 0.00014457481260802014, + "loss": 2.0365, + "step": 892 + }, + { + "epoch": 0.3851628207893034, + "grad_norm": 0.1985684037208557, + "learning_rate": 0.00014456206149284745, + "loss": 2.0933, + "step": 893 + }, + { + "epoch": 0.38559413413845156, + "grad_norm": 0.19180408120155334, + "learning_rate": 0.00014454929597408918, + "loss": 2.2093, + "step": 894 + }, + { + "epoch": 0.38602544748759976, + "grad_norm": 0.17508214712142944, + "learning_rate": 0.0001445365160543886, + "loss": 2.2649, + "step": 895 + }, + { + "epoch": 0.3864567608367479, + "grad_norm": 0.1650542914867401, + "learning_rate": 0.00014452372173639195, + "loss": 2.3021, + "step": 896 + }, + { + "epoch": 0.38688807418589605, + "grad_norm": 0.37150564789772034, + "learning_rate": 0.00014451091302274843, + "loss": 2.0946, + "step": 897 + }, + { + "epoch": 0.3873193875350442, + "grad_norm": 0.17382101714611053, + "learning_rate": 0.00014449808991611019, + "loss": 2.2065, + "step": 898 + }, + { + "epoch": 0.3877507008841924, + "grad_norm": 0.16627924144268036, + "learning_rate": 0.0001444852524191324, + "loss": 2.244, + "step": 899 + }, + { + "epoch": 0.38818201423334053, + "grad_norm": 0.17999930679798126, + "learning_rate": 0.00014447240053447327, + "loss": 2.2263, + "step": 900 + }, + { + "epoch": 0.38818201423334053, + "eval_loss": 2.128645896911621, + "eval_runtime": 196.0098, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 900 + }, + { + "epoch": 0.3886133275824887, + "grad_norm": 3.2199740409851074, + "learning_rate": 0.00014445953426479383, + "loss": 2.1536, + "step": 901 + }, + { + "epoch": 0.3890446409316368, + "grad_norm": 0.19231465458869934, + "learning_rate": 0.00014444665361275822, + "loss": 1.9445, + "step": 902 + }, + { + "epoch": 0.38947595428078496, + "grad_norm": 0.21291469037532806, + "learning_rate": 0.00014443375858103354, + "loss": 2.267, + "step": 903 + }, + { + "epoch": 0.38990726762993316, + "grad_norm": 0.1833806037902832, + "learning_rate": 0.00014442084917228983, + "loss": 2.2679, + "step": 904 + }, + { + "epoch": 0.3903385809790813, + "grad_norm": 0.16631187498569489, + "learning_rate": 0.00014440792538920012, + "loss": 2.1788, + "step": 905 + }, + { + "epoch": 0.39076989432822945, + "grad_norm": 0.2123652994632721, + "learning_rate": 0.00014439498723444044, + "loss": 2.3326, + "step": 906 + }, + { + "epoch": 0.3912012076773776, + "grad_norm": 0.18742474913597107, + "learning_rate": 0.00014438203471068975, + "loss": 2.2317, + "step": 907 + }, + { + "epoch": 0.3916325210265258, + "grad_norm": 0.1959163248538971, + "learning_rate": 0.00014436906782063, + "loss": 2.1793, + "step": 908 + }, + { + "epoch": 0.39206383437567394, + "grad_norm": 0.176411971449852, + "learning_rate": 0.00014435608656694618, + "loss": 2.2729, + "step": 909 + }, + { + "epoch": 0.3924951477248221, + "grad_norm": 0.1642094999551773, + "learning_rate": 0.00014434309095232617, + "loss": 2.1317, + "step": 910 + }, + { + "epoch": 0.3929264610739702, + "grad_norm": 0.19887696206569672, + "learning_rate": 0.00014433008097946084, + "loss": 2.2663, + "step": 911 + }, + { + "epoch": 0.3933577744231184, + "grad_norm": 0.18333375453948975, + "learning_rate": 0.00014431705665104408, + "loss": 2.3066, + "step": 912 + }, + { + "epoch": 0.39378908777226657, + "grad_norm": 0.17824384570121765, + "learning_rate": 0.0001443040179697727, + "loss": 2.2588, + "step": 913 + }, + { + "epoch": 0.3942204011214147, + "grad_norm": 0.1979813426733017, + "learning_rate": 0.0001442909649383465, + "loss": 2.1181, + "step": 914 + }, + { + "epoch": 0.39465171447056285, + "grad_norm": 0.19859793782234192, + "learning_rate": 0.00014427789755946824, + "loss": 2.3726, + "step": 915 + }, + { + "epoch": 0.395083027819711, + "grad_norm": 0.14289377629756927, + "learning_rate": 0.0001442648158358437, + "loss": 2.0499, + "step": 916 + }, + { + "epoch": 0.3955143411688592, + "grad_norm": 0.1597006618976593, + "learning_rate": 0.00014425171977018158, + "loss": 2.2563, + "step": 917 + }, + { + "epoch": 0.39594565451800734, + "grad_norm": 0.1751205325126648, + "learning_rate": 0.00014423860936519354, + "loss": 2.2895, + "step": 918 + }, + { + "epoch": 0.3963769678671555, + "grad_norm": 0.21087686717510223, + "learning_rate": 0.00014422548462359423, + "loss": 2.2271, + "step": 919 + }, + { + "epoch": 0.3968082812163036, + "grad_norm": 0.17048583924770355, + "learning_rate": 0.00014421234554810134, + "loss": 2.2043, + "step": 920 + }, + { + "epoch": 0.3972395945654518, + "grad_norm": 0.1800399273633957, + "learning_rate": 0.00014419919214143538, + "loss": 2.2716, + "step": 921 + }, + { + "epoch": 0.39767090791459997, + "grad_norm": 0.17856170237064362, + "learning_rate": 0.00014418602440631996, + "loss": 2.3445, + "step": 922 + }, + { + "epoch": 0.3981022212637481, + "grad_norm": 0.15118065476417542, + "learning_rate": 0.0001441728423454816, + "loss": 2.1092, + "step": 923 + }, + { + "epoch": 0.39853353461289626, + "grad_norm": 0.18618765473365784, + "learning_rate": 0.0001441596459616497, + "loss": 2.3339, + "step": 924 + }, + { + "epoch": 0.3989648479620444, + "grad_norm": 0.17451119422912598, + "learning_rate": 0.00014414643525755683, + "loss": 2.2336, + "step": 925 + }, + { + "epoch": 0.3989648479620444, + "eval_loss": 2.128000259399414, + "eval_runtime": 197.4817, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 925 + }, + { + "epoch": 0.3993961613111926, + "grad_norm": 0.168556809425354, + "learning_rate": 0.00014413321023593837, + "loss": 2.4635, + "step": 926 + }, + { + "epoch": 0.39982747466034074, + "grad_norm": 0.1529935747385025, + "learning_rate": 0.00014411997089953268, + "loss": 2.0522, + "step": 927 + }, + { + "epoch": 0.4002587880094889, + "grad_norm": 0.15923142433166504, + "learning_rate": 0.00014410671725108117, + "loss": 2.3012, + "step": 928 + }, + { + "epoch": 0.40069010135863703, + "grad_norm": 0.17462576925754547, + "learning_rate": 0.00014409344929332808, + "loss": 2.2029, + "step": 929 + }, + { + "epoch": 0.40112141470778523, + "grad_norm": 0.17241305112838745, + "learning_rate": 0.00014408016702902073, + "loss": 2.379, + "step": 930 + }, + { + "epoch": 0.4015527280569334, + "grad_norm": 0.15656477212905884, + "learning_rate": 0.00014406687046090934, + "loss": 2.2175, + "step": 931 + }, + { + "epoch": 0.4019840414060815, + "grad_norm": 0.1552082747220993, + "learning_rate": 0.00014405355959174712, + "loss": 2.106, + "step": 932 + }, + { + "epoch": 0.40241535475522966, + "grad_norm": 0.18489037454128265, + "learning_rate": 0.00014404023442429027, + "loss": 2.2944, + "step": 933 + }, + { + "epoch": 0.4028466681043778, + "grad_norm": 0.1694725602865219, + "learning_rate": 0.0001440268949612978, + "loss": 2.014, + "step": 934 + }, + { + "epoch": 0.403277981453526, + "grad_norm": 0.1695718765258789, + "learning_rate": 0.00014401354120553193, + "loss": 2.1954, + "step": 935 + }, + { + "epoch": 0.40370929480267415, + "grad_norm": 0.16405755281448364, + "learning_rate": 0.00014400017315975761, + "loss": 2.3878, + "step": 936 + }, + { + "epoch": 0.4041406081518223, + "grad_norm": 0.17652428150177002, + "learning_rate": 0.00014398679082674288, + "loss": 2.31, + "step": 937 + }, + { + "epoch": 0.40457192150097043, + "grad_norm": 0.1729700267314911, + "learning_rate": 0.00014397339420925865, + "loss": 2.1835, + "step": 938 + }, + { + "epoch": 0.40500323485011863, + "grad_norm": 0.1890537589788437, + "learning_rate": 0.00014395998331007888, + "loss": 2.2332, + "step": 939 + }, + { + "epoch": 0.4054345481992668, + "grad_norm": 0.1828519105911255, + "learning_rate": 0.00014394655813198043, + "loss": 2.1929, + "step": 940 + }, + { + "epoch": 0.4058658615484149, + "grad_norm": 0.16532336175441742, + "learning_rate": 0.00014393311867774315, + "loss": 2.2461, + "step": 941 + }, + { + "epoch": 0.40629717489756306, + "grad_norm": 3.688812494277954, + "learning_rate": 0.0001439196649501498, + "loss": 2.1308, + "step": 942 + }, + { + "epoch": 0.40672848824671126, + "grad_norm": 0.1970452517271042, + "learning_rate": 0.00014390619695198612, + "loss": 2.3738, + "step": 943 + }, + { + "epoch": 0.4071598015958594, + "grad_norm": 0.1757008284330368, + "learning_rate": 0.0001438927146860408, + "loss": 2.3448, + "step": 944 + }, + { + "epoch": 0.40759111494500755, + "grad_norm": 0.1817890703678131, + "learning_rate": 0.00014387921815510555, + "loss": 2.1039, + "step": 945 + }, + { + "epoch": 0.4080224282941557, + "grad_norm": 0.21462790668010712, + "learning_rate": 0.0001438657073619749, + "loss": 2.2177, + "step": 946 + }, + { + "epoch": 0.40845374164330384, + "grad_norm": 0.18851101398468018, + "learning_rate": 0.00014385218230944643, + "loss": 2.1774, + "step": 947 + }, + { + "epoch": 0.40888505499245204, + "grad_norm": 0.18490348756313324, + "learning_rate": 0.00014383864300032069, + "loss": 2.3663, + "step": 948 + }, + { + "epoch": 0.4093163683416002, + "grad_norm": 0.1923254281282425, + "learning_rate": 0.00014382508943740107, + "loss": 2.3492, + "step": 949 + }, + { + "epoch": 0.4097476816907483, + "grad_norm": 0.16575244069099426, + "learning_rate": 0.00014381152162349406, + "loss": 2.2695, + "step": 950 + }, + { + "epoch": 0.4097476816907483, + "eval_loss": 2.1288790702819824, + "eval_runtime": 196.1803, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 950 + }, + { + "epoch": 0.41017899503989647, + "grad_norm": 0.1713244915008545, + "learning_rate": 0.00014379793956140899, + "loss": 2.2441, + "step": 951 + }, + { + "epoch": 0.41061030838904466, + "grad_norm": 0.15283671021461487, + "learning_rate": 0.00014378434325395815, + "loss": 2.2407, + "step": 952 + }, + { + "epoch": 0.4110416217381928, + "grad_norm": 0.2225114405155182, + "learning_rate": 0.00014377073270395682, + "loss": 2.2592, + "step": 953 + }, + { + "epoch": 0.41147293508734095, + "grad_norm": 0.1541251540184021, + "learning_rate": 0.00014375710791422324, + "loss": 2.0837, + "step": 954 + }, + { + "epoch": 0.4119042484364891, + "grad_norm": 0.1644444763660431, + "learning_rate": 0.00014374346888757853, + "loss": 2.2987, + "step": 955 + }, + { + "epoch": 0.41233556178563724, + "grad_norm": 0.16322757303714752, + "learning_rate": 0.00014372981562684684, + "loss": 2.2194, + "step": 956 + }, + { + "epoch": 0.41276687513478544, + "grad_norm": 0.22558414936065674, + "learning_rate": 0.0001437161481348552, + "loss": 2.264, + "step": 957 + }, + { + "epoch": 0.4131981884839336, + "grad_norm": 0.20409363508224487, + "learning_rate": 0.00014370246641443363, + "loss": 2.5098, + "step": 958 + }, + { + "epoch": 0.4136295018330817, + "grad_norm": 0.17902424931526184, + "learning_rate": 0.00014368877046841506, + "loss": 2.3053, + "step": 959 + }, + { + "epoch": 0.41406081518222987, + "grad_norm": 0.16873684525489807, + "learning_rate": 0.0001436750602996354, + "loss": 2.2177, + "step": 960 + }, + { + "epoch": 0.41449212853137807, + "grad_norm": 0.1802307814359665, + "learning_rate": 0.0001436613359109335, + "loss": 2.2051, + "step": 961 + }, + { + "epoch": 0.4149234418805262, + "grad_norm": 0.2217787653207779, + "learning_rate": 0.00014364759730515112, + "loss": 2.4363, + "step": 962 + }, + { + "epoch": 0.41535475522967436, + "grad_norm": 0.17444701492786407, + "learning_rate": 0.000143633844485133, + "loss": 2.2822, + "step": 963 + }, + { + "epoch": 0.4157860685788225, + "grad_norm": 0.14976367354393005, + "learning_rate": 0.00014362007745372682, + "loss": 2.2694, + "step": 964 + }, + { + "epoch": 0.4162173819279707, + "grad_norm": 0.17860938608646393, + "learning_rate": 0.00014360629621378316, + "loss": 2.291, + "step": 965 + }, + { + "epoch": 0.41664869527711884, + "grad_norm": 0.18789131939411163, + "learning_rate": 0.00014359250076815565, + "loss": 2.2155, + "step": 966 + }, + { + "epoch": 0.417080008626267, + "grad_norm": 0.16874562203884125, + "learning_rate": 0.00014357869111970072, + "loss": 2.1584, + "step": 967 + }, + { + "epoch": 0.41751132197541513, + "grad_norm": 0.14934919774532318, + "learning_rate": 0.00014356486727127782, + "loss": 2.1014, + "step": 968 + }, + { + "epoch": 0.41794263532456327, + "grad_norm": 0.17155322432518005, + "learning_rate": 0.00014355102922574935, + "loss": 2.3703, + "step": 969 + }, + { + "epoch": 0.41837394867371147, + "grad_norm": 0.16445334255695343, + "learning_rate": 0.0001435371769859806, + "loss": 2.1432, + "step": 970 + }, + { + "epoch": 0.4188052620228596, + "grad_norm": 0.15587712824344635, + "learning_rate": 0.00014352331055483987, + "loss": 2.2831, + "step": 971 + }, + { + "epoch": 0.41923657537200776, + "grad_norm": 0.17527459561824799, + "learning_rate": 0.0001435094299351983, + "loss": 2.0739, + "step": 972 + }, + { + "epoch": 0.4196678887211559, + "grad_norm": 0.15222637355327606, + "learning_rate": 0.00014349553512993008, + "loss": 2.152, + "step": 973 + }, + { + "epoch": 0.4200992020703041, + "grad_norm": 0.18465347588062286, + "learning_rate": 0.00014348162614191224, + "loss": 2.3775, + "step": 974 + }, + { + "epoch": 0.42053051541945224, + "grad_norm": 0.1829536408185959, + "learning_rate": 0.00014346770297402482, + "loss": 2.2017, + "step": 975 + }, + { + "epoch": 0.42053051541945224, + "eval_loss": 2.1273937225341797, + "eval_runtime": 199.8206, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 975 + }, + { + "epoch": 0.4209618287686004, + "grad_norm": 0.18385931849479675, + "learning_rate": 0.00014345376562915075, + "loss": 2.3043, + "step": 976 + }, + { + "epoch": 0.42139314211774853, + "grad_norm": 0.16265417635440826, + "learning_rate": 0.0001434398141101759, + "loss": 2.2154, + "step": 977 + }, + { + "epoch": 0.4218244554668967, + "grad_norm": 0.1492079794406891, + "learning_rate": 0.0001434258484199891, + "loss": 2.2189, + "step": 978 + }, + { + "epoch": 0.4222557688160449, + "grad_norm": 0.16092488169670105, + "learning_rate": 0.0001434118685614821, + "loss": 2.2781, + "step": 979 + }, + { + "epoch": 0.422687082165193, + "grad_norm": 0.16646727919578552, + "learning_rate": 0.00014339787453754955, + "loss": 2.1738, + "step": 980 + }, + { + "epoch": 0.42311839551434116, + "grad_norm": 0.18572886288166046, + "learning_rate": 0.00014338386635108911, + "loss": 2.1226, + "step": 981 + }, + { + "epoch": 0.4235497088634893, + "grad_norm": 0.17068977653980255, + "learning_rate": 0.0001433698440050013, + "loss": 2.3263, + "step": 982 + }, + { + "epoch": 0.4239810222126375, + "grad_norm": 0.15894320607185364, + "learning_rate": 0.0001433558075021896, + "loss": 2.2735, + "step": 983 + }, + { + "epoch": 0.42441233556178565, + "grad_norm": 0.16953733563423157, + "learning_rate": 0.00014334175684556046, + "loss": 2.3021, + "step": 984 + }, + { + "epoch": 0.4248436489109338, + "grad_norm": 0.16865341365337372, + "learning_rate": 0.00014332769203802316, + "loss": 1.8232, + "step": 985 + }, + { + "epoch": 0.42527496226008193, + "grad_norm": 0.17496295273303986, + "learning_rate": 0.00014331361308249002, + "loss": 2.0722, + "step": 986 + }, + { + "epoch": 0.4257062756092301, + "grad_norm": 0.17187760770320892, + "learning_rate": 0.00014329951998187624, + "loss": 2.1005, + "step": 987 + }, + { + "epoch": 0.4261375889583783, + "grad_norm": 0.17174141108989716, + "learning_rate": 0.00014328541273909992, + "loss": 2.3126, + "step": 988 + }, + { + "epoch": 0.4265689023075264, + "grad_norm": 0.16444823145866394, + "learning_rate": 0.00014327129135708214, + "loss": 2.2856, + "step": 989 + }, + { + "epoch": 0.42700021565667456, + "grad_norm": 0.20330820977687836, + "learning_rate": 0.00014325715583874687, + "loss": 2.0886, + "step": 990 + }, + { + "epoch": 0.4274315290058227, + "grad_norm": 0.1728164255619049, + "learning_rate": 0.00014324300618702104, + "loss": 2.0495, + "step": 991 + }, + { + "epoch": 0.4278628423549709, + "grad_norm": 0.17656084895133972, + "learning_rate": 0.0001432288424048345, + "loss": 2.3372, + "step": 992 + }, + { + "epoch": 0.42829415570411905, + "grad_norm": 0.17885622382164001, + "learning_rate": 0.00014321466449512, + "loss": 2.2132, + "step": 993 + }, + { + "epoch": 0.4287254690532672, + "grad_norm": 0.17777030169963837, + "learning_rate": 0.00014320047246081321, + "loss": 2.4089, + "step": 994 + }, + { + "epoch": 0.42915678240241534, + "grad_norm": 0.17604289948940277, + "learning_rate": 0.00014318626630485277, + "loss": 2.1123, + "step": 995 + }, + { + "epoch": 0.42958809575156354, + "grad_norm": 0.15696115791797638, + "learning_rate": 0.00014317204603018024, + "loss": 2.1366, + "step": 996 + }, + { + "epoch": 0.4300194091007117, + "grad_norm": 0.14866608381271362, + "learning_rate": 0.00014315781163974008, + "loss": 2.2554, + "step": 997 + }, + { + "epoch": 0.4304507224498598, + "grad_norm": 0.15694919228553772, + "learning_rate": 0.00014314356313647962, + "loss": 2.2106, + "step": 998 + }, + { + "epoch": 0.43088203579900797, + "grad_norm": 0.1933506727218628, + "learning_rate": 0.00014312930052334924, + "loss": 2.1608, + "step": 999 + }, + { + "epoch": 0.4313133491481561, + "grad_norm": 0.16768963634967804, + "learning_rate": 0.0001431150238033021, + "loss": 1.8865, + "step": 1000 + }, + { + "epoch": 0.4313133491481561, + "eval_loss": 2.1255507469177246, + "eval_runtime": 200.2665, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 1000 + }, + { + "epoch": 0.4317446624973043, + "grad_norm": 0.17734010517597198, + "learning_rate": 0.00014310073297929443, + "loss": 2.3392, + "step": 1001 + }, + { + "epoch": 0.43217597584645245, + "grad_norm": 0.16111144423484802, + "learning_rate": 0.00014308642805428525, + "loss": 2.1516, + "step": 1002 + }, + { + "epoch": 0.4326072891956006, + "grad_norm": 0.16525249183177948, + "learning_rate": 0.00014307210903123654, + "loss": 2.1927, + "step": 1003 + }, + { + "epoch": 0.43303860254474874, + "grad_norm": 0.20979925990104675, + "learning_rate": 0.00014305777591311323, + "loss": 2.3105, + "step": 1004 + }, + { + "epoch": 0.43346991589389694, + "grad_norm": 0.15769192576408386, + "learning_rate": 0.00014304342870288318, + "loss": 2.2937, + "step": 1005 + }, + { + "epoch": 0.4339012292430451, + "grad_norm": 0.1562070995569229, + "learning_rate": 0.00014302906740351707, + "loss": 2.2987, + "step": 1006 + }, + { + "epoch": 0.4343325425921932, + "grad_norm": 0.17766329646110535, + "learning_rate": 0.00014301469201798863, + "loss": 2.2952, + "step": 1007 + }, + { + "epoch": 0.43476385594134137, + "grad_norm": 0.2226317971944809, + "learning_rate": 0.0001430003025492744, + "loss": 2.3549, + "step": 1008 + }, + { + "epoch": 0.4351951692904895, + "grad_norm": 0.19673429429531097, + "learning_rate": 0.00014298589900035388, + "loss": 2.3077, + "step": 1009 + }, + { + "epoch": 0.4356264826396377, + "grad_norm": 0.16575779020786285, + "learning_rate": 0.0001429714813742095, + "loss": 2.3121, + "step": 1010 + }, + { + "epoch": 0.43605779598878586, + "grad_norm": 0.17023572325706482, + "learning_rate": 0.00014295704967382656, + "loss": 2.1495, + "step": 1011 + }, + { + "epoch": 0.436489109337934, + "grad_norm": 0.17324426770210266, + "learning_rate": 0.00014294260390219335, + "loss": 2.2113, + "step": 1012 + }, + { + "epoch": 0.43692042268708214, + "grad_norm": 0.1725987046957016, + "learning_rate": 0.00014292814406230097, + "loss": 2.172, + "step": 1013 + }, + { + "epoch": 0.43735173603623034, + "grad_norm": 0.16376245021820068, + "learning_rate": 0.00014291367015714353, + "loss": 2.1641, + "step": 1014 + }, + { + "epoch": 0.4377830493853785, + "grad_norm": 0.19950352609157562, + "learning_rate": 0.000142899182189718, + "loss": 2.2105, + "step": 1015 + }, + { + "epoch": 0.43821436273452663, + "grad_norm": 0.16930456459522247, + "learning_rate": 0.00014288468016302423, + "loss": 2.2622, + "step": 1016 + }, + { + "epoch": 0.4386456760836748, + "grad_norm": 0.1521873027086258, + "learning_rate": 0.0001428701640800651, + "loss": 2.2575, + "step": 1017 + }, + { + "epoch": 0.439076989432823, + "grad_norm": 0.1518697440624237, + "learning_rate": 0.00014285563394384623, + "loss": 2.162, + "step": 1018 + }, + { + "epoch": 0.4395083027819711, + "grad_norm": 0.1935448795557022, + "learning_rate": 0.00014284108975737636, + "loss": 2.3691, + "step": 1019 + }, + { + "epoch": 0.43993961613111926, + "grad_norm": 0.18544535338878632, + "learning_rate": 0.0001428265315236669, + "loss": 2.2723, + "step": 1020 + }, + { + "epoch": 0.4403709294802674, + "grad_norm": 0.18568457663059235, + "learning_rate": 0.00014281195924573236, + "loss": 2.3239, + "step": 1021 + }, + { + "epoch": 0.44080224282941555, + "grad_norm": 0.1548377275466919, + "learning_rate": 0.00014279737292659012, + "loss": 2.0073, + "step": 1022 + }, + { + "epoch": 0.44123355617856375, + "grad_norm": 0.2214689552783966, + "learning_rate": 0.00014278277256926037, + "loss": 2.3241, + "step": 1023 + }, + { + "epoch": 0.4416648695277119, + "grad_norm": 0.1754104644060135, + "learning_rate": 0.0001427681581767663, + "loss": 2.2573, + "step": 1024 + }, + { + "epoch": 0.44209618287686003, + "grad_norm": 0.15047457814216614, + "learning_rate": 0.00014275352975213397, + "loss": 2.1531, + "step": 1025 + }, + { + "epoch": 0.44209618287686003, + "eval_loss": 2.1244282722473145, + "eval_runtime": 200.1144, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 1025 + }, + { + "epoch": 0.4425274962260082, + "grad_norm": 0.1728946715593338, + "learning_rate": 0.00014273888729839237, + "loss": 2.3304, + "step": 1026 + }, + { + "epoch": 0.4429588095751564, + "grad_norm": 0.16061633825302124, + "learning_rate": 0.00014272423081857337, + "loss": 2.2651, + "step": 1027 + }, + { + "epoch": 0.4433901229243045, + "grad_norm": 0.1639309525489807, + "learning_rate": 0.00014270956031571178, + "loss": 1.9692, + "step": 1028 + }, + { + "epoch": 0.44382143627345266, + "grad_norm": 0.18484780192375183, + "learning_rate": 0.00014269487579284524, + "loss": 2.4299, + "step": 1029 + }, + { + "epoch": 0.4442527496226008, + "grad_norm": 0.17049303650856018, + "learning_rate": 0.00014268017725301439, + "loss": 2.1617, + "step": 1030 + }, + { + "epoch": 0.44468406297174895, + "grad_norm": 0.148739755153656, + "learning_rate": 0.0001426654646992627, + "loss": 2.2107, + "step": 1031 + }, + { + "epoch": 0.44511537632089715, + "grad_norm": 0.1590275764465332, + "learning_rate": 0.00014265073813463655, + "loss": 2.2585, + "step": 1032 + }, + { + "epoch": 0.4455466896700453, + "grad_norm": 0.20391826331615448, + "learning_rate": 0.00014263599756218526, + "loss": 2.3769, + "step": 1033 + }, + { + "epoch": 0.44597800301919344, + "grad_norm": 0.22355666756629944, + "learning_rate": 0.00014262124298496104, + "loss": 2.1261, + "step": 1034 + }, + { + "epoch": 0.4464093163683416, + "grad_norm": 0.1847553700208664, + "learning_rate": 0.00014260647440601895, + "loss": 2.2571, + "step": 1035 + }, + { + "epoch": 0.4468406297174898, + "grad_norm": 0.16822926700115204, + "learning_rate": 0.000142591691828417, + "loss": 2.0749, + "step": 1036 + }, + { + "epoch": 0.4472719430666379, + "grad_norm": 0.2041289359331131, + "learning_rate": 0.00014257689525521607, + "loss": 2.3264, + "step": 1037 + }, + { + "epoch": 0.44770325641578607, + "grad_norm": 0.20407123863697052, + "learning_rate": 0.00014256208468948, + "loss": 2.2982, + "step": 1038 + }, + { + "epoch": 0.4481345697649342, + "grad_norm": 0.18789510428905487, + "learning_rate": 0.00014254726013427545, + "loss": 2.1637, + "step": 1039 + }, + { + "epoch": 0.44856588311408235, + "grad_norm": 0.25421515107154846, + "learning_rate": 0.00014253242159267197, + "loss": 2.423, + "step": 1040 + }, + { + "epoch": 0.44899719646323055, + "grad_norm": 3.8317348957061768, + "learning_rate": 0.0001425175690677421, + "loss": 2.121, + "step": 1041 + }, + { + "epoch": 0.4494285098123787, + "grad_norm": 0.20088981091976166, + "learning_rate": 0.00014250270256256119, + "loss": 2.272, + "step": 1042 + }, + { + "epoch": 0.44985982316152684, + "grad_norm": 3.788172721862793, + "learning_rate": 0.00014248782208020754, + "loss": 2.293, + "step": 1043 + }, + { + "epoch": 0.450291136510675, + "grad_norm": 0.1680663675069809, + "learning_rate": 0.00014247292762376227, + "loss": 2.073, + "step": 1044 + }, + { + "epoch": 0.4507224498598232, + "grad_norm": 0.1550556868314743, + "learning_rate": 0.00014245801919630946, + "loss": 2.2315, + "step": 1045 + }, + { + "epoch": 0.4511537632089713, + "grad_norm": 0.17220419645309448, + "learning_rate": 0.00014244309680093607, + "loss": 2.3914, + "step": 1046 + }, + { + "epoch": 0.45158507655811947, + "grad_norm": 0.7895448207855225, + "learning_rate": 0.00014242816044073196, + "loss": 2.0289, + "step": 1047 + }, + { + "epoch": 0.4520163899072676, + "grad_norm": 0.16854602098464966, + "learning_rate": 0.00014241321011878983, + "loss": 2.2614, + "step": 1048 + }, + { + "epoch": 0.4524477032564158, + "grad_norm": 0.16735167801380157, + "learning_rate": 0.00014239824583820533, + "loss": 2.1894, + "step": 1049 + }, + { + "epoch": 0.45287901660556396, + "grad_norm": 0.1836962252855301, + "learning_rate": 0.00014238326760207697, + "loss": 2.2541, + "step": 1050 + }, + { + "epoch": 0.45287901660556396, + "eval_loss": 2.1253392696380615, + "eval_runtime": 199.256, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 1050 + }, + { + "epoch": 0.4533103299547121, + "grad_norm": 0.16606397926807404, + "learning_rate": 0.00014236827541350619, + "loss": 2.1566, + "step": 1051 + }, + { + "epoch": 0.45374164330386024, + "grad_norm": 0.15843674540519714, + "learning_rate": 0.00014235326927559725, + "loss": 2.1098, + "step": 1052 + }, + { + "epoch": 0.4541729566530084, + "grad_norm": 0.160728320479393, + "learning_rate": 0.0001423382491914573, + "loss": 2.0559, + "step": 1053 + }, + { + "epoch": 0.4546042700021566, + "grad_norm": 0.16055908799171448, + "learning_rate": 0.00014232321516419652, + "loss": 2.2103, + "step": 1054 + }, + { + "epoch": 0.45503558335130473, + "grad_norm": 0.16064168512821198, + "learning_rate": 0.00014230816719692775, + "loss": 2.3539, + "step": 1055 + }, + { + "epoch": 0.4554668967004529, + "grad_norm": 0.19479875266551971, + "learning_rate": 0.00014229310529276692, + "loss": 2.179, + "step": 1056 + }, + { + "epoch": 0.455898210049601, + "grad_norm": 0.15349072217941284, + "learning_rate": 0.00014227802945483272, + "loss": 2.0798, + "step": 1057 + }, + { + "epoch": 0.4563295233987492, + "grad_norm": 0.15733902156352997, + "learning_rate": 0.00014226293968624683, + "loss": 2.187, + "step": 1058 + }, + { + "epoch": 0.45676083674789736, + "grad_norm": 0.1859322190284729, + "learning_rate": 0.00014224783599013364, + "loss": 2.1991, + "step": 1059 + }, + { + "epoch": 0.4571921500970455, + "grad_norm": 0.22436240315437317, + "learning_rate": 0.00014223271836962063, + "loss": 2.127, + "step": 1060 + }, + { + "epoch": 0.45762346344619365, + "grad_norm": 0.16210895776748657, + "learning_rate": 0.00014221758682783804, + "loss": 2.2366, + "step": 1061 + }, + { + "epoch": 0.4580547767953418, + "grad_norm": 0.16493889689445496, + "learning_rate": 0.000142202441367919, + "loss": 2.1817, + "step": 1062 + }, + { + "epoch": 0.45848609014449, + "grad_norm": 0.16778267920017242, + "learning_rate": 0.0001421872819929996, + "loss": 2.2076, + "step": 1063 + }, + { + "epoch": 0.45891740349363813, + "grad_norm": 0.15672996640205383, + "learning_rate": 0.00014217210870621865, + "loss": 2.0867, + "step": 1064 + }, + { + "epoch": 0.4593487168427863, + "grad_norm": 0.18593546748161316, + "learning_rate": 0.00014215692151071805, + "loss": 2.0031, + "step": 1065 + }, + { + "epoch": 0.4597800301919344, + "grad_norm": 0.17268811166286469, + "learning_rate": 0.00014214172040964243, + "loss": 2.204, + "step": 1066 + }, + { + "epoch": 0.4602113435410826, + "grad_norm": 0.1761752963066101, + "learning_rate": 0.00014212650540613937, + "loss": 2.2918, + "step": 1067 + }, + { + "epoch": 0.46064265689023076, + "grad_norm": 0.18261955678462982, + "learning_rate": 0.00014211127650335927, + "loss": 2.1724, + "step": 1068 + }, + { + "epoch": 0.4610739702393789, + "grad_norm": 0.19669634103775024, + "learning_rate": 0.00014209603370445544, + "loss": 1.9778, + "step": 1069 + }, + { + "epoch": 0.46150528358852705, + "grad_norm": 0.17395718395709991, + "learning_rate": 0.0001420807770125841, + "loss": 2.1735, + "step": 1070 + }, + { + "epoch": 0.46193659693767525, + "grad_norm": 0.1703193187713623, + "learning_rate": 0.00014206550643090427, + "loss": 2.2564, + "step": 1071 + }, + { + "epoch": 0.4623679102868234, + "grad_norm": 0.4253257215023041, + "learning_rate": 0.00014205022196257794, + "loss": 2.2818, + "step": 1072 + }, + { + "epoch": 0.46279922363597154, + "grad_norm": 0.16150128841400146, + "learning_rate": 0.0001420349236107699, + "loss": 2.1974, + "step": 1073 + }, + { + "epoch": 0.4632305369851197, + "grad_norm": 0.16813993453979492, + "learning_rate": 0.00014201961137864783, + "loss": 2.0688, + "step": 1074 + }, + { + "epoch": 0.4636618503342678, + "grad_norm": 0.21386326849460602, + "learning_rate": 0.00014200428526938233, + "loss": 2.3743, + "step": 1075 + }, + { + "epoch": 0.4636618503342678, + "eval_loss": 2.1256213188171387, + "eval_runtime": 205.8902, + "eval_samples_per_second": 0.155, + "eval_steps_per_second": 0.155, + "step": 1075 + }, + { + "epoch": 0.464093163683416, + "grad_norm": 0.18708860874176025, + "learning_rate": 0.0001419889452861468, + "loss": 2.2642, + "step": 1076 + }, + { + "epoch": 0.46452447703256416, + "grad_norm": 0.18455776572227478, + "learning_rate": 0.0001419735914321176, + "loss": 2.2839, + "step": 1077 + }, + { + "epoch": 0.4649557903817123, + "grad_norm": 0.15820108354091644, + "learning_rate": 0.00014195822371047384, + "loss": 2.162, + "step": 1078 + }, + { + "epoch": 0.46538710373086045, + "grad_norm": 0.17975345253944397, + "learning_rate": 0.00014194284212439763, + "loss": 2.0403, + "step": 1079 + }, + { + "epoch": 0.46581841708000865, + "grad_norm": 0.18395720422267914, + "learning_rate": 0.0001419274466770739, + "loss": 2.1124, + "step": 1080 + }, + { + "epoch": 0.4662497304291568, + "grad_norm": 0.158233180642128, + "learning_rate": 0.00014191203737169044, + "loss": 1.8722, + "step": 1081 + }, + { + "epoch": 0.46668104377830494, + "grad_norm": 0.16250333189964294, + "learning_rate": 0.0001418966142114379, + "loss": 2.0023, + "step": 1082 + }, + { + "epoch": 0.4671123571274531, + "grad_norm": 0.16339930891990662, + "learning_rate": 0.0001418811771995098, + "loss": 2.2289, + "step": 1083 + }, + { + "epoch": 0.4675436704766012, + "grad_norm": 0.1458890587091446, + "learning_rate": 0.00014186572633910258, + "loss": 1.9933, + "step": 1084 + }, + { + "epoch": 0.4679749838257494, + "grad_norm": 0.16976583003997803, + "learning_rate": 0.00014185026163341546, + "loss": 2.0317, + "step": 1085 + }, + { + "epoch": 0.46840629717489757, + "grad_norm": 0.1867656409740448, + "learning_rate": 0.00014183478308565063, + "loss": 2.2477, + "step": 1086 + }, + { + "epoch": 0.4688376105240457, + "grad_norm": 0.16612374782562256, + "learning_rate": 0.00014181929069901307, + "loss": 2.2293, + "step": 1087 + }, + { + "epoch": 0.46926892387319386, + "grad_norm": 0.16322407126426697, + "learning_rate": 0.00014180378447671064, + "loss": 2.2751, + "step": 1088 + }, + { + "epoch": 0.46970023722234205, + "grad_norm": 0.20379450917243958, + "learning_rate": 0.0001417882644219541, + "loss": 2.1489, + "step": 1089 + }, + { + "epoch": 0.4701315505714902, + "grad_norm": 0.18934619426727295, + "learning_rate": 0.00014177273053795697, + "loss": 2.5549, + "step": 1090 + }, + { + "epoch": 0.47056286392063834, + "grad_norm": 0.17536357045173645, + "learning_rate": 0.00014175718282793585, + "loss": 2.2638, + "step": 1091 + }, + { + "epoch": 0.4709941772697865, + "grad_norm": 0.17303945124149323, + "learning_rate": 0.00014174162129510992, + "loss": 2.0555, + "step": 1092 + }, + { + "epoch": 0.47142549061893463, + "grad_norm": 0.16105377674102783, + "learning_rate": 0.00014172604594270144, + "loss": 2.2979, + "step": 1093 + }, + { + "epoch": 0.4718568039680828, + "grad_norm": 0.16773848235607147, + "learning_rate": 0.00014171045677393543, + "loss": 2.1319, + "step": 1094 + }, + { + "epoch": 0.47228811731723097, + "grad_norm": 0.16544897854328156, + "learning_rate": 0.00014169485379203985, + "loss": 2.2134, + "step": 1095 + }, + { + "epoch": 0.4727194306663791, + "grad_norm": 0.3179582953453064, + "learning_rate": 0.0001416792370002454, + "loss": 2.174, + "step": 1096 + }, + { + "epoch": 0.47315074401552726, + "grad_norm": 0.16636958718299866, + "learning_rate": 0.00014166360640178576, + "loss": 2.188, + "step": 1097 + }, + { + "epoch": 0.47358205736467546, + "grad_norm": 0.15930014848709106, + "learning_rate": 0.0001416479619998974, + "loss": 1.9799, + "step": 1098 + }, + { + "epoch": 0.4740133707138236, + "grad_norm": 0.17669852077960968, + "learning_rate": 0.00014163230379781963, + "loss": 2.0805, + "step": 1099 + }, + { + "epoch": 0.47444468406297174, + "grad_norm": 0.1416047066450119, + "learning_rate": 0.00014161663179879469, + "loss": 2.3382, + "step": 1100 + }, + { + "epoch": 0.47444468406297174, + "eval_loss": 2.1245031356811523, + "eval_runtime": 196.9672, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 1100 + }, + { + "epoch": 0.4748759974121199, + "grad_norm": 0.17482198774814606, + "learning_rate": 0.00014160094600606762, + "loss": 2.2444, + "step": 1101 + }, + { + "epoch": 0.4753073107612681, + "grad_norm": 0.176433727145195, + "learning_rate": 0.00014158524642288634, + "loss": 2.2817, + "step": 1102 + }, + { + "epoch": 0.47573862411041623, + "grad_norm": 0.16114936769008636, + "learning_rate": 0.00014156953305250166, + "loss": 2.2974, + "step": 1103 + }, + { + "epoch": 0.4761699374595644, + "grad_norm": 0.16793087124824524, + "learning_rate": 0.00014155380589816715, + "loss": 2.1037, + "step": 1104 + }, + { + "epoch": 0.4766012508087125, + "grad_norm": 0.1911628097295761, + "learning_rate": 0.0001415380649631393, + "loss": 2.2625, + "step": 1105 + }, + { + "epoch": 0.47703256415786066, + "grad_norm": 0.25380977988243103, + "learning_rate": 0.00014152231025067746, + "loss": 2.1, + "step": 1106 + }, + { + "epoch": 0.47746387750700886, + "grad_norm": 0.1790010780096054, + "learning_rate": 0.0001415065417640438, + "loss": 2.0709, + "step": 1107 + }, + { + "epoch": 0.477895190856157, + "grad_norm": 0.15084779262542725, + "learning_rate": 0.00014149075950650335, + "loss": 2.0395, + "step": 1108 + }, + { + "epoch": 0.47832650420530515, + "grad_norm": 0.17813463509082794, + "learning_rate": 0.00014147496348132404, + "loss": 2.4109, + "step": 1109 + }, + { + "epoch": 0.4787578175544533, + "grad_norm": 0.17824919521808624, + "learning_rate": 0.00014145915369177657, + "loss": 2.058, + "step": 1110 + }, + { + "epoch": 0.4791891309036015, + "grad_norm": 0.17649079859256744, + "learning_rate": 0.00014144333014113456, + "loss": 2.0866, + "step": 1111 + }, + { + "epoch": 0.47962044425274963, + "grad_norm": 0.1977768838405609, + "learning_rate": 0.00014142749283267442, + "loss": 2.1816, + "step": 1112 + }, + { + "epoch": 0.4800517576018978, + "grad_norm": 0.17968136072158813, + "learning_rate": 0.00014141164176967544, + "loss": 2.2179, + "step": 1113 + }, + { + "epoch": 0.4804830709510459, + "grad_norm": 0.16740866005420685, + "learning_rate": 0.00014139577695541976, + "loss": 2.2603, + "step": 1114 + }, + { + "epoch": 0.48091438430019406, + "grad_norm": 0.16543352603912354, + "learning_rate": 0.0001413798983931924, + "loss": 2.271, + "step": 1115 + }, + { + "epoch": 0.48134569764934226, + "grad_norm": 0.1757402867078781, + "learning_rate": 0.00014136400608628113, + "loss": 2.3212, + "step": 1116 + }, + { + "epoch": 0.4817770109984904, + "grad_norm": 0.20364037156105042, + "learning_rate": 0.0001413481000379767, + "loss": 2.2111, + "step": 1117 + }, + { + "epoch": 0.48220832434763855, + "grad_norm": 0.18235564231872559, + "learning_rate": 0.00014133218025157255, + "loss": 2.6066, + "step": 1118 + }, + { + "epoch": 0.4826396376967867, + "grad_norm": 0.16168133914470673, + "learning_rate": 0.0001413162467303651, + "loss": 2.189, + "step": 1119 + }, + { + "epoch": 0.4830709510459349, + "grad_norm": 0.17553839087486267, + "learning_rate": 0.00014130029947765352, + "loss": 2.1359, + "step": 1120 + }, + { + "epoch": 0.48350226439508304, + "grad_norm": 0.17774251103401184, + "learning_rate": 0.0001412843384967399, + "loss": 2.4902, + "step": 1121 + }, + { + "epoch": 0.4839335777442312, + "grad_norm": 0.17376278340816498, + "learning_rate": 0.0001412683637909291, + "loss": 2.2022, + "step": 1122 + }, + { + "epoch": 0.4843648910933793, + "grad_norm": 0.18041923642158508, + "learning_rate": 0.0001412523753635289, + "loss": 2.3228, + "step": 1123 + }, + { + "epoch": 0.4847962044425275, + "grad_norm": 0.2041550576686859, + "learning_rate": 0.00014123637321784987, + "loss": 2.4281, + "step": 1124 + }, + { + "epoch": 0.48522751779167567, + "grad_norm": 0.1704571694135666, + "learning_rate": 0.00014122035735720538, + "loss": 2.1997, + "step": 1125 + }, + { + "epoch": 0.48522751779167567, + "eval_loss": 2.1237313747406006, + "eval_runtime": 197.9828, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 1125 + }, + { + "epoch": 0.4856588311408238, + "grad_norm": 0.18956632912158966, + "learning_rate": 0.00014120432778491176, + "loss": 2.2244, + "step": 1126 + }, + { + "epoch": 0.48609014448997195, + "grad_norm": 0.16811709105968475, + "learning_rate": 0.00014118828450428807, + "loss": 2.1037, + "step": 1127 + }, + { + "epoch": 0.4865214578391201, + "grad_norm": 0.20404136180877686, + "learning_rate": 0.00014117222751865621, + "loss": 2.3572, + "step": 1128 + }, + { + "epoch": 0.4869527711882683, + "grad_norm": 0.1897653490304947, + "learning_rate": 0.00014115615683134105, + "loss": 2.0607, + "step": 1129 + }, + { + "epoch": 0.48738408453741644, + "grad_norm": 0.20207540690898895, + "learning_rate": 0.00014114007244567012, + "loss": 2.3401, + "step": 1130 + }, + { + "epoch": 0.4878153978865646, + "grad_norm": 0.16764988005161285, + "learning_rate": 0.0001411239743649739, + "loss": 2.1391, + "step": 1131 + }, + { + "epoch": 0.4882467112357127, + "grad_norm": 0.15346284210681915, + "learning_rate": 0.00014110786259258565, + "loss": 2.1538, + "step": 1132 + }, + { + "epoch": 0.4886780245848609, + "grad_norm": 0.17945681512355804, + "learning_rate": 0.00014109173713184155, + "loss": 2.2123, + "step": 1133 + }, + { + "epoch": 0.48910933793400907, + "grad_norm": 0.1814691722393036, + "learning_rate": 0.00014107559798608047, + "loss": 2.1695, + "step": 1134 + }, + { + "epoch": 0.4895406512831572, + "grad_norm": 0.17287328839302063, + "learning_rate": 0.00014105944515864426, + "loss": 2.2417, + "step": 1135 + }, + { + "epoch": 0.48997196463230536, + "grad_norm": 0.17122970521450043, + "learning_rate": 0.00014104327865287748, + "loss": 2.0969, + "step": 1136 + }, + { + "epoch": 0.4904032779814535, + "grad_norm": 0.19306020438671112, + "learning_rate": 0.00014102709847212765, + "loss": 2.4507, + "step": 1137 + }, + { + "epoch": 0.4908345913306017, + "grad_norm": 0.17338979244232178, + "learning_rate": 0.000141010904619745, + "loss": 2.2043, + "step": 1138 + }, + { + "epoch": 0.49126590467974984, + "grad_norm": 0.19331465661525726, + "learning_rate": 0.0001409946970990827, + "loss": 2.2461, + "step": 1139 + }, + { + "epoch": 0.491697218028898, + "grad_norm": 0.1610681712627411, + "learning_rate": 0.00014097847591349662, + "loss": 2.091, + "step": 1140 + }, + { + "epoch": 0.49212853137804613, + "grad_norm": 0.17557750642299652, + "learning_rate": 0.0001409622410663456, + "loss": 2.1339, + "step": 1141 + }, + { + "epoch": 0.49255984472719433, + "grad_norm": 0.16115573048591614, + "learning_rate": 0.00014094599256099123, + "loss": 2.1913, + "step": 1142 + }, + { + "epoch": 0.4929911580763425, + "grad_norm": 0.15505258738994598, + "learning_rate": 0.0001409297304007979, + "loss": 2.1831, + "step": 1143 + }, + { + "epoch": 0.4934224714254906, + "grad_norm": 0.1573905646800995, + "learning_rate": 0.00014091345458913288, + "loss": 2.2225, + "step": 1144 + }, + { + "epoch": 0.49385378477463876, + "grad_norm": 0.15298591554164886, + "learning_rate": 0.00014089716512936628, + "loss": 2.1078, + "step": 1145 + }, + { + "epoch": 0.4942850981237869, + "grad_norm": 0.14728635549545288, + "learning_rate": 0.00014088086202487102, + "loss": 2.034, + "step": 1146 + }, + { + "epoch": 0.4947164114729351, + "grad_norm": 0.18687205016613007, + "learning_rate": 0.0001408645452790228, + "loss": 2.2212, + "step": 1147 + }, + { + "epoch": 0.49514772482208325, + "grad_norm": 0.22122670710086823, + "learning_rate": 0.00014084821489520022, + "loss": 2.1835, + "step": 1148 + }, + { + "epoch": 0.4955790381712314, + "grad_norm": 0.1600857973098755, + "learning_rate": 0.00014083187087678462, + "loss": 2.3065, + "step": 1149 + }, + { + "epoch": 0.49601035152037953, + "grad_norm": 0.15622857213020325, + "learning_rate": 0.00014081551322716023, + "loss": 2.1417, + "step": 1150 + }, + { + "epoch": 0.49601035152037953, + "eval_loss": 2.122781753540039, + "eval_runtime": 198.1311, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 1150 + }, + { + "epoch": 0.49644166486952773, + "grad_norm": 0.159952774643898, + "learning_rate": 0.0001407991419497141, + "loss": 2.3553, + "step": 1151 + }, + { + "epoch": 0.4968729782186759, + "grad_norm": 0.17937713861465454, + "learning_rate": 0.00014078275704783604, + "loss": 2.2021, + "step": 1152 + }, + { + "epoch": 0.497304291567824, + "grad_norm": 0.16701209545135498, + "learning_rate": 0.00014076635852491878, + "loss": 2.0779, + "step": 1153 + }, + { + "epoch": 0.49773560491697216, + "grad_norm": 0.19412510097026825, + "learning_rate": 0.00014074994638435775, + "loss": 1.895, + "step": 1154 + }, + { + "epoch": 0.49816691826612036, + "grad_norm": 0.17660222947597504, + "learning_rate": 0.00014073352062955128, + "loss": 2.2594, + "step": 1155 + }, + { + "epoch": 0.4985982316152685, + "grad_norm": 0.15943753719329834, + "learning_rate": 0.00014071708126390053, + "loss": 2.4051, + "step": 1156 + }, + { + "epoch": 0.49902954496441665, + "grad_norm": 0.165870800614357, + "learning_rate": 0.0001407006282908094, + "loss": 2.338, + "step": 1157 + }, + { + "epoch": 0.4994608583135648, + "grad_norm": 0.16600917279720306, + "learning_rate": 0.00014068416171368476, + "loss": 2.0591, + "step": 1158 + }, + { + "epoch": 0.49989217166271294, + "grad_norm": 0.18140505254268646, + "learning_rate": 0.00014066768153593609, + "loss": 2.4228, + "step": 1159 + }, + { + "epoch": 0.5003234850118611, + "grad_norm": 0.17261415719985962, + "learning_rate": 0.0001406511877609758, + "loss": 2.1168, + "step": 1160 + }, + { + "epoch": 0.5007547983610092, + "grad_norm": 0.17106929421424866, + "learning_rate": 0.0001406346803922192, + "loss": 2.1904, + "step": 1161 + }, + { + "epoch": 0.5011861117101575, + "grad_norm": 0.16084757447242737, + "learning_rate": 0.00014061815943308424, + "loss": 2.2569, + "step": 1162 + }, + { + "epoch": 0.5016174250593056, + "grad_norm": 0.17182020843029022, + "learning_rate": 0.00014060162488699175, + "loss": 2.0081, + "step": 1163 + }, + { + "epoch": 0.5020487384084538, + "grad_norm": 0.16303688287734985, + "learning_rate": 0.00014058507675736542, + "loss": 2.141, + "step": 1164 + }, + { + "epoch": 0.5024800517576019, + "grad_norm": 0.17593824863433838, + "learning_rate": 0.00014056851504763175, + "loss": 2.2078, + "step": 1165 + }, + { + "epoch": 0.50291136510675, + "grad_norm": 0.17629596590995789, + "learning_rate": 0.00014055193976121998, + "loss": 2.2364, + "step": 1166 + }, + { + "epoch": 0.5033426784558982, + "grad_norm": 0.19598311185836792, + "learning_rate": 0.0001405353509015622, + "loss": 2.2881, + "step": 1167 + }, + { + "epoch": 0.5037739918050463, + "grad_norm": 0.1822650283575058, + "learning_rate": 0.00014051874847209334, + "loss": 2.3105, + "step": 1168 + }, + { + "epoch": 0.5042053051541945, + "grad_norm": 0.2322334349155426, + "learning_rate": 0.00014050213247625114, + "loss": 2.1919, + "step": 1169 + }, + { + "epoch": 0.5046366185033426, + "grad_norm": 0.173886239528656, + "learning_rate": 0.00014048550291747606, + "loss": 2.2836, + "step": 1170 + }, + { + "epoch": 0.5050679318524909, + "grad_norm": 0.23277299106121063, + "learning_rate": 0.0001404688597992115, + "loss": 2.2851, + "step": 1171 + }, + { + "epoch": 0.505499245201639, + "grad_norm": 0.1585807353258133, + "learning_rate": 0.0001404522031249035, + "loss": 2.3763, + "step": 1172 + }, + { + "epoch": 0.5059305585507872, + "grad_norm": 0.17657655477523804, + "learning_rate": 0.00014043553289800112, + "loss": 2.287, + "step": 1173 + }, + { + "epoch": 0.5063618718999353, + "grad_norm": 0.1734786033630371, + "learning_rate": 0.00014041884912195608, + "loss": 2.1696, + "step": 1174 + }, + { + "epoch": 0.5067931852490835, + "grad_norm": 0.1727166771888733, + "learning_rate": 0.0001404021518002229, + "loss": 2.1269, + "step": 1175 + }, + { + "epoch": 0.5067931852490835, + "eval_loss": 2.1227898597717285, + "eval_runtime": 198.1515, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 1175 + }, + { + "epoch": 0.5072244985982316, + "grad_norm": 0.19915609061717987, + "learning_rate": 0.00014038544093625892, + "loss": 2.1747, + "step": 1176 + }, + { + "epoch": 0.5076558119473797, + "grad_norm": 0.15546812117099762, + "learning_rate": 0.0001403687165335244, + "loss": 2.0218, + "step": 1177 + }, + { + "epoch": 0.5080871252965279, + "grad_norm": 0.17874981462955475, + "learning_rate": 0.00014035197859548225, + "loss": 2.4689, + "step": 1178 + }, + { + "epoch": 0.508518438645676, + "grad_norm": 0.15583276748657227, + "learning_rate": 0.00014033522712559825, + "loss": 2.0931, + "step": 1179 + }, + { + "epoch": 0.5089497519948243, + "grad_norm": 0.19189707934856415, + "learning_rate": 0.00014031846212734096, + "loss": 2.2619, + "step": 1180 + }, + { + "epoch": 0.5093810653439724, + "grad_norm": 0.20342782139778137, + "learning_rate": 0.00014030168360418182, + "loss": 2.0796, + "step": 1181 + }, + { + "epoch": 0.5098123786931206, + "grad_norm": 0.17643365263938904, + "learning_rate": 0.00014028489155959494, + "loss": 2.274, + "step": 1182 + }, + { + "epoch": 0.5102436920422687, + "grad_norm": 0.1894954890012741, + "learning_rate": 0.00014026808599705733, + "loss": 2.2807, + "step": 1183 + }, + { + "epoch": 0.5106750053914169, + "grad_norm": 0.17265945672988892, + "learning_rate": 0.00014025126692004874, + "loss": 2.2272, + "step": 1184 + }, + { + "epoch": 0.511106318740565, + "grad_norm": 0.2056184709072113, + "learning_rate": 0.00014023443433205176, + "loss": 2.2147, + "step": 1185 + }, + { + "epoch": 0.5115376320897131, + "grad_norm": 0.16164915263652802, + "learning_rate": 0.00014021758823655174, + "loss": 1.9599, + "step": 1186 + }, + { + "epoch": 0.5119689454388613, + "grad_norm": 0.1566849946975708, + "learning_rate": 0.0001402007286370369, + "loss": 2.0889, + "step": 1187 + }, + { + "epoch": 0.5124002587880095, + "grad_norm": 0.17637118697166443, + "learning_rate": 0.00014018385553699818, + "loss": 2.2879, + "step": 1188 + }, + { + "epoch": 0.5128315721371577, + "grad_norm": 0.18087397515773773, + "learning_rate": 0.0001401669689399293, + "loss": 2.2892, + "step": 1189 + }, + { + "epoch": 0.5132628854863058, + "grad_norm": 0.19656838476657867, + "learning_rate": 0.00014015006884932687, + "loss": 2.3285, + "step": 1190 + }, + { + "epoch": 0.513694198835454, + "grad_norm": 0.1986808478832245, + "learning_rate": 0.0001401331552686902, + "loss": 2.2076, + "step": 1191 + }, + { + "epoch": 0.5141255121846021, + "grad_norm": 0.15159070491790771, + "learning_rate": 0.00014011622820152145, + "loss": 2.0549, + "step": 1192 + }, + { + "epoch": 0.5145568255337503, + "grad_norm": 0.17124521732330322, + "learning_rate": 0.00014009928765132556, + "loss": 2.2253, + "step": 1193 + }, + { + "epoch": 0.5149881388828984, + "grad_norm": 0.19993092119693756, + "learning_rate": 0.00014008233362161024, + "loss": 2.1887, + "step": 1194 + }, + { + "epoch": 0.5154194522320465, + "grad_norm": 0.17296721041202545, + "learning_rate": 0.00014006536611588602, + "loss": 2.2939, + "step": 1195 + }, + { + "epoch": 0.5158507655811947, + "grad_norm": 0.15552176535129547, + "learning_rate": 0.0001400483851376662, + "loss": 2.1718, + "step": 1196 + }, + { + "epoch": 0.516282078930343, + "grad_norm": 0.17558616399765015, + "learning_rate": 0.00014003139069046692, + "loss": 2.367, + "step": 1197 + }, + { + "epoch": 0.5167133922794911, + "grad_norm": 0.15603621304035187, + "learning_rate": 0.00014001438277780697, + "loss": 1.8832, + "step": 1198 + }, + { + "epoch": 0.5171447056286392, + "grad_norm": 0.16005119681358337, + "learning_rate": 0.0001399973614032081, + "loss": 1.8694, + "step": 1199 + }, + { + "epoch": 0.5175760189777874, + "grad_norm": 0.17181624472141266, + "learning_rate": 0.00013998032657019474, + "loss": 2.3409, + "step": 1200 + }, + { + "epoch": 0.5175760189777874, + "eval_loss": 2.1217408180236816, + "eval_runtime": 199.1599, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 1200 + }, + { + "epoch": 0.5180073323269355, + "grad_norm": 0.15452653169631958, + "learning_rate": 0.00013996327828229418, + "loss": 2.1593, + "step": 1201 + }, + { + "epoch": 0.5184386456760837, + "grad_norm": 0.16115358471870422, + "learning_rate": 0.0001399462165430364, + "loss": 2.1336, + "step": 1202 + }, + { + "epoch": 0.5188699590252318, + "grad_norm": 0.16711373627185822, + "learning_rate": 0.00013992914135595427, + "loss": 2.3766, + "step": 1203 + }, + { + "epoch": 0.51930127237438, + "grad_norm": 0.1763235479593277, + "learning_rate": 0.00013991205272458334, + "loss": 2.4137, + "step": 1204 + }, + { + "epoch": 0.5197325857235281, + "grad_norm": 0.16084948182106018, + "learning_rate": 0.00013989495065246205, + "loss": 2.1188, + "step": 1205 + }, + { + "epoch": 0.5201638990726764, + "grad_norm": 0.17245417833328247, + "learning_rate": 0.00013987783514313155, + "loss": 2.2178, + "step": 1206 + }, + { + "epoch": 0.5205952124218245, + "grad_norm": 0.1630506068468094, + "learning_rate": 0.00013986070620013576, + "loss": 2.073, + "step": 1207 + }, + { + "epoch": 0.5210265257709726, + "grad_norm": 0.17694434523582458, + "learning_rate": 0.00013984356382702146, + "loss": 2.1957, + "step": 1208 + }, + { + "epoch": 0.5214578391201208, + "grad_norm": 0.18180590867996216, + "learning_rate": 0.00013982640802733817, + "loss": 2.4528, + "step": 1209 + }, + { + "epoch": 0.5218891524692689, + "grad_norm": 0.24107345938682556, + "learning_rate": 0.00013980923880463816, + "loss": 2.2918, + "step": 1210 + }, + { + "epoch": 0.5223204658184171, + "grad_norm": 0.15491077303886414, + "learning_rate": 0.0001397920561624765, + "loss": 2.0963, + "step": 1211 + }, + { + "epoch": 0.5227517791675652, + "grad_norm": 0.1684110015630722, + "learning_rate": 0.00013977486010441107, + "loss": 2.2447, + "step": 1212 + }, + { + "epoch": 0.5231830925167134, + "grad_norm": 0.16640383005142212, + "learning_rate": 0.0001397576506340025, + "loss": 2.1924, + "step": 1213 + }, + { + "epoch": 0.5236144058658615, + "grad_norm": 0.18121518194675446, + "learning_rate": 0.00013974042775481417, + "loss": 2.2443, + "step": 1214 + }, + { + "epoch": 0.5240457192150098, + "grad_norm": 0.17246213555335999, + "learning_rate": 0.0001397231914704123, + "loss": 2.2232, + "step": 1215 + }, + { + "epoch": 0.5244770325641579, + "grad_norm": 0.18331019580364227, + "learning_rate": 0.0001397059417843658, + "loss": 2.0938, + "step": 1216 + }, + { + "epoch": 0.524908345913306, + "grad_norm": 0.17655853927135468, + "learning_rate": 0.00013968867870024648, + "loss": 2.2938, + "step": 1217 + }, + { + "epoch": 0.5253396592624542, + "grad_norm": 0.17127923667430878, + "learning_rate": 0.0001396714022216288, + "loss": 2.2469, + "step": 1218 + }, + { + "epoch": 0.5257709726116023, + "grad_norm": 0.1631791591644287, + "learning_rate": 0.00013965411235209005, + "loss": 2.2233, + "step": 1219 + }, + { + "epoch": 0.5262022859607505, + "grad_norm": 0.15704067051410675, + "learning_rate": 0.0001396368090952103, + "loss": 2.1231, + "step": 1220 + }, + { + "epoch": 0.5266335993098986, + "grad_norm": 0.17695368826389313, + "learning_rate": 0.00013961949245457235, + "loss": 2.2012, + "step": 1221 + }, + { + "epoch": 0.5270649126590468, + "grad_norm": 0.1563882827758789, + "learning_rate": 0.00013960216243376184, + "loss": 2.0977, + "step": 1222 + }, + { + "epoch": 0.5274962260081949, + "grad_norm": 0.16035982966423035, + "learning_rate": 0.00013958481903636713, + "loss": 2.1171, + "step": 1223 + }, + { + "epoch": 0.5279275393573432, + "grad_norm": 0.16886377334594727, + "learning_rate": 0.00013956746226597935, + "loss": 2.3124, + "step": 1224 + }, + { + "epoch": 0.5283588527064913, + "grad_norm": 0.1622340977191925, + "learning_rate": 0.0001395500921261924, + "loss": 2.339, + "step": 1225 + }, + { + "epoch": 0.5283588527064913, + "eval_loss": 2.120978832244873, + "eval_runtime": 197.7462, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 1225 + }, + { + "epoch": 0.5287901660556394, + "grad_norm": 0.16553667187690735, + "learning_rate": 0.00013953270862060294, + "loss": 2.2944, + "step": 1226 + }, + { + "epoch": 0.5292214794047876, + "grad_norm": 0.15495768189430237, + "learning_rate": 0.00013951531175281046, + "loss": 2.2237, + "step": 1227 + }, + { + "epoch": 0.5296527927539357, + "grad_norm": 0.16204552352428436, + "learning_rate": 0.0001394979015264172, + "loss": 2.3077, + "step": 1228 + }, + { + "epoch": 0.5300841061030839, + "grad_norm": 0.17178687453269958, + "learning_rate": 0.00013948047794502802, + "loss": 2.2002, + "step": 1229 + }, + { + "epoch": 0.530515419452232, + "grad_norm": 1.4567164182662964, + "learning_rate": 0.0001394630410122508, + "loss": 2.2502, + "step": 1230 + }, + { + "epoch": 0.5309467328013802, + "grad_norm": 0.1543223112821579, + "learning_rate": 0.00013944559073169593, + "loss": 2.1162, + "step": 1231 + }, + { + "epoch": 0.5313780461505283, + "grad_norm": 0.18608568608760834, + "learning_rate": 0.00013942812710697678, + "loss": 2.3083, + "step": 1232 + }, + { + "epoch": 0.5318093594996766, + "grad_norm": 0.1594236046075821, + "learning_rate": 0.00013941065014170932, + "loss": 2.1773, + "step": 1233 + }, + { + "epoch": 0.5322406728488247, + "grad_norm": 0.18374276161193848, + "learning_rate": 0.00013939315983951234, + "loss": 2.3547, + "step": 1234 + }, + { + "epoch": 0.5326719861979728, + "grad_norm": 0.14752689003944397, + "learning_rate": 0.00013937565620400746, + "loss": 2.2261, + "step": 1235 + }, + { + "epoch": 0.533103299547121, + "grad_norm": 0.1832878440618515, + "learning_rate": 0.00013935813923881895, + "loss": 2.1805, + "step": 1236 + }, + { + "epoch": 0.5335346128962691, + "grad_norm": 0.18583160638809204, + "learning_rate": 0.0001393406089475739, + "loss": 2.0176, + "step": 1237 + }, + { + "epoch": 0.5339659262454173, + "grad_norm": 0.1821640282869339, + "learning_rate": 0.00013932306533390214, + "loss": 2.2021, + "step": 1238 + }, + { + "epoch": 0.5343972395945654, + "grad_norm": 0.16310039162635803, + "learning_rate": 0.0001393055084014363, + "loss": 2.0919, + "step": 1239 + }, + { + "epoch": 0.5348285529437136, + "grad_norm": 0.18390604853630066, + "learning_rate": 0.00013928793815381167, + "loss": 2.0216, + "step": 1240 + }, + { + "epoch": 0.5352598662928618, + "grad_norm": 0.15094532072544098, + "learning_rate": 0.00013927035459466644, + "loss": 2.336, + "step": 1241 + }, + { + "epoch": 0.53569117964201, + "grad_norm": 0.16046181321144104, + "learning_rate": 0.00013925275772764143, + "loss": 2.3603, + "step": 1242 + }, + { + "epoch": 0.5361224929911581, + "grad_norm": 0.18129126727581024, + "learning_rate": 0.00013923514755638029, + "loss": 2.3465, + "step": 1243 + }, + { + "epoch": 0.5365538063403062, + "grad_norm": 0.17180679738521576, + "learning_rate": 0.00013921752408452937, + "loss": 2.1805, + "step": 1244 + }, + { + "epoch": 0.5369851196894544, + "grad_norm": 0.15546143054962158, + "learning_rate": 0.00013919988731573782, + "loss": 2.1498, + "step": 1245 + }, + { + "epoch": 0.5374164330386025, + "grad_norm": 0.16976916790008545, + "learning_rate": 0.00013918223725365754, + "loss": 2.1929, + "step": 1246 + }, + { + "epoch": 0.5378477463877507, + "grad_norm": 0.16079477965831757, + "learning_rate": 0.00013916457390194314, + "loss": 2.3685, + "step": 1247 + }, + { + "epoch": 0.5382790597368988, + "grad_norm": 0.16682679951190948, + "learning_rate": 0.000139146897264252, + "loss": 2.147, + "step": 1248 + }, + { + "epoch": 0.538710373086047, + "grad_norm": 0.20510974526405334, + "learning_rate": 0.00013912920734424433, + "loss": 2.2688, + "step": 1249 + }, + { + "epoch": 0.5391416864351952, + "grad_norm": 0.1665913313627243, + "learning_rate": 0.00013911150414558292, + "loss": 2.2088, + "step": 1250 + }, + { + "epoch": 0.5391416864351952, + "eval_loss": 2.122371196746826, + "eval_runtime": 198.043, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 1250 + }, + { + "epoch": 0.5395729997843434, + "grad_norm": 0.17404893040657043, + "learning_rate": 0.0001390937876719335, + "loss": 2.2236, + "step": 1251 + }, + { + "epoch": 0.5400043131334915, + "grad_norm": 0.21290186047554016, + "learning_rate": 0.00013907605792696444, + "loss": 2.1855, + "step": 1252 + }, + { + "epoch": 0.5404356264826397, + "grad_norm": 0.18014317750930786, + "learning_rate": 0.0001390583149143468, + "loss": 2.3149, + "step": 1253 + }, + { + "epoch": 0.5408669398317878, + "grad_norm": 0.1657971888780594, + "learning_rate": 0.0001390405586377546, + "loss": 1.9182, + "step": 1254 + }, + { + "epoch": 0.5412982531809359, + "grad_norm": 0.17486490309238434, + "learning_rate": 0.00013902278910086433, + "loss": 2.1968, + "step": 1255 + }, + { + "epoch": 0.5417295665300841, + "grad_norm": 0.1682259440422058, + "learning_rate": 0.00013900500630735548, + "loss": 2.3308, + "step": 1256 + }, + { + "epoch": 0.5421608798792322, + "grad_norm": 0.19142164289951324, + "learning_rate": 0.00013898721026091014, + "loss": 2.2679, + "step": 1257 + }, + { + "epoch": 0.5425921932283804, + "grad_norm": 0.15226717293262482, + "learning_rate": 0.00013896940096521312, + "loss": 2.0877, + "step": 1258 + }, + { + "epoch": 0.5430235065775286, + "grad_norm": 0.17772473394870758, + "learning_rate": 0.00013895157842395208, + "loss": 2.4032, + "step": 1259 + }, + { + "epoch": 0.5434548199266768, + "grad_norm": 0.17212212085723877, + "learning_rate": 0.00013893374264081738, + "loss": 2.298, + "step": 1260 + }, + { + "epoch": 0.5438861332758249, + "grad_norm": 0.1911373734474182, + "learning_rate": 0.0001389158936195021, + "loss": 2.3969, + "step": 1261 + }, + { + "epoch": 0.5443174466249731, + "grad_norm": 0.1889583170413971, + "learning_rate": 0.00013889803136370207, + "loss": 2.136, + "step": 1262 + }, + { + "epoch": 0.5447487599741212, + "grad_norm": 0.16074030101299286, + "learning_rate": 0.00013888015587711587, + "loss": 2.2086, + "step": 1263 + }, + { + "epoch": 0.5451800733232693, + "grad_norm": 0.15296761691570282, + "learning_rate": 0.0001388622671634448, + "loss": 2.3906, + "step": 1264 + }, + { + "epoch": 0.5456113866724175, + "grad_norm": 0.18919295072555542, + "learning_rate": 0.00013884436522639297, + "loss": 2.3383, + "step": 1265 + }, + { + "epoch": 0.5460427000215656, + "grad_norm": 0.16433317959308624, + "learning_rate": 0.0001388264500696671, + "loss": 2.1688, + "step": 1266 + }, + { + "epoch": 0.5464740133707138, + "grad_norm": 0.16185767948627472, + "learning_rate": 0.00013880852169697672, + "loss": 2.5015, + "step": 1267 + }, + { + "epoch": 0.546905326719862, + "grad_norm": 0.18482503294944763, + "learning_rate": 0.00013879058011203417, + "loss": 2.2468, + "step": 1268 + }, + { + "epoch": 0.5473366400690102, + "grad_norm": 0.1860150247812271, + "learning_rate": 0.00013877262531855438, + "loss": 2.204, + "step": 1269 + }, + { + "epoch": 0.5477679534181583, + "grad_norm": 0.1531609743833542, + "learning_rate": 0.0001387546573202551, + "loss": 2.2641, + "step": 1270 + }, + { + "epoch": 0.5481992667673065, + "grad_norm": 0.19258692860603333, + "learning_rate": 0.00013873667612085687, + "loss": 2.1963, + "step": 1271 + }, + { + "epoch": 0.5486305801164546, + "grad_norm": 0.20093025267124176, + "learning_rate": 0.0001387186817240828, + "loss": 2.0085, + "step": 1272 + }, + { + "epoch": 0.5490618934656027, + "grad_norm": 0.16600941121578217, + "learning_rate": 0.00013870067413365887, + "loss": 2.3559, + "step": 1273 + }, + { + "epoch": 0.5494932068147509, + "grad_norm": 0.15207615494728088, + "learning_rate": 0.00013868265335331372, + "loss": 1.9825, + "step": 1274 + }, + { + "epoch": 0.549924520163899, + "grad_norm": 0.16786298155784607, + "learning_rate": 0.0001386646193867788, + "loss": 2.3603, + "step": 1275 + }, + { + "epoch": 0.549924520163899, + "eval_loss": 2.120718479156494, + "eval_runtime": 197.0115, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 1275 + }, + { + "epoch": 0.5503558335130472, + "grad_norm": 0.8668006658554077, + "learning_rate": 0.0001386465722377882, + "loss": 2.1801, + "step": 1276 + }, + { + "epoch": 0.5507871468621954, + "grad_norm": 0.16311630606651306, + "learning_rate": 0.00013862851191007877, + "loss": 2.0761, + "step": 1277 + }, + { + "epoch": 0.5512184602113436, + "grad_norm": 0.1599023938179016, + "learning_rate": 0.00013861043840739015, + "loss": 2.1566, + "step": 1278 + }, + { + "epoch": 0.5516497735604917, + "grad_norm": 0.1699908971786499, + "learning_rate": 0.0001385923517334646, + "loss": 2.2602, + "step": 1279 + }, + { + "epoch": 0.5520810869096399, + "grad_norm": 0.16559387743473053, + "learning_rate": 0.00013857425189204724, + "loss": 2.3211, + "step": 1280 + }, + { + "epoch": 0.552512400258788, + "grad_norm": 0.1768280565738678, + "learning_rate": 0.00013855613888688574, + "loss": 2.2501, + "step": 1281 + }, + { + "epoch": 0.5529437136079361, + "grad_norm": 0.16502228379249573, + "learning_rate": 0.00013853801272173063, + "loss": 1.9774, + "step": 1282 + }, + { + "epoch": 0.5533750269570843, + "grad_norm": 0.16906069219112396, + "learning_rate": 0.0001385198734003352, + "loss": 2.2409, + "step": 1283 + }, + { + "epoch": 0.5538063403062324, + "grad_norm": 0.15726394951343536, + "learning_rate": 0.0001385017209264553, + "loss": 2.3878, + "step": 1284 + }, + { + "epoch": 0.5542376536553806, + "grad_norm": 0.17712968587875366, + "learning_rate": 0.00013848355530384965, + "loss": 2.1526, + "step": 1285 + }, + { + "epoch": 0.5546689670045288, + "grad_norm": 0.19086754322052002, + "learning_rate": 0.00013846537653627964, + "loss": 2.2802, + "step": 1286 + }, + { + "epoch": 0.555100280353677, + "grad_norm": 0.16991381347179413, + "learning_rate": 0.0001384471846275094, + "loss": 2.332, + "step": 1287 + }, + { + "epoch": 0.5555315937028251, + "grad_norm": 0.19330179691314697, + "learning_rate": 0.00013842897958130568, + "loss": 2.4018, + "step": 1288 + }, + { + "epoch": 0.5559629070519733, + "grad_norm": 0.16834238171577454, + "learning_rate": 0.00013841076140143813, + "loss": 2.2892, + "step": 1289 + }, + { + "epoch": 0.5563942204011214, + "grad_norm": 0.17593060433864594, + "learning_rate": 0.00013839253009167898, + "loss": 2.269, + "step": 1290 + }, + { + "epoch": 0.5568255337502696, + "grad_norm": 0.15262264013290405, + "learning_rate": 0.00013837428565580318, + "loss": 2.2531, + "step": 1291 + }, + { + "epoch": 0.5572568470994177, + "grad_norm": 0.16939087212085724, + "learning_rate": 0.00013835602809758853, + "loss": 2.1666, + "step": 1292 + }, + { + "epoch": 0.5576881604485658, + "grad_norm": 0.183957040309906, + "learning_rate": 0.0001383377574208154, + "loss": 2.1925, + "step": 1293 + }, + { + "epoch": 0.5581194737977141, + "grad_norm": 0.17081241309642792, + "learning_rate": 0.00013831947362926694, + "loss": 2.1878, + "step": 1294 + }, + { + "epoch": 0.5585507871468622, + "grad_norm": 0.17042112350463867, + "learning_rate": 0.00013830117672672902, + "loss": 2.4236, + "step": 1295 + }, + { + "epoch": 0.5589821004960104, + "grad_norm": 0.17426973581314087, + "learning_rate": 0.0001382828667169902, + "loss": 2.3423, + "step": 1296 + }, + { + "epoch": 0.5594134138451585, + "grad_norm": 0.1627834439277649, + "learning_rate": 0.00013826454360384182, + "loss": 2.3386, + "step": 1297 + }, + { + "epoch": 0.5598447271943067, + "grad_norm": 0.15972758829593658, + "learning_rate": 0.0001382462073910778, + "loss": 2.0709, + "step": 1298 + }, + { + "epoch": 0.5602760405434548, + "grad_norm": 0.17867346107959747, + "learning_rate": 0.0001382278580824949, + "loss": 2.1761, + "step": 1299 + }, + { + "epoch": 0.560707353892603, + "grad_norm": 0.20676644146442413, + "learning_rate": 0.0001382094956818925, + "loss": 2.3478, + "step": 1300 + }, + { + "epoch": 0.560707353892603, + "eval_loss": 2.121127128601074, + "eval_runtime": 207.5162, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 1300 + }, + { + "epoch": 0.5611386672417511, + "grad_norm": 0.1815398633480072, + "learning_rate": 0.00013819112019307283, + "loss": 2.3379, + "step": 1301 + }, + { + "epoch": 0.5615699805908992, + "grad_norm": 0.1549919694662094, + "learning_rate": 0.00013817273161984066, + "loss": 2.0807, + "step": 1302 + }, + { + "epoch": 0.5620012939400475, + "grad_norm": 0.17683330178260803, + "learning_rate": 0.00013815432996600353, + "loss": 2.2306, + "step": 1303 + }, + { + "epoch": 0.5624326072891956, + "grad_norm": 0.17177826166152954, + "learning_rate": 0.00013813591523537176, + "loss": 2.2926, + "step": 1304 + }, + { + "epoch": 0.5628639206383438, + "grad_norm": 0.16859027743339539, + "learning_rate": 0.0001381174874317583, + "loss": 2.2788, + "step": 1305 + }, + { + "epoch": 0.5632952339874919, + "grad_norm": 0.17350350320339203, + "learning_rate": 0.00013809904655897882, + "loss": 2.255, + "step": 1306 + }, + { + "epoch": 0.5637265473366401, + "grad_norm": 0.17333823442459106, + "learning_rate": 0.00013808059262085173, + "loss": 2.1696, + "step": 1307 + }, + { + "epoch": 0.5641578606857882, + "grad_norm": 0.1684228777885437, + "learning_rate": 0.00013806212562119806, + "loss": 2.2234, + "step": 1308 + }, + { + "epoch": 0.5645891740349364, + "grad_norm": 0.16519196331501007, + "learning_rate": 0.0001380436455638417, + "loss": 2.1744, + "step": 1309 + }, + { + "epoch": 0.5650204873840845, + "grad_norm": 0.159685418009758, + "learning_rate": 0.00013802515245260906, + "loss": 2.3333, + "step": 1310 + }, + { + "epoch": 0.5654518007332326, + "grad_norm": 0.21496149897575378, + "learning_rate": 0.00013800664629132935, + "loss": 2.187, + "step": 1311 + }, + { + "epoch": 0.5658831140823809, + "grad_norm": 0.19159434735774994, + "learning_rate": 0.00013798812708383453, + "loss": 2.2162, + "step": 1312 + }, + { + "epoch": 0.566314427431529, + "grad_norm": 0.1915857344865799, + "learning_rate": 0.00013796959483395915, + "loss": 2.195, + "step": 1313 + }, + { + "epoch": 0.5667457407806772, + "grad_norm": 0.18786486983299255, + "learning_rate": 0.00013795104954554056, + "loss": 2.1581, + "step": 1314 + }, + { + "epoch": 0.5671770541298253, + "grad_norm": 0.1549830585718155, + "learning_rate": 0.00013793249122241873, + "loss": 2.2125, + "step": 1315 + }, + { + "epoch": 0.5676083674789735, + "grad_norm": 0.18538445234298706, + "learning_rate": 0.00013791391986843636, + "loss": 2.327, + "step": 1316 + }, + { + "epoch": 0.5680396808281216, + "grad_norm": 0.15483881533145905, + "learning_rate": 0.00013789533548743888, + "loss": 2.0952, + "step": 1317 + }, + { + "epoch": 0.5684709941772698, + "grad_norm": 0.14627200365066528, + "learning_rate": 0.00013787673808327437, + "loss": 2.1907, + "step": 1318 + }, + { + "epoch": 0.5689023075264179, + "grad_norm": 0.1589733511209488, + "learning_rate": 0.00013785812765979366, + "loss": 2.1557, + "step": 1319 + }, + { + "epoch": 0.569333620875566, + "grad_norm": 0.17344968020915985, + "learning_rate": 0.00013783950422085016, + "loss": 2.2803, + "step": 1320 + }, + { + "epoch": 0.5697649342247143, + "grad_norm": 0.15491415560245514, + "learning_rate": 0.00013782086777030015, + "loss": 1.9636, + "step": 1321 + }, + { + "epoch": 0.5701962475738624, + "grad_norm": 0.16137534379959106, + "learning_rate": 0.00013780221831200245, + "loss": 2.2102, + "step": 1322 + }, + { + "epoch": 0.5706275609230106, + "grad_norm": 0.15676361322402954, + "learning_rate": 0.00013778355584981867, + "loss": 2.2007, + "step": 1323 + }, + { + "epoch": 0.5710588742721587, + "grad_norm": 0.16707196831703186, + "learning_rate": 0.00013776488038761306, + "loss": 2.1038, + "step": 1324 + }, + { + "epoch": 0.5714901876213069, + "grad_norm": 0.18508413434028625, + "learning_rate": 0.00013774619192925258, + "loss": 2.2576, + "step": 1325 + }, + { + "epoch": 0.5714901876213069, + "eval_loss": 2.1195273399353027, + "eval_runtime": 201.6363, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 1325 + }, + { + "epoch": 0.571921500970455, + "grad_norm": 0.1671893149614334, + "learning_rate": 0.00013772749047860687, + "loss": 2.1635, + "step": 1326 + }, + { + "epoch": 0.5723528143196032, + "grad_norm": 0.15377773344516754, + "learning_rate": 0.00013770877603954827, + "loss": 2.273, + "step": 1327 + }, + { + "epoch": 0.5727841276687513, + "grad_norm": 0.1514381468296051, + "learning_rate": 0.00013769004861595189, + "loss": 2.1513, + "step": 1328 + }, + { + "epoch": 0.5732154410178995, + "grad_norm": 0.16755861043930054, + "learning_rate": 0.0001376713082116953, + "loss": 2.0788, + "step": 1329 + }, + { + "epoch": 0.5736467543670477, + "grad_norm": 0.1689564436674118, + "learning_rate": 0.000137652554830659, + "loss": 2.0674, + "step": 1330 + }, + { + "epoch": 0.5740780677161959, + "grad_norm": 0.16019587218761444, + "learning_rate": 0.0001376337884767261, + "loss": 2.1102, + "step": 1331 + }, + { + "epoch": 0.574509381065344, + "grad_norm": 0.16313177347183228, + "learning_rate": 0.00013761500915378228, + "loss": 2.0397, + "step": 1332 + }, + { + "epoch": 0.5749406944144921, + "grad_norm": 0.183137446641922, + "learning_rate": 0.00013759621686571612, + "loss": 2.2121, + "step": 1333 + }, + { + "epoch": 0.5753720077636403, + "grad_norm": 0.18410666286945343, + "learning_rate": 0.0001375774116164187, + "loss": 2.2207, + "step": 1334 + }, + { + "epoch": 0.5758033211127884, + "grad_norm": 0.16913245618343353, + "learning_rate": 0.00013755859340978388, + "loss": 2.2753, + "step": 1335 + }, + { + "epoch": 0.5762346344619366, + "grad_norm": 0.18499286472797394, + "learning_rate": 0.00013753976224970815, + "loss": 2.0903, + "step": 1336 + }, + { + "epoch": 0.5766659478110847, + "grad_norm": 0.19554150104522705, + "learning_rate": 0.00013752091814009073, + "loss": 2.3823, + "step": 1337 + }, + { + "epoch": 0.5770972611602329, + "grad_norm": 0.2066139131784439, + "learning_rate": 0.0001375020610848335, + "loss": 2.3225, + "step": 1338 + }, + { + "epoch": 0.5775285745093811, + "grad_norm": 0.1761176884174347, + "learning_rate": 0.000137483191087841, + "loss": 2.1969, + "step": 1339 + }, + { + "epoch": 0.5779598878585293, + "grad_norm": 0.1563565880060196, + "learning_rate": 0.00013746430815302047, + "loss": 1.9912, + "step": 1340 + }, + { + "epoch": 0.5783912012076774, + "grad_norm": 0.1895601898431778, + "learning_rate": 0.00013744541228428187, + "loss": 2.1682, + "step": 1341 + }, + { + "epoch": 0.5788225145568255, + "grad_norm": 0.1506771296262741, + "learning_rate": 0.00013742650348553775, + "loss": 1.7265, + "step": 1342 + }, + { + "epoch": 0.5792538279059737, + "grad_norm": 0.1695070117712021, + "learning_rate": 0.0001374075817607034, + "loss": 2.1685, + "step": 1343 + }, + { + "epoch": 0.5796851412551218, + "grad_norm": 0.15319640934467316, + "learning_rate": 0.00013738864711369675, + "loss": 2.0115, + "step": 1344 + }, + { + "epoch": 0.58011645460427, + "grad_norm": 0.15707378089427948, + "learning_rate": 0.00013736969954843847, + "loss": 1.9165, + "step": 1345 + }, + { + "epoch": 0.5805477679534181, + "grad_norm": 0.18513138592243195, + "learning_rate": 0.00013735073906885183, + "loss": 2.0196, + "step": 1346 + }, + { + "epoch": 0.5809790813025664, + "grad_norm": 0.1941574513912201, + "learning_rate": 0.0001373317656788628, + "loss": 2.1579, + "step": 1347 + }, + { + "epoch": 0.5814103946517145, + "grad_norm": 0.1600506603717804, + "learning_rate": 0.00013731277938240004, + "loss": 2.1314, + "step": 1348 + }, + { + "epoch": 0.5818417080008627, + "grad_norm": 0.16690665483474731, + "learning_rate": 0.00013729378018339486, + "loss": 2.5433, + "step": 1349 + }, + { + "epoch": 0.5822730213500108, + "grad_norm": 0.1693698763847351, + "learning_rate": 0.00013727476808578128, + "loss": 2.2022, + "step": 1350 + }, + { + "epoch": 0.5822730213500108, + "eval_loss": 2.118407964706421, + "eval_runtime": 201.4914, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 1350 + }, + { + "epoch": 0.5827043346991589, + "grad_norm": 0.1695820689201355, + "learning_rate": 0.00013725574309349592, + "loss": 2.162, + "step": 1351 + }, + { + "epoch": 0.5831356480483071, + "grad_norm": 0.17078326642513275, + "learning_rate": 0.00013723670521047817, + "loss": 2.1169, + "step": 1352 + }, + { + "epoch": 0.5835669613974552, + "grad_norm": 0.18800406157970428, + "learning_rate": 0.00013721765444066998, + "loss": 2.0955, + "step": 1353 + }, + { + "epoch": 0.5839982747466034, + "grad_norm": 0.1687876284122467, + "learning_rate": 0.00013719859078801603, + "loss": 2.1847, + "step": 1354 + }, + { + "epoch": 0.5844295880957515, + "grad_norm": 0.17234942317008972, + "learning_rate": 0.00013717951425646366, + "loss": 2.0536, + "step": 1355 + }, + { + "epoch": 0.5848609014448998, + "grad_norm": 0.17885074019432068, + "learning_rate": 0.00013716042484996288, + "loss": 2.283, + "step": 1356 + }, + { + "epoch": 0.5852922147940479, + "grad_norm": 0.1754123568534851, + "learning_rate": 0.00013714132257246637, + "loss": 2.3275, + "step": 1357 + }, + { + "epoch": 0.5857235281431961, + "grad_norm": 0.19185900688171387, + "learning_rate": 0.00013712220742792945, + "loss": 2.2428, + "step": 1358 + }, + { + "epoch": 0.5861548414923442, + "grad_norm": 0.16430750489234924, + "learning_rate": 0.00013710307942031014, + "loss": 2.1534, + "step": 1359 + }, + { + "epoch": 0.5865861548414923, + "grad_norm": 0.15442483127117157, + "learning_rate": 0.00013708393855356907, + "loss": 2.2003, + "step": 1360 + }, + { + "epoch": 0.5870174681906405, + "grad_norm": 0.16345712542533875, + "learning_rate": 0.0001370647848316696, + "loss": 2.2425, + "step": 1361 + }, + { + "epoch": 0.5874487815397886, + "grad_norm": 0.22671642899513245, + "learning_rate": 0.0001370456182585777, + "loss": 2.1916, + "step": 1362 + }, + { + "epoch": 0.5878800948889368, + "grad_norm": 0.17455963790416718, + "learning_rate": 0.00013702643883826198, + "loss": 2.0417, + "step": 1363 + }, + { + "epoch": 0.5883114082380849, + "grad_norm": 0.16011476516723633, + "learning_rate": 0.00013700724657469382, + "loss": 2.1057, + "step": 1364 + }, + { + "epoch": 0.5887427215872332, + "grad_norm": 0.14858493208885193, + "learning_rate": 0.00013698804147184717, + "loss": 2.1275, + "step": 1365 + }, + { + "epoch": 0.5891740349363813, + "grad_norm": 0.17408862709999084, + "learning_rate": 0.00013696882353369863, + "loss": 2.1891, + "step": 1366 + }, + { + "epoch": 0.5896053482855295, + "grad_norm": 0.18313269317150116, + "learning_rate": 0.0001369495927642275, + "loss": 2.2143, + "step": 1367 + }, + { + "epoch": 0.5900366616346776, + "grad_norm": 0.1936151534318924, + "learning_rate": 0.00013693034916741572, + "loss": 2.0642, + "step": 1368 + }, + { + "epoch": 0.5904679749838257, + "grad_norm": 0.2034844607114792, + "learning_rate": 0.00013691109274724787, + "loss": 2.1864, + "step": 1369 + }, + { + "epoch": 0.5908992883329739, + "grad_norm": 0.1795257031917572, + "learning_rate": 0.00013689182350771123, + "loss": 2.0779, + "step": 1370 + }, + { + "epoch": 0.591330601682122, + "grad_norm": 0.17389395833015442, + "learning_rate": 0.0001368725414527957, + "loss": 2.2393, + "step": 1371 + }, + { + "epoch": 0.5917619150312702, + "grad_norm": 0.21569593250751495, + "learning_rate": 0.00013685324658649383, + "loss": 2.07, + "step": 1372 + }, + { + "epoch": 0.5921932283804183, + "grad_norm": 0.18595005571842194, + "learning_rate": 0.00013683393891280086, + "loss": 2.2535, + "step": 1373 + }, + { + "epoch": 0.5926245417295666, + "grad_norm": 0.16178612411022186, + "learning_rate": 0.0001368146184357146, + "loss": 2.248, + "step": 1374 + }, + { + "epoch": 0.5930558550787147, + "grad_norm": 0.16927756369113922, + "learning_rate": 0.00013679528515923563, + "loss": 2.0816, + "step": 1375 + }, + { + "epoch": 0.5930558550787147, + "eval_loss": 2.118720054626465, + "eval_runtime": 202.3739, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 1375 + }, + { + "epoch": 0.5934871684278629, + "grad_norm": 0.19910386204719543, + "learning_rate": 0.00013677593908736704, + "loss": 2.2726, + "step": 1376 + }, + { + "epoch": 0.593918481777011, + "grad_norm": 0.22931334376335144, + "learning_rate": 0.00013675658022411474, + "loss": 2.0352, + "step": 1377 + }, + { + "epoch": 0.5943497951261592, + "grad_norm": 0.18184387683868408, + "learning_rate": 0.00013673720857348712, + "loss": 2.1897, + "step": 1378 + }, + { + "epoch": 0.5947811084753073, + "grad_norm": 0.1753135472536087, + "learning_rate": 0.00013671782413949532, + "loss": 2.3276, + "step": 1379 + }, + { + "epoch": 0.5952124218244554, + "grad_norm": 0.17682583630084991, + "learning_rate": 0.0001366984269261531, + "loss": 2.2037, + "step": 1380 + }, + { + "epoch": 0.5956437351736036, + "grad_norm": 0.21315497159957886, + "learning_rate": 0.00013667901693747688, + "loss": 2.2358, + "step": 1381 + }, + { + "epoch": 0.5960750485227517, + "grad_norm": 0.20123091340065002, + "learning_rate": 0.0001366595941774857, + "loss": 2.1988, + "step": 1382 + }, + { + "epoch": 0.5965063618719, + "grad_norm": 0.18664579093456268, + "learning_rate": 0.0001366401586502012, + "loss": 2.0231, + "step": 1383 + }, + { + "epoch": 0.5969376752210481, + "grad_norm": 0.1737365573644638, + "learning_rate": 0.0001366207103596478, + "loss": 2.1643, + "step": 1384 + }, + { + "epoch": 0.5973689885701963, + "grad_norm": 0.22827516496181488, + "learning_rate": 0.00013660124930985244, + "loss": 2.2214, + "step": 1385 + }, + { + "epoch": 0.5978003019193444, + "grad_norm": 0.18609853088855743, + "learning_rate": 0.00013658177550484476, + "loss": 2.1498, + "step": 1386 + }, + { + "epoch": 0.5982316152684926, + "grad_norm": 0.42302677035331726, + "learning_rate": 0.00013656228894865698, + "loss": 2.3019, + "step": 1387 + }, + { + "epoch": 0.5986629286176407, + "grad_norm": 0.1694529801607132, + "learning_rate": 0.00013654278964532409, + "loss": 1.8489, + "step": 1388 + }, + { + "epoch": 0.5990942419667888, + "grad_norm": 0.16748681664466858, + "learning_rate": 0.00013652327759888354, + "loss": 2.1027, + "step": 1389 + }, + { + "epoch": 0.599525555315937, + "grad_norm": 0.19268131256103516, + "learning_rate": 0.00013650375281337558, + "loss": 2.4342, + "step": 1390 + }, + { + "epoch": 0.5999568686650851, + "grad_norm": 0.17943158745765686, + "learning_rate": 0.000136484215292843, + "loss": 2.3367, + "step": 1391 + }, + { + "epoch": 0.6003881820142334, + "grad_norm": 0.20505180954933167, + "learning_rate": 0.00013646466504133124, + "loss": 2.0027, + "step": 1392 + }, + { + "epoch": 0.6008194953633815, + "grad_norm": 0.14545932412147522, + "learning_rate": 0.00013644510206288845, + "loss": 2.1271, + "step": 1393 + }, + { + "epoch": 0.6012508087125297, + "grad_norm": 0.1819576621055603, + "learning_rate": 0.0001364255263615653, + "loss": 2.3828, + "step": 1394 + }, + { + "epoch": 0.6016821220616778, + "grad_norm": 0.1994846612215042, + "learning_rate": 0.00013640593794141522, + "loss": 2.0621, + "step": 1395 + }, + { + "epoch": 0.602113435410826, + "grad_norm": 0.16838571429252625, + "learning_rate": 0.00013638633680649413, + "loss": 2.3161, + "step": 1396 + }, + { + "epoch": 0.6025447487599741, + "grad_norm": 0.16921788454055786, + "learning_rate": 0.0001363667229608607, + "loss": 2.239, + "step": 1397 + }, + { + "epoch": 0.6029760621091222, + "grad_norm": 0.18538618087768555, + "learning_rate": 0.0001363470964085762, + "loss": 2.1616, + "step": 1398 + }, + { + "epoch": 0.6034073754582704, + "grad_norm": 0.19689178466796875, + "learning_rate": 0.0001363274571537045, + "loss": 2.0951, + "step": 1399 + }, + { + "epoch": 0.6038386888074186, + "grad_norm": 0.17458391189575195, + "learning_rate": 0.00013630780520031214, + "loss": 2.2259, + "step": 1400 + }, + { + "epoch": 0.6038386888074186, + "eval_loss": 2.1179585456848145, + "eval_runtime": 202.3652, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 1400 + }, + { + "epoch": 0.6042700021565668, + "grad_norm": 0.17060507833957672, + "learning_rate": 0.00013628814055246824, + "loss": 2.228, + "step": 1401 + }, + { + "epoch": 0.6047013155057149, + "grad_norm": 0.1892073005437851, + "learning_rate": 0.00013626846321424462, + "loss": 2.0956, + "step": 1402 + }, + { + "epoch": 0.6051326288548631, + "grad_norm": 0.33646124601364136, + "learning_rate": 0.00013624877318971565, + "loss": 2.3609, + "step": 1403 + }, + { + "epoch": 0.6055639422040112, + "grad_norm": 0.17053109407424927, + "learning_rate": 0.0001362290704829584, + "loss": 2.0822, + "step": 1404 + }, + { + "epoch": 0.6059952555531594, + "grad_norm": 0.19565366208553314, + "learning_rate": 0.00013620935509805253, + "loss": 2.2828, + "step": 1405 + }, + { + "epoch": 0.6064265689023075, + "grad_norm": 0.34117087721824646, + "learning_rate": 0.0001361896270390803, + "loss": 2.3242, + "step": 1406 + }, + { + "epoch": 0.6068578822514556, + "grad_norm": 0.1925416886806488, + "learning_rate": 0.00013616988631012664, + "loss": 2.2093, + "step": 1407 + }, + { + "epoch": 0.6072891956006038, + "grad_norm": 0.17825543880462646, + "learning_rate": 0.00013615013291527907, + "loss": 2.3198, + "step": 1408 + }, + { + "epoch": 0.607720508949752, + "grad_norm": 0.175543874502182, + "learning_rate": 0.00013613036685862774, + "loss": 2.432, + "step": 1409 + }, + { + "epoch": 0.6081518222989002, + "grad_norm": 0.16355106234550476, + "learning_rate": 0.00013611058814426546, + "loss": 2.2179, + "step": 1410 + }, + { + "epoch": 0.6085831356480483, + "grad_norm": 0.16528934240341187, + "learning_rate": 0.00013609079677628761, + "loss": 2.2397, + "step": 1411 + }, + { + "epoch": 0.6090144489971965, + "grad_norm": 0.17749467492103577, + "learning_rate": 0.0001360709927587922, + "loss": 2.0724, + "step": 1412 + }, + { + "epoch": 0.6094457623463446, + "grad_norm": 0.18812529742717743, + "learning_rate": 0.00013605117609587986, + "loss": 2.1415, + "step": 1413 + }, + { + "epoch": 0.6098770756954928, + "grad_norm": 0.16908733546733856, + "learning_rate": 0.00013603134679165393, + "loss": 2.2852, + "step": 1414 + }, + { + "epoch": 0.6103083890446409, + "grad_norm": 0.18603071570396423, + "learning_rate": 0.00013601150485022018, + "loss": 2.2209, + "step": 1415 + }, + { + "epoch": 0.610739702393789, + "grad_norm": 0.18312253057956696, + "learning_rate": 0.00013599165027568713, + "loss": 2.203, + "step": 1416 + }, + { + "epoch": 0.6111710157429372, + "grad_norm": 0.16831669211387634, + "learning_rate": 0.0001359717830721659, + "loss": 1.9639, + "step": 1417 + }, + { + "epoch": 0.6116023290920855, + "grad_norm": 0.18003444373607635, + "learning_rate": 0.0001359519032437702, + "loss": 2.4113, + "step": 1418 + }, + { + "epoch": 0.6120336424412336, + "grad_norm": 0.18845191597938538, + "learning_rate": 0.0001359320107946164, + "loss": 2.5032, + "step": 1419 + }, + { + "epoch": 0.6124649557903817, + "grad_norm": 0.19460377097129822, + "learning_rate": 0.00013591210572882342, + "loss": 2.2331, + "step": 1420 + }, + { + "epoch": 0.6128962691395299, + "grad_norm": 0.17799149453639984, + "learning_rate": 0.00013589218805051283, + "loss": 2.2636, + "step": 1421 + }, + { + "epoch": 0.613327582488678, + "grad_norm": 0.17384687066078186, + "learning_rate": 0.00013587225776380877, + "loss": 2.0293, + "step": 1422 + }, + { + "epoch": 0.6137588958378262, + "grad_norm": 0.1670762002468109, + "learning_rate": 0.00013585231487283806, + "loss": 2.2104, + "step": 1423 + }, + { + "epoch": 0.6141902091869743, + "grad_norm": 0.1775355190038681, + "learning_rate": 0.00013583235938173011, + "loss": 2.2815, + "step": 1424 + }, + { + "epoch": 0.6146215225361225, + "grad_norm": 0.2014247626066208, + "learning_rate": 0.00013581239129461687, + "loss": 2.0719, + "step": 1425 + }, + { + "epoch": 0.6146215225361225, + "eval_loss": 2.1178596019744873, + "eval_runtime": 202.7019, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 1425 + }, + { + "epoch": 0.6150528358852706, + "grad_norm": 0.16708634793758392, + "learning_rate": 0.000135792410615633, + "loss": 2.1602, + "step": 1426 + }, + { + "epoch": 0.6154841492344189, + "grad_norm": 0.16631104052066803, + "learning_rate": 0.0001357724173489157, + "loss": 2.3131, + "step": 1427 + }, + { + "epoch": 0.615915462583567, + "grad_norm": 0.1849827915430069, + "learning_rate": 0.00013575241149860477, + "loss": 2.2917, + "step": 1428 + }, + { + "epoch": 0.6163467759327151, + "grad_norm": 0.18862168490886688, + "learning_rate": 0.00013573239306884267, + "loss": 2.1622, + "step": 1429 + }, + { + "epoch": 0.6167780892818633, + "grad_norm": 0.18775883316993713, + "learning_rate": 0.00013571236206377443, + "loss": 2.232, + "step": 1430 + }, + { + "epoch": 0.6172094026310114, + "grad_norm": 0.17551693320274353, + "learning_rate": 0.00013569231848754767, + "loss": 2.1698, + "step": 1431 + }, + { + "epoch": 0.6176407159801596, + "grad_norm": 0.19101572036743164, + "learning_rate": 0.00013567226234431268, + "loss": 2.3313, + "step": 1432 + }, + { + "epoch": 0.6180720293293077, + "grad_norm": 0.22161993384361267, + "learning_rate": 0.00013565219363822226, + "loss": 1.9818, + "step": 1433 + }, + { + "epoch": 0.6185033426784559, + "grad_norm": 0.15873490273952484, + "learning_rate": 0.00013563211237343185, + "loss": 2.1122, + "step": 1434 + }, + { + "epoch": 0.618934656027604, + "grad_norm": 0.20279757678508759, + "learning_rate": 0.00013561201855409952, + "loss": 2.2819, + "step": 1435 + }, + { + "epoch": 0.6193659693767523, + "grad_norm": 0.18889349699020386, + "learning_rate": 0.0001355919121843859, + "loss": 2.4155, + "step": 1436 + }, + { + "epoch": 0.6197972827259004, + "grad_norm": 0.1641744077205658, + "learning_rate": 0.00013557179326845428, + "loss": 2.2855, + "step": 1437 + }, + { + "epoch": 0.6202285960750485, + "grad_norm": 0.20285926759243011, + "learning_rate": 0.00013555166181047042, + "loss": 2.2571, + "step": 1438 + }, + { + "epoch": 0.6206599094241967, + "grad_norm": 0.1879100352525711, + "learning_rate": 0.00013553151781460282, + "loss": 2.0196, + "step": 1439 + }, + { + "epoch": 0.6210912227733448, + "grad_norm": 0.20289397239685059, + "learning_rate": 0.00013551136128502249, + "loss": 2.3061, + "step": 1440 + }, + { + "epoch": 0.621522536122493, + "grad_norm": 0.15986093878746033, + "learning_rate": 0.00013549119222590306, + "loss": 2.4075, + "step": 1441 + }, + { + "epoch": 0.6219538494716411, + "grad_norm": 0.17021022737026215, + "learning_rate": 0.00013547101064142073, + "loss": 2.1968, + "step": 1442 + }, + { + "epoch": 0.6223851628207893, + "grad_norm": 0.18939296901226044, + "learning_rate": 0.00013545081653575442, + "loss": 2.1435, + "step": 1443 + }, + { + "epoch": 0.6228164761699374, + "grad_norm": 0.1589539647102356, + "learning_rate": 0.0001354306099130854, + "loss": 2.0662, + "step": 1444 + }, + { + "epoch": 0.6232477895190857, + "grad_norm": 0.17256878316402435, + "learning_rate": 0.00013541039077759776, + "loss": 2.1539, + "step": 1445 + }, + { + "epoch": 0.6236791028682338, + "grad_norm": 0.16598886251449585, + "learning_rate": 0.00013539015913347809, + "loss": 2.1643, + "step": 1446 + }, + { + "epoch": 0.624110416217382, + "grad_norm": 0.1731709986925125, + "learning_rate": 0.00013536991498491554, + "loss": 2.2187, + "step": 1447 + }, + { + "epoch": 0.6245417295665301, + "grad_norm": 0.21180196106433868, + "learning_rate": 0.0001353496583361019, + "loss": 2.2559, + "step": 1448 + }, + { + "epoch": 0.6249730429156782, + "grad_norm": 0.18084551393985748, + "learning_rate": 0.00013532938919123152, + "loss": 2.3106, + "step": 1449 + }, + { + "epoch": 0.6254043562648264, + "grad_norm": 0.18181219696998596, + "learning_rate": 0.00013530910755450137, + "loss": 2.1849, + "step": 1450 + }, + { + "epoch": 0.6254043562648264, + "eval_loss": 2.1173229217529297, + "eval_runtime": 201.5491, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 1450 + }, + { + "epoch": 0.6258356696139745, + "grad_norm": 0.2053252011537552, + "learning_rate": 0.00013528881343011096, + "loss": 2.3693, + "step": 1451 + }, + { + "epoch": 0.6262669829631227, + "grad_norm": 0.20996972918510437, + "learning_rate": 0.00013526850682226243, + "loss": 2.2502, + "step": 1452 + }, + { + "epoch": 0.6266982963122709, + "grad_norm": 0.1840354949235916, + "learning_rate": 0.00013524818773516048, + "loss": 2.2374, + "step": 1453 + }, + { + "epoch": 0.6271296096614191, + "grad_norm": 0.1774517297744751, + "learning_rate": 0.0001352278561730124, + "loss": 2.1423, + "step": 1454 + }, + { + "epoch": 0.6275609230105672, + "grad_norm": 0.20158281922340393, + "learning_rate": 0.00013520751214002804, + "loss": 2.2852, + "step": 1455 + }, + { + "epoch": 0.6279922363597154, + "grad_norm": 0.15398597717285156, + "learning_rate": 0.0001351871556404199, + "loss": 2.2491, + "step": 1456 + }, + { + "epoch": 0.6284235497088635, + "grad_norm": 0.17070600390434265, + "learning_rate": 0.00013516678667840296, + "loss": 2.1026, + "step": 1457 + }, + { + "epoch": 0.6288548630580116, + "grad_norm": 0.1867612600326538, + "learning_rate": 0.0001351464052581949, + "loss": 2.1568, + "step": 1458 + }, + { + "epoch": 0.6292861764071598, + "grad_norm": 0.16975905001163483, + "learning_rate": 0.00013512601138401583, + "loss": 2.0452, + "step": 1459 + }, + { + "epoch": 0.6297174897563079, + "grad_norm": 0.17002518475055695, + "learning_rate": 0.00013510560506008862, + "loss": 2.1918, + "step": 1460 + }, + { + "epoch": 0.6301488031054561, + "grad_norm": 0.17760762572288513, + "learning_rate": 0.00013508518629063858, + "loss": 2.1783, + "step": 1461 + }, + { + "epoch": 0.6305801164546043, + "grad_norm": 0.259189248085022, + "learning_rate": 0.0001350647550798936, + "loss": 2.3244, + "step": 1462 + }, + { + "epoch": 0.6310114298037525, + "grad_norm": 0.1676439344882965, + "learning_rate": 0.00013504431143208424, + "loss": 2.22, + "step": 1463 + }, + { + "epoch": 0.6314427431529006, + "grad_norm": 0.15792736411094666, + "learning_rate": 0.0001350238553514436, + "loss": 2.2445, + "step": 1464 + }, + { + "epoch": 0.6318740565020488, + "grad_norm": 0.1680126190185547, + "learning_rate": 0.00013500338684220727, + "loss": 2.3718, + "step": 1465 + }, + { + "epoch": 0.6323053698511969, + "grad_norm": 0.16542601585388184, + "learning_rate": 0.00013498290590861352, + "loss": 2.3419, + "step": 1466 + }, + { + "epoch": 0.632736683200345, + "grad_norm": 0.17210273444652557, + "learning_rate": 0.0001349624125549031, + "loss": 2.2629, + "step": 1467 + }, + { + "epoch": 0.6331679965494932, + "grad_norm": 0.1726984828710556, + "learning_rate": 0.00013494190678531946, + "loss": 2.4776, + "step": 1468 + }, + { + "epoch": 0.6335993098986413, + "grad_norm": 0.16882048547267914, + "learning_rate": 0.0001349213886041085, + "loss": 2.1406, + "step": 1469 + }, + { + "epoch": 0.6340306232477895, + "grad_norm": 0.1476915329694748, + "learning_rate": 0.00013490085801551874, + "loss": 1.9109, + "step": 1470 + }, + { + "epoch": 0.6344619365969377, + "grad_norm": 0.1599586308002472, + "learning_rate": 0.00013488031502380127, + "loss": 2.2298, + "step": 1471 + }, + { + "epoch": 0.6348932499460859, + "grad_norm": 0.22215233743190765, + "learning_rate": 0.00013485975963320972, + "loss": 2.2152, + "step": 1472 + }, + { + "epoch": 0.635324563295234, + "grad_norm": 0.17221570014953613, + "learning_rate": 0.00013483919184800034, + "loss": 2.2218, + "step": 1473 + }, + { + "epoch": 0.6357558766443822, + "grad_norm": 0.17482635378837585, + "learning_rate": 0.0001348186116724319, + "loss": 2.4065, + "step": 1474 + }, + { + "epoch": 0.6361871899935303, + "grad_norm": 0.17080114781856537, + "learning_rate": 0.00013479801911076572, + "loss": 2.2043, + "step": 1475 + }, + { + "epoch": 0.6361871899935303, + "eval_loss": 2.1166903972625732, + "eval_runtime": 202.3179, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 1475 + }, + { + "epoch": 0.6366185033426784, + "grad_norm": 0.18259508907794952, + "learning_rate": 0.00013477741416726574, + "loss": 2.2346, + "step": 1476 + }, + { + "epoch": 0.6370498166918266, + "grad_norm": 0.17076414823532104, + "learning_rate": 0.00013475679684619846, + "loss": 2.1289, + "step": 1477 + }, + { + "epoch": 0.6374811300409747, + "grad_norm": 0.16617844998836517, + "learning_rate": 0.0001347361671518329, + "loss": 2.2685, + "step": 1478 + }, + { + "epoch": 0.6379124433901229, + "grad_norm": 0.1720925122499466, + "learning_rate": 0.00013471552508844068, + "loss": 2.2616, + "step": 1479 + }, + { + "epoch": 0.6383437567392711, + "grad_norm": 0.18244807422161102, + "learning_rate": 0.00013469487066029596, + "loss": 2.2319, + "step": 1480 + }, + { + "epoch": 0.6387750700884193, + "grad_norm": 0.16281767189502716, + "learning_rate": 0.00013467420387167542, + "loss": 2.2527, + "step": 1481 + }, + { + "epoch": 0.6392063834375674, + "grad_norm": 0.1821095198392868, + "learning_rate": 0.00013465352472685839, + "loss": 2.2209, + "step": 1482 + }, + { + "epoch": 0.6396376967867156, + "grad_norm": 0.21405041217803955, + "learning_rate": 0.00013463283323012672, + "loss": 2.169, + "step": 1483 + }, + { + "epoch": 0.6400690101358637, + "grad_norm": 0.17871929705142975, + "learning_rate": 0.0001346121293857648, + "loss": 2.2038, + "step": 1484 + }, + { + "epoch": 0.6405003234850118, + "grad_norm": 0.15631656348705292, + "learning_rate": 0.0001345914131980596, + "loss": 2.2222, + "step": 1485 + }, + { + "epoch": 0.64093163683416, + "grad_norm": 0.15821655094623566, + "learning_rate": 0.00013457068467130057, + "loss": 2.2539, + "step": 1486 + }, + { + "epoch": 0.6413629501833081, + "grad_norm": 0.16861869394779205, + "learning_rate": 0.00013454994380977987, + "loss": 1.9289, + "step": 1487 + }, + { + "epoch": 0.6417942635324563, + "grad_norm": 0.1654800921678543, + "learning_rate": 0.00013452919061779204, + "loss": 2.3163, + "step": 1488 + }, + { + "epoch": 0.6422255768816045, + "grad_norm": 0.1614343225955963, + "learning_rate": 0.00013450842509963432, + "loss": 2.3947, + "step": 1489 + }, + { + "epoch": 0.6426568902307527, + "grad_norm": 0.17356038093566895, + "learning_rate": 0.0001344876472596064, + "loss": 2.174, + "step": 1490 + }, + { + "epoch": 0.6430882035799008, + "grad_norm": 0.1758730709552765, + "learning_rate": 0.00013446685710201056, + "loss": 2.1513, + "step": 1491 + }, + { + "epoch": 0.643519516929049, + "grad_norm": 0.1510227918624878, + "learning_rate": 0.00013444605463115163, + "loss": 1.8994, + "step": 1492 + }, + { + "epoch": 0.6439508302781971, + "grad_norm": 0.17163372039794922, + "learning_rate": 0.00013442523985133704, + "loss": 1.8301, + "step": 1493 + }, + { + "epoch": 0.6443821436273452, + "grad_norm": 0.1685454398393631, + "learning_rate": 0.00013440441276687664, + "loss": 2.2378, + "step": 1494 + }, + { + "epoch": 0.6448134569764934, + "grad_norm": 0.16419187188148499, + "learning_rate": 0.00013438357338208293, + "loss": 2.3511, + "step": 1495 + }, + { + "epoch": 0.6452447703256415, + "grad_norm": 0.17738017439842224, + "learning_rate": 0.000134362721701271, + "loss": 2.3097, + "step": 1496 + }, + { + "epoch": 0.6456760836747897, + "grad_norm": 0.1585911363363266, + "learning_rate": 0.0001343418577287583, + "loss": 2.2102, + "step": 1497 + }, + { + "epoch": 0.6461073970239379, + "grad_norm": 0.18356698751449585, + "learning_rate": 0.00013432098146886506, + "loss": 2.1842, + "step": 1498 + }, + { + "epoch": 0.6465387103730861, + "grad_norm": 0.17388349771499634, + "learning_rate": 0.00013430009292591386, + "loss": 2.1864, + "step": 1499 + }, + { + "epoch": 0.6469700237222342, + "grad_norm": 0.17987026274204254, + "learning_rate": 0.0001342791921042299, + "loss": 2.0218, + "step": 1500 + }, + { + "epoch": 0.6469700237222342, + "eval_loss": 2.1155967712402344, + "eval_runtime": 201.8593, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 1500 + }, + { + "epoch": 0.6474013370713824, + "grad_norm": 0.1733091026544571, + "learning_rate": 0.00013425827900814097, + "loss": 2.4174, + "step": 1501 + }, + { + "epoch": 0.6478326504205305, + "grad_norm": 0.19367888569831848, + "learning_rate": 0.00013423735364197735, + "loss": 2.1148, + "step": 1502 + }, + { + "epoch": 0.6482639637696787, + "grad_norm": 0.1763000339269638, + "learning_rate": 0.0001342164160100718, + "loss": 2.4165, + "step": 1503 + }, + { + "epoch": 0.6486952771188268, + "grad_norm": 0.17568454146385193, + "learning_rate": 0.00013419546611675979, + "loss": 2.2303, + "step": 1504 + }, + { + "epoch": 0.6491265904679749, + "grad_norm": 0.18122456967830658, + "learning_rate": 0.00013417450396637913, + "loss": 2.2891, + "step": 1505 + }, + { + "epoch": 0.6495579038171232, + "grad_norm": 0.17672553658485413, + "learning_rate": 0.00013415352956327033, + "loss": 2.1847, + "step": 1506 + }, + { + "epoch": 0.6499892171662713, + "grad_norm": 0.1894833892583847, + "learning_rate": 0.00013413254291177632, + "loss": 2.3146, + "step": 1507 + }, + { + "epoch": 0.6504205305154195, + "grad_norm": 0.16244618594646454, + "learning_rate": 0.00013411154401624266, + "loss": 2.2783, + "step": 1508 + }, + { + "epoch": 0.6508518438645676, + "grad_norm": 0.1534096598625183, + "learning_rate": 0.00013409053288101735, + "loss": 2.0793, + "step": 1509 + }, + { + "epoch": 0.6512831572137158, + "grad_norm": 0.16889546811580658, + "learning_rate": 0.00013406950951045104, + "loss": 2.1439, + "step": 1510 + }, + { + "epoch": 0.6517144705628639, + "grad_norm": 0.21417348086833954, + "learning_rate": 0.0001340484739088968, + "loss": 2.2297, + "step": 1511 + }, + { + "epoch": 0.6521457839120121, + "grad_norm": 0.16018380224704742, + "learning_rate": 0.0001340274260807103, + "loss": 2.1571, + "step": 1512 + }, + { + "epoch": 0.6525770972611602, + "grad_norm": 0.18208736181259155, + "learning_rate": 0.0001340063660302497, + "loss": 2.1953, + "step": 1513 + }, + { + "epoch": 0.6530084106103083, + "grad_norm": 0.1658926159143448, + "learning_rate": 0.0001339852937618757, + "loss": 2.2122, + "step": 1514 + }, + { + "epoch": 0.6534397239594566, + "grad_norm": 0.14056910574436188, + "learning_rate": 0.0001339642092799516, + "loss": 2.18, + "step": 1515 + }, + { + "epoch": 0.6538710373086047, + "grad_norm": 0.1788419485092163, + "learning_rate": 0.0001339431125888432, + "loss": 2.2319, + "step": 1516 + }, + { + "epoch": 0.6543023506577529, + "grad_norm": 0.1722828596830368, + "learning_rate": 0.0001339220036929187, + "loss": 2.3425, + "step": 1517 + }, + { + "epoch": 0.654733664006901, + "grad_norm": 0.2073151171207428, + "learning_rate": 0.00013390088259654896, + "loss": 2.1697, + "step": 1518 + }, + { + "epoch": 0.6551649773560492, + "grad_norm": 0.1691809594631195, + "learning_rate": 0.0001338797493041074, + "loss": 2.4019, + "step": 1519 + }, + { + "epoch": 0.6555962907051973, + "grad_norm": 0.16326852142810822, + "learning_rate": 0.0001338586038199698, + "loss": 2.1642, + "step": 1520 + }, + { + "epoch": 0.6560276040543455, + "grad_norm": 0.16764166951179504, + "learning_rate": 0.0001338374461485146, + "loss": 2.2035, + "step": 1521 + }, + { + "epoch": 0.6564589174034936, + "grad_norm": 0.16470211744308472, + "learning_rate": 0.0001338162762941228, + "loss": 2.1875, + "step": 1522 + }, + { + "epoch": 0.6568902307526417, + "grad_norm": 0.149167001247406, + "learning_rate": 0.00013379509426117775, + "loss": 2.0018, + "step": 1523 + }, + { + "epoch": 0.65732154410179, + "grad_norm": 0.16698133945465088, + "learning_rate": 0.00013377390005406547, + "loss": 2.1576, + "step": 1524 + }, + { + "epoch": 0.6577528574509381, + "grad_norm": 0.1746443808078766, + "learning_rate": 0.00013375269367717443, + "loss": 2.2006, + "step": 1525 + }, + { + "epoch": 0.6577528574509381, + "eval_loss": 2.115690231323242, + "eval_runtime": 212.2087, + "eval_samples_per_second": 0.151, + "eval_steps_per_second": 0.151, + "step": 1525 + }, + { + "epoch": 0.6581841708000863, + "grad_norm": 0.17736956477165222, + "learning_rate": 0.00013373147513489567, + "loss": 2.1085, + "step": 1526 + }, + { + "epoch": 0.6586154841492344, + "grad_norm": 0.19343900680541992, + "learning_rate": 0.00013371024443162272, + "loss": 2.078, + "step": 1527 + }, + { + "epoch": 0.6590467974983826, + "grad_norm": 0.1655140072107315, + "learning_rate": 0.0001336890015717516, + "loss": 2.1921, + "step": 1528 + }, + { + "epoch": 0.6594781108475307, + "grad_norm": 0.1496494710445404, + "learning_rate": 0.00013366774655968088, + "loss": 2.1519, + "step": 1529 + }, + { + "epoch": 0.6599094241966789, + "grad_norm": 0.18701104819774628, + "learning_rate": 0.0001336464793998117, + "loss": 2.2564, + "step": 1530 + }, + { + "epoch": 0.660340737545827, + "grad_norm": 0.16007700562477112, + "learning_rate": 0.00013362520009654755, + "loss": 2.1806, + "step": 1531 + }, + { + "epoch": 0.6607720508949751, + "grad_norm": 0.19236847758293152, + "learning_rate": 0.00013360390865429464, + "loss": 2.1388, + "step": 1532 + }, + { + "epoch": 0.6612033642441234, + "grad_norm": 0.18432816863059998, + "learning_rate": 0.00013358260507746156, + "loss": 2.3327, + "step": 1533 + }, + { + "epoch": 0.6616346775932715, + "grad_norm": 0.16497185826301575, + "learning_rate": 0.00013356128937045946, + "loss": 2.1085, + "step": 1534 + }, + { + "epoch": 0.6620659909424197, + "grad_norm": 0.1854122281074524, + "learning_rate": 0.00013353996153770196, + "loss": 2.2072, + "step": 1535 + }, + { + "epoch": 0.6624973042915678, + "grad_norm": 0.17767350375652313, + "learning_rate": 0.00013351862158360526, + "loss": 2.2957, + "step": 1536 + }, + { + "epoch": 0.662928617640716, + "grad_norm": 0.1622297763824463, + "learning_rate": 0.000133497269512588, + "loss": 2.2327, + "step": 1537 + }, + { + "epoch": 0.6633599309898641, + "grad_norm": 0.17678019404411316, + "learning_rate": 0.00013347590532907137, + "loss": 2.3593, + "step": 1538 + }, + { + "epoch": 0.6637912443390123, + "grad_norm": 0.16748957335948944, + "learning_rate": 0.00013345452903747905, + "loss": 1.9395, + "step": 1539 + }, + { + "epoch": 0.6642225576881604, + "grad_norm": 0.17669428884983063, + "learning_rate": 0.00013343314064223727, + "loss": 2.1601, + "step": 1540 + }, + { + "epoch": 0.6646538710373086, + "grad_norm": 0.17978622019290924, + "learning_rate": 0.0001334117401477747, + "loss": 2.1725, + "step": 1541 + }, + { + "epoch": 0.6650851843864568, + "grad_norm": 0.1864762157201767, + "learning_rate": 0.00013339032755852256, + "loss": 2.46, + "step": 1542 + }, + { + "epoch": 0.665516497735605, + "grad_norm": 0.1701022833585739, + "learning_rate": 0.00013336890287891454, + "loss": 2.105, + "step": 1543 + }, + { + "epoch": 0.6659478110847531, + "grad_norm": 0.14923730492591858, + "learning_rate": 0.00013334746611338687, + "loss": 2.1351, + "step": 1544 + }, + { + "epoch": 0.6663791244339012, + "grad_norm": 0.14656859636306763, + "learning_rate": 0.00013332601726637826, + "loss": 2.0266, + "step": 1545 + }, + { + "epoch": 0.6668104377830494, + "grad_norm": 0.1627119928598404, + "learning_rate": 0.00013330455634232992, + "loss": 2.1563, + "step": 1546 + }, + { + "epoch": 0.6672417511321975, + "grad_norm": 0.20333190262317657, + "learning_rate": 0.0001332830833456856, + "loss": 2.242, + "step": 1547 + }, + { + "epoch": 0.6676730644813457, + "grad_norm": 0.19267724454402924, + "learning_rate": 0.00013326159828089151, + "loss": 2.2732, + "step": 1548 + }, + { + "epoch": 0.6681043778304938, + "grad_norm": 0.19448162615299225, + "learning_rate": 0.00013324010115239635, + "loss": 2.1141, + "step": 1549 + }, + { + "epoch": 0.668535691179642, + "grad_norm": 0.161507710814476, + "learning_rate": 0.00013321859196465134, + "loss": 2.1465, + "step": 1550 + }, + { + "epoch": 0.668535691179642, + "eval_loss": 2.1159982681274414, + "eval_runtime": 202.5102, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 1550 + }, + { + "epoch": 0.6689670045287902, + "grad_norm": 0.18547660112380981, + "learning_rate": 0.00013319707072211018, + "loss": 2.0345, + "step": 1551 + }, + { + "epoch": 0.6693983178779384, + "grad_norm": 0.1630726009607315, + "learning_rate": 0.00013317553742922913, + "loss": 2.2126, + "step": 1552 + }, + { + "epoch": 0.6698296312270865, + "grad_norm": 0.196413055062294, + "learning_rate": 0.00013315399209046683, + "loss": 2.2386, + "step": 1553 + }, + { + "epoch": 0.6702609445762346, + "grad_norm": 0.17378070950508118, + "learning_rate": 0.0001331324347102845, + "loss": 2.0881, + "step": 1554 + }, + { + "epoch": 0.6706922579253828, + "grad_norm": 0.17834343016147614, + "learning_rate": 0.00013311086529314587, + "loss": 2.2791, + "step": 1555 + }, + { + "epoch": 0.6711235712745309, + "grad_norm": 0.16736628115177155, + "learning_rate": 0.00013308928384351708, + "loss": 2.323, + "step": 1556 + }, + { + "epoch": 0.6715548846236791, + "grad_norm": 0.19940096139907837, + "learning_rate": 0.0001330676903658668, + "loss": 1.9494, + "step": 1557 + }, + { + "epoch": 0.6719861979728272, + "grad_norm": 0.1720159649848938, + "learning_rate": 0.00013304608486466624, + "loss": 2.3472, + "step": 1558 + }, + { + "epoch": 0.6724175113219755, + "grad_norm": 0.18553723394870758, + "learning_rate": 0.000133024467344389, + "loss": 2.3026, + "step": 1559 + }, + { + "epoch": 0.6728488246711236, + "grad_norm": 0.1666479855775833, + "learning_rate": 0.00013300283780951123, + "loss": 2.3268, + "step": 1560 + }, + { + "epoch": 0.6732801380202718, + "grad_norm": 0.16056552529335022, + "learning_rate": 0.00013298119626451162, + "loss": 2.144, + "step": 1561 + }, + { + "epoch": 0.6737114513694199, + "grad_norm": 0.17346884310245514, + "learning_rate": 0.00013295954271387128, + "loss": 2.0181, + "step": 1562 + }, + { + "epoch": 0.674142764718568, + "grad_norm": 0.187408909201622, + "learning_rate": 0.00013293787716207373, + "loss": 2.3228, + "step": 1563 + }, + { + "epoch": 0.6745740780677162, + "grad_norm": 0.16592130064964294, + "learning_rate": 0.00013291619961360514, + "loss": 2.0589, + "step": 1564 + }, + { + "epoch": 0.6750053914168643, + "grad_norm": 0.15113787353038788, + "learning_rate": 0.00013289451007295405, + "loss": 2.0183, + "step": 1565 + }, + { + "epoch": 0.6754367047660125, + "grad_norm": 0.17211243510246277, + "learning_rate": 0.00013287280854461152, + "loss": 2.0831, + "step": 1566 + }, + { + "epoch": 0.6758680181151606, + "grad_norm": 0.17111659049987793, + "learning_rate": 0.00013285109503307114, + "loss": 2.1257, + "step": 1567 + }, + { + "epoch": 0.6762993314643089, + "grad_norm": 0.166707381606102, + "learning_rate": 0.00013282936954282884, + "loss": 2.1942, + "step": 1568 + }, + { + "epoch": 0.676730644813457, + "grad_norm": 0.14764513075351715, + "learning_rate": 0.00013280763207838315, + "loss": 2.2528, + "step": 1569 + }, + { + "epoch": 0.6771619581626052, + "grad_norm": 0.207320898771286, + "learning_rate": 0.0001327858826442351, + "loss": 2.0984, + "step": 1570 + }, + { + "epoch": 0.6775932715117533, + "grad_norm": 0.16856639087200165, + "learning_rate": 0.0001327641212448881, + "loss": 2.4237, + "step": 1571 + }, + { + "epoch": 0.6780245848609014, + "grad_norm": 0.17226271331310272, + "learning_rate": 0.00013274234788484814, + "loss": 2.0749, + "step": 1572 + }, + { + "epoch": 0.6784558982100496, + "grad_norm": 0.17077532410621643, + "learning_rate": 0.00013272056256862354, + "loss": 2.3413, + "step": 1573 + }, + { + "epoch": 0.6788872115591977, + "grad_norm": 0.17257219552993774, + "learning_rate": 0.00013269876530072524, + "loss": 2.2468, + "step": 1574 + }, + { + "epoch": 0.6793185249083459, + "grad_norm": 0.18015973269939423, + "learning_rate": 0.0001326769560856666, + "loss": 2.028, + "step": 1575 + }, + { + "epoch": 0.6793185249083459, + "eval_loss": 2.115028142929077, + "eval_runtime": 199.1239, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 1575 + }, + { + "epoch": 0.679749838257494, + "grad_norm": 0.15598492324352264, + "learning_rate": 0.0001326551349279635, + "loss": 2.2803, + "step": 1576 + }, + { + "epoch": 0.6801811516066423, + "grad_norm": 0.17086711525917053, + "learning_rate": 0.00013263330183213419, + "loss": 2.2152, + "step": 1577 + }, + { + "epoch": 0.6806124649557904, + "grad_norm": 0.14766903221607208, + "learning_rate": 0.00013261145680269943, + "loss": 2.1492, + "step": 1578 + }, + { + "epoch": 0.6810437783049386, + "grad_norm": 0.20790471136569977, + "learning_rate": 0.00013258959984418253, + "loss": 2.0664, + "step": 1579 + }, + { + "epoch": 0.6814750916540867, + "grad_norm": 0.15962834656238556, + "learning_rate": 0.0001325677309611092, + "loss": 2.061, + "step": 1580 + }, + { + "epoch": 0.6819064050032349, + "grad_norm": 0.2015094757080078, + "learning_rate": 0.00013254585015800763, + "loss": 2.1781, + "step": 1581 + }, + { + "epoch": 0.682337718352383, + "grad_norm": 0.15662479400634766, + "learning_rate": 0.0001325239574394085, + "loss": 1.9441, + "step": 1582 + }, + { + "epoch": 0.6827690317015311, + "grad_norm": 0.17095041275024414, + "learning_rate": 0.00013250205280984485, + "loss": 2.3391, + "step": 1583 + }, + { + "epoch": 0.6832003450506793, + "grad_norm": 0.17483989894390106, + "learning_rate": 0.0001324801362738524, + "loss": 2.1489, + "step": 1584 + }, + { + "epoch": 0.6836316583998274, + "grad_norm": 0.15106312930583954, + "learning_rate": 0.00013245820783596907, + "loss": 2.1818, + "step": 1585 + }, + { + "epoch": 0.6840629717489757, + "grad_norm": 0.1592348963022232, + "learning_rate": 0.0001324362675007355, + "loss": 2.051, + "step": 1586 + }, + { + "epoch": 0.6844942850981238, + "grad_norm": 0.1669055074453354, + "learning_rate": 0.00013241431527269464, + "loss": 2.3425, + "step": 1587 + }, + { + "epoch": 0.684925598447272, + "grad_norm": 0.15657608211040497, + "learning_rate": 0.00013239235115639192, + "loss": 2.2442, + "step": 1588 + }, + { + "epoch": 0.6853569117964201, + "grad_norm": 0.17330090701580048, + "learning_rate": 0.00013237037515637524, + "loss": 2.1638, + "step": 1589 + }, + { + "epoch": 0.6857882251455683, + "grad_norm": 0.16446971893310547, + "learning_rate": 0.000132348387277195, + "loss": 2.1034, + "step": 1590 + }, + { + "epoch": 0.6862195384947164, + "grad_norm": 0.1576288938522339, + "learning_rate": 0.00013232638752340404, + "loss": 2.0789, + "step": 1591 + }, + { + "epoch": 0.6866508518438645, + "grad_norm": 0.18883441388607025, + "learning_rate": 0.0001323043758995576, + "loss": 2.2768, + "step": 1592 + }, + { + "epoch": 0.6870821651930127, + "grad_norm": 0.1743883192539215, + "learning_rate": 0.0001322823524102135, + "loss": 2.2914, + "step": 1593 + }, + { + "epoch": 0.6875134785421608, + "grad_norm": 0.16601242125034332, + "learning_rate": 0.0001322603170599319, + "loss": 2.2557, + "step": 1594 + }, + { + "epoch": 0.6879447918913091, + "grad_norm": 0.19149911403656006, + "learning_rate": 0.00013223826985327543, + "loss": 2.3684, + "step": 1595 + }, + { + "epoch": 0.6883761052404572, + "grad_norm": 0.16588738560676575, + "learning_rate": 0.00013221621079480926, + "loss": 2.165, + "step": 1596 + }, + { + "epoch": 0.6888074185896054, + "grad_norm": 0.1697489619255066, + "learning_rate": 0.00013219413988910096, + "loss": 2.2294, + "step": 1597 + }, + { + "epoch": 0.6892387319387535, + "grad_norm": 0.16599583625793457, + "learning_rate": 0.0001321720571407205, + "loss": 2.2237, + "step": 1598 + }, + { + "epoch": 0.6896700452879017, + "grad_norm": 0.16053009033203125, + "learning_rate": 0.00013214996255424039, + "loss": 2.1803, + "step": 1599 + }, + { + "epoch": 0.6901013586370498, + "grad_norm": 0.18454283475875854, + "learning_rate": 0.00013212785613423558, + "loss": 2.3049, + "step": 1600 + }, + { + "epoch": 0.6901013586370498, + "eval_loss": 2.1139638423919678, + "eval_runtime": 198.9309, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 1600 + }, + { + "epoch": 0.6905326719861979, + "grad_norm": 0.16557788848876953, + "learning_rate": 0.00013210573788528338, + "loss": 2.0268, + "step": 1601 + }, + { + "epoch": 0.6909639853353461, + "grad_norm": 0.18790724873542786, + "learning_rate": 0.00013208360781196367, + "loss": 2.3526, + "step": 1602 + }, + { + "epoch": 0.6913952986844942, + "grad_norm": 0.19259871542453766, + "learning_rate": 0.0001320614659188587, + "loss": 2.0988, + "step": 1603 + }, + { + "epoch": 0.6918266120336425, + "grad_norm": 0.15778112411499023, + "learning_rate": 0.0001320393122105532, + "loss": 2.2377, + "step": 1604 + }, + { + "epoch": 0.6922579253827906, + "grad_norm": 0.18303173780441284, + "learning_rate": 0.00013201714669163435, + "loss": 2.2089, + "step": 1605 + }, + { + "epoch": 0.6926892387319388, + "grad_norm": 0.17680174112319946, + "learning_rate": 0.00013199496936669176, + "loss": 2.3218, + "step": 1606 + }, + { + "epoch": 0.6931205520810869, + "grad_norm": 0.17215006053447723, + "learning_rate": 0.00013197278024031745, + "loss": 2.1908, + "step": 1607 + }, + { + "epoch": 0.6935518654302351, + "grad_norm": 0.16192790865898132, + "learning_rate": 0.00013195057931710593, + "loss": 2.1841, + "step": 1608 + }, + { + "epoch": 0.6939831787793832, + "grad_norm": 0.14507627487182617, + "learning_rate": 0.0001319283666016542, + "loss": 2.0118, + "step": 1609 + }, + { + "epoch": 0.6944144921285313, + "grad_norm": 0.18748687207698822, + "learning_rate": 0.00013190614209856156, + "loss": 2.4215, + "step": 1610 + }, + { + "epoch": 0.6948458054776795, + "grad_norm": 0.17146620154380798, + "learning_rate": 0.00013188390581242993, + "loss": 2.1129, + "step": 1611 + }, + { + "epoch": 0.6952771188268277, + "grad_norm": 0.16766369342803955, + "learning_rate": 0.00013186165774786352, + "loss": 2.1973, + "step": 1612 + }, + { + "epoch": 0.6957084321759759, + "grad_norm": 0.17402087152004242, + "learning_rate": 0.00013183939790946901, + "loss": 2.1548, + "step": 1613 + }, + { + "epoch": 0.696139745525124, + "grad_norm": 0.1601400375366211, + "learning_rate": 0.00013181712630185564, + "loss": 2.0221, + "step": 1614 + }, + { + "epoch": 0.6965710588742722, + "grad_norm": 0.1470576971769333, + "learning_rate": 0.00013179484292963488, + "loss": 2.1986, + "step": 1615 + }, + { + "epoch": 0.6970023722234203, + "grad_norm": 0.166117861866951, + "learning_rate": 0.0001317725477974208, + "loss": 2.4442, + "step": 1616 + }, + { + "epoch": 0.6974336855725685, + "grad_norm": 0.1759869009256363, + "learning_rate": 0.00013175024090982988, + "loss": 2.2802, + "step": 1617 + }, + { + "epoch": 0.6978649989217166, + "grad_norm": 0.14935262501239777, + "learning_rate": 0.00013172792227148094, + "loss": 2.0724, + "step": 1618 + }, + { + "epoch": 0.6982963122708647, + "grad_norm": 0.19077062606811523, + "learning_rate": 0.0001317055918869953, + "loss": 2.1706, + "step": 1619 + }, + { + "epoch": 0.6987276256200129, + "grad_norm": 0.16506893932819366, + "learning_rate": 0.00013168324976099678, + "loss": 2.2949, + "step": 1620 + }, + { + "epoch": 0.6991589389691611, + "grad_norm": 0.18387648463249207, + "learning_rate": 0.0001316608958981115, + "loss": 2.1215, + "step": 1621 + }, + { + "epoch": 0.6995902523183093, + "grad_norm": 0.18008792400360107, + "learning_rate": 0.0001316385303029681, + "loss": 2.1253, + "step": 1622 + }, + { + "epoch": 0.7000215656674574, + "grad_norm": 0.18667413294315338, + "learning_rate": 0.0001316161529801976, + "loss": 2.2715, + "step": 1623 + }, + { + "epoch": 0.7004528790166056, + "grad_norm": 0.20948120951652527, + "learning_rate": 0.0001315937639344335, + "loss": 2.1971, + "step": 1624 + }, + { + "epoch": 0.7008841923657537, + "grad_norm": 0.16733720898628235, + "learning_rate": 0.00013157136317031167, + "loss": 2.24, + "step": 1625 + }, + { + "epoch": 0.7008841923657537, + "eval_loss": 2.1133079528808594, + "eval_runtime": 198.8724, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 1625 + }, + { + "epoch": 0.7013155057149019, + "grad_norm": 0.17461319267749786, + "learning_rate": 0.00013154895069247046, + "loss": 2.2935, + "step": 1626 + }, + { + "epoch": 0.70174681906405, + "grad_norm": 0.16576853394508362, + "learning_rate": 0.00013152652650555056, + "loss": 2.3532, + "step": 1627 + }, + { + "epoch": 0.7021781324131982, + "grad_norm": 0.1629101186990738, + "learning_rate": 0.00013150409061419522, + "loss": 2.2289, + "step": 1628 + }, + { + "epoch": 0.7026094457623463, + "grad_norm": 0.19192031025886536, + "learning_rate": 0.00013148164302304998, + "loss": 2.1729, + "step": 1629 + }, + { + "epoch": 0.7030407591114946, + "grad_norm": 0.23054882884025574, + "learning_rate": 0.00013145918373676287, + "loss": 2.2828, + "step": 1630 + }, + { + "epoch": 0.7034720724606427, + "grad_norm": 0.15536463260650635, + "learning_rate": 0.00013143671275998438, + "loss": 1.9453, + "step": 1631 + }, + { + "epoch": 0.7039033858097908, + "grad_norm": 0.1682126522064209, + "learning_rate": 0.0001314142300973673, + "loss": 2.2695, + "step": 1632 + }, + { + "epoch": 0.704334699158939, + "grad_norm": 0.1809571087360382, + "learning_rate": 0.00013139173575356695, + "loss": 2.2376, + "step": 1633 + }, + { + "epoch": 0.7047660125080871, + "grad_norm": 0.14714162051677704, + "learning_rate": 0.000131369229733241, + "loss": 1.9291, + "step": 1634 + }, + { + "epoch": 0.7051973258572353, + "grad_norm": 0.17537806928157806, + "learning_rate": 0.00013134671204104962, + "loss": 2.1672, + "step": 1635 + }, + { + "epoch": 0.7056286392063834, + "grad_norm": 0.19214670360088348, + "learning_rate": 0.0001313241826816553, + "loss": 2.3329, + "step": 1636 + }, + { + "epoch": 0.7060599525555316, + "grad_norm": 0.1673903614282608, + "learning_rate": 0.000131301641659723, + "loss": 2.1169, + "step": 1637 + }, + { + "epoch": 0.7064912659046797, + "grad_norm": 0.16690760850906372, + "learning_rate": 0.0001312790889799201, + "loss": 2.2248, + "step": 1638 + }, + { + "epoch": 0.706922579253828, + "grad_norm": 0.1609828621149063, + "learning_rate": 0.00013125652464691638, + "loss": 2.2809, + "step": 1639 + }, + { + "epoch": 0.7073538926029761, + "grad_norm": 0.20783407986164093, + "learning_rate": 0.00013123394866538404, + "loss": 2.3455, + "step": 1640 + }, + { + "epoch": 0.7077852059521242, + "grad_norm": 0.17028851807117462, + "learning_rate": 0.00013121136103999762, + "loss": 2.1143, + "step": 1641 + }, + { + "epoch": 0.7082165193012724, + "grad_norm": 0.16780084371566772, + "learning_rate": 0.00013118876177543417, + "loss": 2.1457, + "step": 1642 + }, + { + "epoch": 0.7086478326504205, + "grad_norm": 0.18313480913639069, + "learning_rate": 0.00013116615087637316, + "loss": 2.4553, + "step": 1643 + }, + { + "epoch": 0.7090791459995687, + "grad_norm": 0.17945381999015808, + "learning_rate": 0.00013114352834749638, + "loss": 2.2977, + "step": 1644 + }, + { + "epoch": 0.7095104593487168, + "grad_norm": 0.17324642837047577, + "learning_rate": 0.00013112089419348809, + "loss": 2.3561, + "step": 1645 + }, + { + "epoch": 0.709941772697865, + "grad_norm": 0.18440823256969452, + "learning_rate": 0.00013109824841903492, + "loss": 2.399, + "step": 1646 + }, + { + "epoch": 0.7103730860470131, + "grad_norm": 0.17321951687335968, + "learning_rate": 0.00013107559102882592, + "loss": 2.3652, + "step": 1647 + }, + { + "epoch": 0.7108043993961614, + "grad_norm": 0.1459442675113678, + "learning_rate": 0.0001310529220275526, + "loss": 1.9473, + "step": 1648 + }, + { + "epoch": 0.7112357127453095, + "grad_norm": 0.16677115857601166, + "learning_rate": 0.00013103024141990875, + "loss": 2.2992, + "step": 1649 + }, + { + "epoch": 0.7116670260944576, + "grad_norm": 0.16851958632469177, + "learning_rate": 0.0001310075492105907, + "loss": 2.3005, + "step": 1650 + }, + { + "epoch": 0.7116670260944576, + "eval_loss": 2.113213062286377, + "eval_runtime": 198.74, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 1650 + }, + { + "epoch": 0.7120983394436058, + "grad_norm": 0.17779399454593658, + "learning_rate": 0.00013098484540429708, + "loss": 2.1, + "step": 1651 + }, + { + "epoch": 0.7125296527927539, + "grad_norm": 0.1771998256444931, + "learning_rate": 0.000130962130005729, + "loss": 2.1188, + "step": 1652 + }, + { + "epoch": 0.7129609661419021, + "grad_norm": 0.7633037567138672, + "learning_rate": 0.00013093940301958992, + "loss": 2.1638, + "step": 1653 + }, + { + "epoch": 0.7133922794910502, + "grad_norm": 0.16661760210990906, + "learning_rate": 0.0001309166644505857, + "loss": 1.8541, + "step": 1654 + }, + { + "epoch": 0.7138235928401984, + "grad_norm": 0.16859006881713867, + "learning_rate": 0.0001308939143034246, + "loss": 2.3292, + "step": 1655 + }, + { + "epoch": 0.7142549061893465, + "grad_norm": 0.1498507708311081, + "learning_rate": 0.00013087115258281727, + "loss": 2.0369, + "step": 1656 + }, + { + "epoch": 0.7146862195384948, + "grad_norm": 0.15672807395458221, + "learning_rate": 0.0001308483792934768, + "loss": 2.283, + "step": 1657 + }, + { + "epoch": 0.7151175328876429, + "grad_norm": 0.181760773062706, + "learning_rate": 0.0001308255944401187, + "loss": 2.2157, + "step": 1658 + }, + { + "epoch": 0.715548846236791, + "grad_norm": 0.15118049085140228, + "learning_rate": 0.00013080279802746072, + "loss": 2.2297, + "step": 1659 + }, + { + "epoch": 0.7159801595859392, + "grad_norm": 0.1727578490972519, + "learning_rate": 0.0001307799900602232, + "loss": 2.3038, + "step": 1660 + }, + { + "epoch": 0.7164114729350873, + "grad_norm": 0.20055891573429108, + "learning_rate": 0.00013075717054312875, + "loss": 2.1896, + "step": 1661 + }, + { + "epoch": 0.7168427862842355, + "grad_norm": 0.27386513352394104, + "learning_rate": 0.00013073433948090237, + "loss": 2.139, + "step": 1662 + }, + { + "epoch": 0.7172740996333836, + "grad_norm": 0.16511470079421997, + "learning_rate": 0.00013071149687827148, + "loss": 2.2663, + "step": 1663 + }, + { + "epoch": 0.7177054129825318, + "grad_norm": 0.15780644118785858, + "learning_rate": 0.0001306886427399659, + "loss": 2.1271, + "step": 1664 + }, + { + "epoch": 0.71813672633168, + "grad_norm": 0.15114596486091614, + "learning_rate": 0.00013066577707071788, + "loss": 1.9493, + "step": 1665 + }, + { + "epoch": 0.7185680396808282, + "grad_norm": 0.2407820224761963, + "learning_rate": 0.00013064289987526197, + "loss": 2.0795, + "step": 1666 + }, + { + "epoch": 0.7189993530299763, + "grad_norm": 0.17818287014961243, + "learning_rate": 0.00013062001115833513, + "loss": 2.311, + "step": 1667 + }, + { + "epoch": 0.7194306663791245, + "grad_norm": 0.16417573392391205, + "learning_rate": 0.0001305971109246767, + "loss": 2.1726, + "step": 1668 + }, + { + "epoch": 0.7198619797282726, + "grad_norm": 0.1662161648273468, + "learning_rate": 0.00013057419917902847, + "loss": 2.2471, + "step": 1669 + }, + { + "epoch": 0.7202932930774207, + "grad_norm": 0.20036767423152924, + "learning_rate": 0.00013055127592613458, + "loss": 2.3349, + "step": 1670 + }, + { + "epoch": 0.7207246064265689, + "grad_norm": 0.21882568299770355, + "learning_rate": 0.0001305283411707415, + "loss": 2.0971, + "step": 1671 + }, + { + "epoch": 0.721155919775717, + "grad_norm": 0.15568691492080688, + "learning_rate": 0.0001305053949175981, + "loss": 2.3335, + "step": 1672 + }, + { + "epoch": 0.7215872331248652, + "grad_norm": 0.18450284004211426, + "learning_rate": 0.00013048243717145568, + "loss": 2.2341, + "step": 1673 + }, + { + "epoch": 0.7220185464740134, + "grad_norm": 0.16564737260341644, + "learning_rate": 0.00013045946793706792, + "loss": 2.3969, + "step": 1674 + }, + { + "epoch": 0.7224498598231616, + "grad_norm": 0.17275582253932953, + "learning_rate": 0.00013043648721919085, + "loss": 2.2686, + "step": 1675 + }, + { + "epoch": 0.7224498598231616, + "eval_loss": 2.1133737564086914, + "eval_runtime": 207.8226, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 1675 + }, + { + "epoch": 0.7228811731723097, + "grad_norm": 0.1843978464603424, + "learning_rate": 0.00013041349502258283, + "loss": 2.1839, + "step": 1676 + }, + { + "epoch": 0.7233124865214579, + "grad_norm": 0.16497057676315308, + "learning_rate": 0.0001303904913520047, + "loss": 2.3698, + "step": 1677 + }, + { + "epoch": 0.723743799870606, + "grad_norm": 0.40322139859199524, + "learning_rate": 0.00013036747621221958, + "loss": 2.2511, + "step": 1678 + }, + { + "epoch": 0.7241751132197541, + "grad_norm": 0.17039461433887482, + "learning_rate": 0.00013034444960799302, + "loss": 2.6212, + "step": 1679 + }, + { + "epoch": 0.7246064265689023, + "grad_norm": 0.20115548372268677, + "learning_rate": 0.00013032141154409294, + "loss": 2.239, + "step": 1680 + }, + { + "epoch": 0.7250377399180504, + "grad_norm": 0.17254777252674103, + "learning_rate": 0.00013029836202528962, + "loss": 2.204, + "step": 1681 + }, + { + "epoch": 0.7254690532671986, + "grad_norm": 0.180454283952713, + "learning_rate": 0.00013027530105635566, + "loss": 2.1329, + "step": 1682 + }, + { + "epoch": 0.7259003666163468, + "grad_norm": 0.16699275374412537, + "learning_rate": 0.0001302522286420662, + "loss": 2.1671, + "step": 1683 + }, + { + "epoch": 0.726331679965495, + "grad_norm": 0.179158553481102, + "learning_rate": 0.00013022914478719855, + "loss": 2.3858, + "step": 1684 + }, + { + "epoch": 0.7267629933146431, + "grad_norm": 0.16012237966060638, + "learning_rate": 0.00013020604949653248, + "loss": 2.2873, + "step": 1685 + }, + { + "epoch": 0.7271943066637913, + "grad_norm": 0.16123633086681366, + "learning_rate": 0.00013018294277485017, + "loss": 2.2534, + "step": 1686 + }, + { + "epoch": 0.7276256200129394, + "grad_norm": 0.1661210060119629, + "learning_rate": 0.00013015982462693607, + "loss": 2.3161, + "step": 1687 + }, + { + "epoch": 0.7280569333620875, + "grad_norm": 0.17415104806423187, + "learning_rate": 0.00013013669505757705, + "loss": 2.0021, + "step": 1688 + }, + { + "epoch": 0.7284882467112357, + "grad_norm": 0.18667307496070862, + "learning_rate": 0.00013011355407156237, + "loss": 2.0903, + "step": 1689 + }, + { + "epoch": 0.7289195600603838, + "grad_norm": 0.19069887697696686, + "learning_rate": 0.0001300904016736836, + "loss": 2.0884, + "step": 1690 + }, + { + "epoch": 0.729350873409532, + "grad_norm": 0.15097585320472717, + "learning_rate": 0.0001300672378687347, + "loss": 2.3432, + "step": 1691 + }, + { + "epoch": 0.7297821867586802, + "grad_norm": 0.18867550790309906, + "learning_rate": 0.00013004406266151202, + "loss": 2.2952, + "step": 1692 + }, + { + "epoch": 0.7302135001078284, + "grad_norm": 0.1902410238981247, + "learning_rate": 0.00013002087605681417, + "loss": 2.2431, + "step": 1693 + }, + { + "epoch": 0.7306448134569765, + "grad_norm": 0.20812374353408813, + "learning_rate": 0.00012999767805944228, + "loss": 2.1721, + "step": 1694 + }, + { + "epoch": 0.7310761268061247, + "grad_norm": 0.1791360080242157, + "learning_rate": 0.0001299744686741997, + "loss": 2.425, + "step": 1695 + }, + { + "epoch": 0.7315074401552728, + "grad_norm": 0.18107859790325165, + "learning_rate": 0.00012995124790589216, + "loss": 2.3085, + "step": 1696 + }, + { + "epoch": 0.731938753504421, + "grad_norm": 0.17820578813552856, + "learning_rate": 0.0001299280157593278, + "loss": 2.2546, + "step": 1697 + }, + { + "epoch": 0.7323700668535691, + "grad_norm": 0.18765179812908173, + "learning_rate": 0.0001299047722393171, + "loss": 1.9474, + "step": 1698 + }, + { + "epoch": 0.7328013802027172, + "grad_norm": 0.16326814889907837, + "learning_rate": 0.00012988151735067292, + "loss": 2.2706, + "step": 1699 + }, + { + "epoch": 0.7332326935518654, + "grad_norm": 0.16765248775482178, + "learning_rate": 0.0001298582510982104, + "loss": 2.2238, + "step": 1700 + }, + { + "epoch": 0.7332326935518654, + "eval_loss": 2.112516403198242, + "eval_runtime": 203.7373, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 1700 + }, + { + "epoch": 0.7336640069010136, + "grad_norm": 0.1879028081893921, + "learning_rate": 0.00012983497348674703, + "loss": 2.2476, + "step": 1701 + }, + { + "epoch": 0.7340953202501618, + "grad_norm": 0.17669300734996796, + "learning_rate": 0.00012981168452110275, + "loss": 2.1725, + "step": 1702 + }, + { + "epoch": 0.7345266335993099, + "grad_norm": 0.16430909931659698, + "learning_rate": 0.00012978838420609978, + "loss": 2.1852, + "step": 1703 + }, + { + "epoch": 0.7349579469484581, + "grad_norm": 0.20571014285087585, + "learning_rate": 0.00012976507254656267, + "loss": 2.2288, + "step": 1704 + }, + { + "epoch": 0.7353892602976062, + "grad_norm": 0.17860174179077148, + "learning_rate": 0.00012974174954731845, + "loss": 2.4026, + "step": 1705 + }, + { + "epoch": 0.7358205736467544, + "grad_norm": 0.16562241315841675, + "learning_rate": 0.0001297184152131963, + "loss": 2.0744, + "step": 1706 + }, + { + "epoch": 0.7362518869959025, + "grad_norm": 0.17572078108787537, + "learning_rate": 0.0001296950695490279, + "loss": 2.3155, + "step": 1707 + }, + { + "epoch": 0.7366832003450506, + "grad_norm": 0.17776505649089813, + "learning_rate": 0.00012967171255964723, + "loss": 2.2062, + "step": 1708 + }, + { + "epoch": 0.7371145136941988, + "grad_norm": 0.24452145397663116, + "learning_rate": 0.00012964834424989056, + "loss": 2.0492, + "step": 1709 + }, + { + "epoch": 0.737545827043347, + "grad_norm": 0.18888737261295319, + "learning_rate": 0.00012962496462459662, + "loss": 2.2781, + "step": 1710 + }, + { + "epoch": 0.7379771403924952, + "grad_norm": 0.16819074749946594, + "learning_rate": 0.00012960157368860635, + "loss": 2.2522, + "step": 1711 + }, + { + "epoch": 0.7384084537416433, + "grad_norm": 0.1931333839893341, + "learning_rate": 0.00012957817144676318, + "loss": 2.2074, + "step": 1712 + }, + { + "epoch": 0.7388397670907915, + "grad_norm": 0.20771650969982147, + "learning_rate": 0.00012955475790391274, + "loss": 2.1816, + "step": 1713 + }, + { + "epoch": 0.7392710804399396, + "grad_norm": 0.14832836389541626, + "learning_rate": 0.00012953133306490305, + "loss": 2.1424, + "step": 1714 + }, + { + "epoch": 0.7397023937890878, + "grad_norm": 0.16810181736946106, + "learning_rate": 0.00012950789693458453, + "loss": 2.2168, + "step": 1715 + }, + { + "epoch": 0.7401337071382359, + "grad_norm": 0.17525973916053772, + "learning_rate": 0.00012948444951780985, + "loss": 2.1605, + "step": 1716 + }, + { + "epoch": 0.740565020487384, + "grad_norm": 0.17031776905059814, + "learning_rate": 0.0001294609908194341, + "loss": 2.1088, + "step": 1717 + }, + { + "epoch": 0.7409963338365323, + "grad_norm": 0.21623241901397705, + "learning_rate": 0.00012943752084431458, + "loss": 2.2305, + "step": 1718 + }, + { + "epoch": 0.7414276471856804, + "grad_norm": 0.18505755066871643, + "learning_rate": 0.00012941403959731106, + "loss": 2.273, + "step": 1719 + }, + { + "epoch": 0.7418589605348286, + "grad_norm": 0.1552658975124359, + "learning_rate": 0.00012939054708328555, + "loss": 2.195, + "step": 1720 + }, + { + "epoch": 0.7422902738839767, + "grad_norm": 0.17611075937747955, + "learning_rate": 0.0001293670433071025, + "loss": 2.2172, + "step": 1721 + }, + { + "epoch": 0.7427215872331249, + "grad_norm": 0.16624145209789276, + "learning_rate": 0.00012934352827362862, + "loss": 2.3531, + "step": 1722 + }, + { + "epoch": 0.743152900582273, + "grad_norm": 0.18196657299995422, + "learning_rate": 0.00012932000198773288, + "loss": 2.286, + "step": 1723 + }, + { + "epoch": 0.7435842139314212, + "grad_norm": 0.16066540777683258, + "learning_rate": 0.00012929646445428668, + "loss": 2.4649, + "step": 1724 + }, + { + "epoch": 0.7440155272805693, + "grad_norm": 0.17888200283050537, + "learning_rate": 0.0001292729156781638, + "loss": 2.0565, + "step": 1725 + }, + { + "epoch": 0.7440155272805693, + "eval_loss": 2.1118383407592773, + "eval_runtime": 200.5771, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 1725 + }, + { + "epoch": 0.7444468406297174, + "grad_norm": 0.19620761275291443, + "learning_rate": 0.00012924935566424012, + "loss": 2.1012, + "step": 1726 + }, + { + "epoch": 0.7448781539788657, + "grad_norm": 0.1674032360315323, + "learning_rate": 0.00012922578441739414, + "loss": 2.151, + "step": 1727 + }, + { + "epoch": 0.7453094673280138, + "grad_norm": 0.16218839585781097, + "learning_rate": 0.0001292022019425065, + "loss": 2.099, + "step": 1728 + }, + { + "epoch": 0.745740780677162, + "grad_norm": 0.16986028850078583, + "learning_rate": 0.0001291786082444602, + "loss": 2.2028, + "step": 1729 + }, + { + "epoch": 0.7461720940263101, + "grad_norm": 0.16744236648082733, + "learning_rate": 0.00012915500332814054, + "loss": 2.2487, + "step": 1730 + }, + { + "epoch": 0.7466034073754583, + "grad_norm": 0.15288467705249786, + "learning_rate": 0.00012913138719843526, + "loss": 2.3417, + "step": 1731 + }, + { + "epoch": 0.7470347207246064, + "grad_norm": 0.16407521069049835, + "learning_rate": 0.00012910775986023427, + "loss": 2.1652, + "step": 1732 + }, + { + "epoch": 0.7474660340737546, + "grad_norm": 0.16800500452518463, + "learning_rate": 0.0001290841213184299, + "loss": 2.3682, + "step": 1733 + }, + { + "epoch": 0.7478973474229027, + "grad_norm": 0.1646740883588791, + "learning_rate": 0.00012906047157791673, + "loss": 1.9321, + "step": 1734 + }, + { + "epoch": 0.7483286607720508, + "grad_norm": 0.173130065202713, + "learning_rate": 0.00012903681064359174, + "loss": 2.0989, + "step": 1735 + }, + { + "epoch": 0.7487599741211991, + "grad_norm": 0.16296248137950897, + "learning_rate": 0.00012901313852035416, + "loss": 2.0963, + "step": 1736 + }, + { + "epoch": 0.7491912874703472, + "grad_norm": 0.16170215606689453, + "learning_rate": 0.00012898945521310558, + "loss": 2.4658, + "step": 1737 + }, + { + "epoch": 0.7496226008194954, + "grad_norm": 0.1658870279788971, + "learning_rate": 0.00012896576072674989, + "loss": 2.2508, + "step": 1738 + }, + { + "epoch": 0.7500539141686435, + "grad_norm": 0.1697956621646881, + "learning_rate": 0.00012894205506619325, + "loss": 2.224, + "step": 1739 + }, + { + "epoch": 0.7504852275177917, + "grad_norm": 0.1617509126663208, + "learning_rate": 0.00012891833823634426, + "loss": 2.2718, + "step": 1740 + }, + { + "epoch": 0.7509165408669398, + "grad_norm": 0.17925076186656952, + "learning_rate": 0.00012889461024211365, + "loss": 2.3989, + "step": 1741 + }, + { + "epoch": 0.751347854216088, + "grad_norm": 0.17088088393211365, + "learning_rate": 0.00012887087108841462, + "loss": 2.2883, + "step": 1742 + }, + { + "epoch": 0.7517791675652361, + "grad_norm": 0.1580200046300888, + "learning_rate": 0.0001288471207801626, + "loss": 2.1735, + "step": 1743 + }, + { + "epoch": 0.7522104809143842, + "grad_norm": 0.16144931316375732, + "learning_rate": 0.00012882335932227538, + "loss": 2.1267, + "step": 1744 + }, + { + "epoch": 0.7526417942635325, + "grad_norm": 0.17619143426418304, + "learning_rate": 0.000128799586719673, + "loss": 2.103, + "step": 1745 + }, + { + "epoch": 0.7530731076126806, + "grad_norm": 0.16034960746765137, + "learning_rate": 0.00012877580297727788, + "loss": 2.0091, + "step": 1746 + }, + { + "epoch": 0.7535044209618288, + "grad_norm": 0.17221620678901672, + "learning_rate": 0.00012875200810001463, + "loss": 2.1892, + "step": 1747 + }, + { + "epoch": 0.7539357343109769, + "grad_norm": 0.18462108075618744, + "learning_rate": 0.0001287282020928103, + "loss": 2.3866, + "step": 1748 + }, + { + "epoch": 0.7543670476601251, + "grad_norm": 0.22234316170215607, + "learning_rate": 0.00012870438496059416, + "loss": 2.254, + "step": 1749 + }, + { + "epoch": 0.7547983610092732, + "grad_norm": 0.16872234642505646, + "learning_rate": 0.0001286805567082978, + "loss": 2.267, + "step": 1750 + }, + { + "epoch": 0.7547983610092732, + "eval_loss": 2.111443519592285, + "eval_runtime": 201.8678, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 1750 + }, + { + "epoch": 0.7552296743584214, + "grad_norm": 0.20871369540691376, + "learning_rate": 0.00012865671734085514, + "loss": 2.0419, + "step": 1751 + }, + { + "epoch": 0.7556609877075695, + "grad_norm": 0.21343334019184113, + "learning_rate": 0.00012863286686320237, + "loss": 2.1727, + "step": 1752 + }, + { + "epoch": 0.7560923010567177, + "grad_norm": 0.15887396037578583, + "learning_rate": 0.00012860900528027802, + "loss": 2.1621, + "step": 1753 + }, + { + "epoch": 0.7565236144058659, + "grad_norm": 0.1738533079624176, + "learning_rate": 0.00012858513259702287, + "loss": 2.347, + "step": 1754 + }, + { + "epoch": 0.756954927755014, + "grad_norm": 0.1632392704486847, + "learning_rate": 0.00012856124881837998, + "loss": 2.0953, + "step": 1755 + }, + { + "epoch": 0.7573862411041622, + "grad_norm": 0.163606658577919, + "learning_rate": 0.00012853735394929483, + "loss": 2.0839, + "step": 1756 + }, + { + "epoch": 0.7578175544533103, + "grad_norm": 0.18141624331474304, + "learning_rate": 0.00012851344799471502, + "loss": 2.3043, + "step": 1757 + }, + { + "epoch": 0.7582488678024585, + "grad_norm": 0.172869473695755, + "learning_rate": 0.00012848953095959063, + "loss": 2.1286, + "step": 1758 + }, + { + "epoch": 0.7586801811516066, + "grad_norm": 0.17472733557224274, + "learning_rate": 0.0001284656028488739, + "loss": 2.1691, + "step": 1759 + }, + { + "epoch": 0.7591114945007548, + "grad_norm": 0.1549355536699295, + "learning_rate": 0.0001284416636675194, + "loss": 2.053, + "step": 1760 + }, + { + "epoch": 0.7595428078499029, + "grad_norm": 0.1682683378458023, + "learning_rate": 0.000128417713420484, + "loss": 2.2314, + "step": 1761 + }, + { + "epoch": 0.7599741211990511, + "grad_norm": 0.1796865016222, + "learning_rate": 0.00012839375211272692, + "loss": 2.0654, + "step": 1762 + }, + { + "epoch": 0.7604054345481993, + "grad_norm": 0.1781945377588272, + "learning_rate": 0.0001283697797492095, + "loss": 2.3292, + "step": 1763 + }, + { + "epoch": 0.7608367478973475, + "grad_norm": 0.17216362059116364, + "learning_rate": 0.00012834579633489557, + "loss": 2.1891, + "step": 1764 + }, + { + "epoch": 0.7612680612464956, + "grad_norm": 0.17948682606220245, + "learning_rate": 0.00012832180187475112, + "loss": 2.2332, + "step": 1765 + }, + { + "epoch": 0.7616993745956437, + "grad_norm": 0.17100459337234497, + "learning_rate": 0.00012829779637374446, + "loss": 2.291, + "step": 1766 + }, + { + "epoch": 0.7621306879447919, + "grad_norm": 0.15035024285316467, + "learning_rate": 0.00012827377983684623, + "loss": 2.1859, + "step": 1767 + }, + { + "epoch": 0.76256200129394, + "grad_norm": 0.17111943662166595, + "learning_rate": 0.0001282497522690293, + "loss": 2.0255, + "step": 1768 + }, + { + "epoch": 0.7629933146430882, + "grad_norm": 0.19134071469306946, + "learning_rate": 0.0001282257136752688, + "loss": 2.1275, + "step": 1769 + }, + { + "epoch": 0.7634246279922363, + "grad_norm": 0.16713270545005798, + "learning_rate": 0.00012820166406054225, + "loss": 2.2782, + "step": 1770 + }, + { + "epoch": 0.7638559413413846, + "grad_norm": 0.16537237167358398, + "learning_rate": 0.00012817760342982934, + "loss": 2.1584, + "step": 1771 + }, + { + "epoch": 0.7642872546905327, + "grad_norm": 0.17069876194000244, + "learning_rate": 0.0001281535317881121, + "loss": 2.2834, + "step": 1772 + }, + { + "epoch": 0.7647185680396809, + "grad_norm": 0.16779346764087677, + "learning_rate": 0.00012812944914037481, + "loss": 2.3434, + "step": 1773 + }, + { + "epoch": 0.765149881388829, + "grad_norm": 0.16906912624835968, + "learning_rate": 0.00012810535549160408, + "loss": 2.3108, + "step": 1774 + }, + { + "epoch": 0.7655811947379771, + "grad_norm": 0.16279946267604828, + "learning_rate": 0.00012808125084678876, + "loss": 1.9309, + "step": 1775 + }, + { + "epoch": 0.7655811947379771, + "eval_loss": 2.111085891723633, + "eval_runtime": 200.7241, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 1775 + }, + { + "epoch": 0.7660125080871253, + "grad_norm": 0.16854463517665863, + "learning_rate": 0.00012805713521091993, + "loss": 2.2577, + "step": 1776 + }, + { + "epoch": 0.7664438214362734, + "grad_norm": 0.1790621429681778, + "learning_rate": 0.00012803300858899104, + "loss": 2.0779, + "step": 1777 + }, + { + "epoch": 0.7668751347854216, + "grad_norm": 0.17216108739376068, + "learning_rate": 0.0001280088709859978, + "loss": 2.1478, + "step": 1778 + }, + { + "epoch": 0.7673064481345697, + "grad_norm": 0.16908183693885803, + "learning_rate": 0.00012798472240693808, + "loss": 2.3447, + "step": 1779 + }, + { + "epoch": 0.767737761483718, + "grad_norm": 0.16917240619659424, + "learning_rate": 0.00012796056285681217, + "loss": 2.2356, + "step": 1780 + }, + { + "epoch": 0.7681690748328661, + "grad_norm": 0.1798560619354248, + "learning_rate": 0.00012793639234062254, + "loss": 2.2074, + "step": 1781 + }, + { + "epoch": 0.7686003881820143, + "grad_norm": 0.1688271462917328, + "learning_rate": 0.000127912210863374, + "loss": 2.2913, + "step": 1782 + }, + { + "epoch": 0.7690317015311624, + "grad_norm": 0.1530812680721283, + "learning_rate": 0.00012788801843007357, + "loss": 2.2181, + "step": 1783 + }, + { + "epoch": 0.7694630148803105, + "grad_norm": 0.1799415647983551, + "learning_rate": 0.0001278638150457305, + "loss": 2.0686, + "step": 1784 + }, + { + "epoch": 0.7698943282294587, + "grad_norm": 0.16236048936843872, + "learning_rate": 0.00012783960071535645, + "loss": 2.0682, + "step": 1785 + }, + { + "epoch": 0.7703256415786068, + "grad_norm": 0.1817246377468109, + "learning_rate": 0.00012781537544396526, + "loss": 2.1458, + "step": 1786 + }, + { + "epoch": 0.770756954927755, + "grad_norm": 0.16584637761116028, + "learning_rate": 0.00012779113923657294, + "loss": 2.1422, + "step": 1787 + }, + { + "epoch": 0.7711882682769031, + "grad_norm": 0.16539442539215088, + "learning_rate": 0.000127766892098198, + "loss": 2.1348, + "step": 1788 + }, + { + "epoch": 0.7716195816260514, + "grad_norm": 0.16954608261585236, + "learning_rate": 0.00012774263403386095, + "loss": 2.2259, + "step": 1789 + }, + { + "epoch": 0.7720508949751995, + "grad_norm": 0.19718922674655914, + "learning_rate": 0.00012771836504858473, + "loss": 2.2328, + "step": 1790 + }, + { + "epoch": 0.7724822083243477, + "grad_norm": 0.17745161056518555, + "learning_rate": 0.00012769408514739453, + "loss": 2.2707, + "step": 1791 + }, + { + "epoch": 0.7729135216734958, + "grad_norm": 0.1657809466123581, + "learning_rate": 0.00012766979433531778, + "loss": 2.2097, + "step": 1792 + }, + { + "epoch": 0.773344835022644, + "grad_norm": 0.19770976901054382, + "learning_rate": 0.00012764549261738407, + "loss": 2.2412, + "step": 1793 + }, + { + "epoch": 0.7737761483717921, + "grad_norm": 0.1652282029390335, + "learning_rate": 0.0001276211799986254, + "loss": 2.2458, + "step": 1794 + }, + { + "epoch": 0.7742074617209402, + "grad_norm": 0.16925354301929474, + "learning_rate": 0.00012759685648407597, + "loss": 2.2538, + "step": 1795 + }, + { + "epoch": 0.7746387750700884, + "grad_norm": 0.16957129538059235, + "learning_rate": 0.00012757252207877223, + "loss": 2.275, + "step": 1796 + }, + { + "epoch": 0.7750700884192365, + "grad_norm": 0.16301840543746948, + "learning_rate": 0.00012754817678775285, + "loss": 2.2116, + "step": 1797 + }, + { + "epoch": 0.7755014017683848, + "grad_norm": 0.16502845287322998, + "learning_rate": 0.0001275238206160588, + "loss": 2.3485, + "step": 1798 + }, + { + "epoch": 0.7759327151175329, + "grad_norm": 0.16845610737800598, + "learning_rate": 0.00012749945356873335, + "loss": 2.046, + "step": 1799 + }, + { + "epoch": 0.7763640284666811, + "grad_norm": 0.16439995169639587, + "learning_rate": 0.00012747507565082184, + "loss": 2.1971, + "step": 1800 + }, + { + "epoch": 0.7763640284666811, + "eval_loss": 2.110645294189453, + "eval_runtime": 201.0521, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 1800 + }, + { + "epoch": 0.7767953418158292, + "grad_norm": 0.17457184195518494, + "learning_rate": 0.0001274506868673721, + "loss": 2.2552, + "step": 1801 + }, + { + "epoch": 0.7772266551649774, + "grad_norm": 0.15566860139369965, + "learning_rate": 0.00012742628722343407, + "loss": 2.248, + "step": 1802 + }, + { + "epoch": 0.7776579685141255, + "grad_norm": 0.175991490483284, + "learning_rate": 0.0001274018767240599, + "loss": 2.2171, + "step": 1803 + }, + { + "epoch": 0.7780892818632736, + "grad_norm": 0.17536449432373047, + "learning_rate": 0.00012737745537430413, + "loss": 2.059, + "step": 1804 + }, + { + "epoch": 0.7785205952124218, + "grad_norm": 0.19441308081150055, + "learning_rate": 0.00012735302317922343, + "loss": 1.9221, + "step": 1805 + }, + { + "epoch": 0.7789519085615699, + "grad_norm": 0.17545975744724274, + "learning_rate": 0.00012732858014387677, + "loss": 2.1557, + "step": 1806 + }, + { + "epoch": 0.7793832219107182, + "grad_norm": 0.18923307955265045, + "learning_rate": 0.0001273041262733253, + "loss": 2.3349, + "step": 1807 + }, + { + "epoch": 0.7798145352598663, + "grad_norm": 0.16929149627685547, + "learning_rate": 0.00012727966157263253, + "loss": 2.2812, + "step": 1808 + }, + { + "epoch": 0.7802458486090145, + "grad_norm": 0.16833718121051788, + "learning_rate": 0.0001272551860468641, + "loss": 2.2105, + "step": 1809 + }, + { + "epoch": 0.7806771619581626, + "grad_norm": 0.163452610373497, + "learning_rate": 0.00012723069970108794, + "loss": 2.2795, + "step": 1810 + }, + { + "epoch": 0.7811084753073108, + "grad_norm": 0.16303634643554688, + "learning_rate": 0.00012720620254037424, + "loss": 2.0641, + "step": 1811 + }, + { + "epoch": 0.7815397886564589, + "grad_norm": 0.1870717853307724, + "learning_rate": 0.00012718169456979534, + "loss": 2.1919, + "step": 1812 + }, + { + "epoch": 0.781971102005607, + "grad_norm": 0.1807531863451004, + "learning_rate": 0.000127157175794426, + "loss": 2.3677, + "step": 1813 + }, + { + "epoch": 0.7824024153547552, + "grad_norm": 0.24385060369968414, + "learning_rate": 0.00012713264621934297, + "loss": 2.2957, + "step": 1814 + }, + { + "epoch": 0.7828337287039033, + "grad_norm": 0.16218340396881104, + "learning_rate": 0.00012710810584962547, + "loss": 2.2153, + "step": 1815 + }, + { + "epoch": 0.7832650420530516, + "grad_norm": 0.19149371981620789, + "learning_rate": 0.0001270835546903548, + "loss": 2.1862, + "step": 1816 + }, + { + "epoch": 0.7836963554021997, + "grad_norm": 0.21503080427646637, + "learning_rate": 0.00012705899274661453, + "loss": 2.3009, + "step": 1817 + }, + { + "epoch": 0.7841276687513479, + "grad_norm": 0.16244001686573029, + "learning_rate": 0.00012703442002349053, + "loss": 2.2839, + "step": 1818 + }, + { + "epoch": 0.784558982100496, + "grad_norm": 0.19366413354873657, + "learning_rate": 0.0001270098365260708, + "loss": 2.1591, + "step": 1819 + }, + { + "epoch": 0.7849902954496442, + "grad_norm": 0.16676150262355804, + "learning_rate": 0.00012698524225944566, + "loss": 2.2976, + "step": 1820 + }, + { + "epoch": 0.7854216087987923, + "grad_norm": 0.18334507942199707, + "learning_rate": 0.00012696063722870762, + "loss": 2.2395, + "step": 1821 + }, + { + "epoch": 0.7858529221479404, + "grad_norm": 0.20181263983249664, + "learning_rate": 0.0001269360214389514, + "loss": 2.3733, + "step": 1822 + }, + { + "epoch": 0.7862842354970886, + "grad_norm": 0.1683868169784546, + "learning_rate": 0.00012691139489527398, + "loss": 2.0121, + "step": 1823 + }, + { + "epoch": 0.7867155488462368, + "grad_norm": 0.1778002679347992, + "learning_rate": 0.00012688675760277456, + "loss": 2.4024, + "step": 1824 + }, + { + "epoch": 0.787146862195385, + "grad_norm": 0.173460453748703, + "learning_rate": 0.00012686210956655455, + "loss": 2.408, + "step": 1825 + }, + { + "epoch": 0.787146862195385, + "eval_loss": 2.1109418869018555, + "eval_runtime": 201.6399, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 1825 + }, + { + "epoch": 0.7875781755445331, + "grad_norm": 0.18741174042224884, + "learning_rate": 0.00012683745079171757, + "loss": 2.3349, + "step": 1826 + }, + { + "epoch": 0.7880094888936813, + "grad_norm": 0.151434987783432, + "learning_rate": 0.00012681278128336956, + "loss": 2.049, + "step": 1827 + }, + { + "epoch": 0.7884408022428294, + "grad_norm": 0.17712783813476562, + "learning_rate": 0.00012678810104661858, + "loss": 2.1582, + "step": 1828 + }, + { + "epoch": 0.7888721155919776, + "grad_norm": 0.18137238919734955, + "learning_rate": 0.0001267634100865749, + "loss": 2.0288, + "step": 1829 + }, + { + "epoch": 0.7893034289411257, + "grad_norm": 0.1671016663312912, + "learning_rate": 0.0001267387084083511, + "loss": 2.3045, + "step": 1830 + }, + { + "epoch": 0.7897347422902739, + "grad_norm": 0.1871432512998581, + "learning_rate": 0.0001267139960170619, + "loss": 2.115, + "step": 1831 + }, + { + "epoch": 0.790166055639422, + "grad_norm": 0.16146980226039886, + "learning_rate": 0.0001266892729178243, + "loss": 2.2883, + "step": 1832 + }, + { + "epoch": 0.7905973689885702, + "grad_norm": 0.18456147611141205, + "learning_rate": 0.00012666453911575748, + "loss": 2.3554, + "step": 1833 + }, + { + "epoch": 0.7910286823377184, + "grad_norm": 0.16101932525634766, + "learning_rate": 0.00012663979461598288, + "loss": 2.1936, + "step": 1834 + }, + { + "epoch": 0.7914599956868665, + "grad_norm": 0.15960389375686646, + "learning_rate": 0.00012661503942362404, + "loss": 2.0886, + "step": 1835 + }, + { + "epoch": 0.7918913090360147, + "grad_norm": 0.1770126223564148, + "learning_rate": 0.00012659027354380685, + "loss": 2.163, + "step": 1836 + }, + { + "epoch": 0.7923226223851628, + "grad_norm": 0.16850055754184723, + "learning_rate": 0.0001265654969816593, + "loss": 2.3053, + "step": 1837 + }, + { + "epoch": 0.792753935734311, + "grad_norm": 0.9188832640647888, + "learning_rate": 0.00012654070974231173, + "loss": 2.1439, + "step": 1838 + }, + { + "epoch": 0.7931852490834591, + "grad_norm": 0.16128169000148773, + "learning_rate": 0.00012651591183089657, + "loss": 1.889, + "step": 1839 + }, + { + "epoch": 0.7936165624326073, + "grad_norm": 0.16812586784362793, + "learning_rate": 0.00012649110325254847, + "loss": 2.4054, + "step": 1840 + }, + { + "epoch": 0.7940478757817554, + "grad_norm": 0.1671290248632431, + "learning_rate": 0.00012646628401240438, + "loss": 2.2513, + "step": 1841 + }, + { + "epoch": 0.7944791891309037, + "grad_norm": 0.17929555475711823, + "learning_rate": 0.00012644145411560333, + "loss": 2.1672, + "step": 1842 + }, + { + "epoch": 0.7949105024800518, + "grad_norm": 0.1603858768939972, + "learning_rate": 0.00012641661356728668, + "loss": 1.9774, + "step": 1843 + }, + { + "epoch": 0.7953418158291999, + "grad_norm": 0.17137469351291656, + "learning_rate": 0.0001263917623725979, + "loss": 2.0653, + "step": 1844 + }, + { + "epoch": 0.7957731291783481, + "grad_norm": 0.17383916676044464, + "learning_rate": 0.0001263669005366827, + "loss": 2.1493, + "step": 1845 + }, + { + "epoch": 0.7962044425274962, + "grad_norm": 0.1622949242591858, + "learning_rate": 0.000126342028064689, + "loss": 2.2813, + "step": 1846 + }, + { + "epoch": 0.7966357558766444, + "grad_norm": 0.3186436891555786, + "learning_rate": 0.0001263171449617669, + "loss": 2.4686, + "step": 1847 + }, + { + "epoch": 0.7970670692257925, + "grad_norm": 0.17199741303920746, + "learning_rate": 0.0001262922512330688, + "loss": 2.2529, + "step": 1848 + }, + { + "epoch": 0.7974983825749407, + "grad_norm": 0.17930395901203156, + "learning_rate": 0.0001262673468837491, + "loss": 2.6291, + "step": 1849 + }, + { + "epoch": 0.7979296959240888, + "grad_norm": 0.176521897315979, + "learning_rate": 0.0001262424319189646, + "loss": 2.2637, + "step": 1850 + }, + { + "epoch": 0.7979296959240888, + "eval_loss": 2.111254930496216, + "eval_runtime": 201.5573, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 1850 + }, + { + "epoch": 0.798361009273237, + "grad_norm": 0.196044459939003, + "learning_rate": 0.00012621750634387422, + "loss": 2.129, + "step": 1851 + }, + { + "epoch": 0.7987923226223852, + "grad_norm": 0.2050708383321762, + "learning_rate": 0.00012619257016363903, + "loss": 2.3564, + "step": 1852 + }, + { + "epoch": 0.7992236359715333, + "grad_norm": 0.16014248132705688, + "learning_rate": 0.00012616762338342236, + "loss": 2.2885, + "step": 1853 + }, + { + "epoch": 0.7996549493206815, + "grad_norm": 0.17007140815258026, + "learning_rate": 0.00012614266600838967, + "loss": 2.1042, + "step": 1854 + }, + { + "epoch": 0.8000862626698296, + "grad_norm": 0.17264525592327118, + "learning_rate": 0.00012611769804370874, + "loss": 2.2499, + "step": 1855 + }, + { + "epoch": 0.8005175760189778, + "grad_norm": 0.17515993118286133, + "learning_rate": 0.0001260927194945494, + "loss": 2.1508, + "step": 1856 + }, + { + "epoch": 0.8009488893681259, + "grad_norm": 0.18426083028316498, + "learning_rate": 0.00012606773036608374, + "loss": 2.2293, + "step": 1857 + }, + { + "epoch": 0.8013802027172741, + "grad_norm": 0.1829826682806015, + "learning_rate": 0.00012604273066348605, + "loss": 2.1917, + "step": 1858 + }, + { + "epoch": 0.8018115160664222, + "grad_norm": 0.16429126262664795, + "learning_rate": 0.0001260177203919328, + "loss": 2.114, + "step": 1859 + }, + { + "epoch": 0.8022428294155705, + "grad_norm": 0.3393753468990326, + "learning_rate": 0.0001259926995566026, + "loss": 2.1461, + "step": 1860 + }, + { + "epoch": 0.8026741427647186, + "grad_norm": 0.1876833587884903, + "learning_rate": 0.00012596766816267633, + "loss": 2.2348, + "step": 1861 + }, + { + "epoch": 0.8031054561138667, + "grad_norm": 0.1625937670469284, + "learning_rate": 0.00012594262621533697, + "loss": 2.4149, + "step": 1862 + }, + { + "epoch": 0.8035367694630149, + "grad_norm": 0.17854106426239014, + "learning_rate": 0.00012591757371976974, + "loss": 2.3269, + "step": 1863 + }, + { + "epoch": 0.803968082812163, + "grad_norm": 10.761648178100586, + "learning_rate": 0.00012589251068116208, + "loss": 2.1866, + "step": 1864 + }, + { + "epoch": 0.8043993961613112, + "grad_norm": 0.17221496999263763, + "learning_rate": 0.00012586743710470353, + "loss": 2.127, + "step": 1865 + }, + { + "epoch": 0.8048307095104593, + "grad_norm": 0.1671273559331894, + "learning_rate": 0.00012584235299558584, + "loss": 2.1688, + "step": 1866 + }, + { + "epoch": 0.8052620228596075, + "grad_norm": 0.1608709841966629, + "learning_rate": 0.00012581725835900297, + "loss": 2.2199, + "step": 1867 + }, + { + "epoch": 0.8056933362087556, + "grad_norm": 0.18937575817108154, + "learning_rate": 0.000125792153200151, + "loss": 1.926, + "step": 1868 + }, + { + "epoch": 0.8061246495579039, + "grad_norm": 0.16524960100650787, + "learning_rate": 0.0001257670375242283, + "loss": 2.2916, + "step": 1869 + }, + { + "epoch": 0.806555962907052, + "grad_norm": 0.18175652623176575, + "learning_rate": 0.00012574191133643526, + "loss": 2.3457, + "step": 1870 + }, + { + "epoch": 0.8069872762562001, + "grad_norm": 0.19647067785263062, + "learning_rate": 0.00012571677464197462, + "loss": 2.1629, + "step": 1871 + }, + { + "epoch": 0.8074185896053483, + "grad_norm": 0.2882465124130249, + "learning_rate": 0.0001256916274460511, + "loss": 2.2529, + "step": 1872 + }, + { + "epoch": 0.8078499029544964, + "grad_norm": 0.1850413829088211, + "learning_rate": 0.00012566646975387181, + "loss": 2.2831, + "step": 1873 + }, + { + "epoch": 0.8082812163036446, + "grad_norm": 0.15984345972537994, + "learning_rate": 0.00012564130157064588, + "loss": 2.1268, + "step": 1874 + }, + { + "epoch": 0.8087125296527927, + "grad_norm": 0.1659379005432129, + "learning_rate": 0.00012561612290158463, + "loss": 2.2775, + "step": 1875 + }, + { + "epoch": 0.8087125296527927, + "eval_loss": 2.1113932132720947, + "eval_runtime": 207.6287, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 1875 + }, + { + "epoch": 0.8091438430019409, + "grad_norm": 0.17246565222740173, + "learning_rate": 0.00012559093375190161, + "loss": 2.1342, + "step": 1876 + }, + { + "epoch": 0.8095751563510891, + "grad_norm": 0.163095623254776, + "learning_rate": 0.0001255657341268125, + "loss": 2.19, + "step": 1877 + }, + { + "epoch": 0.8100064697002373, + "grad_norm": 0.16611529886722565, + "learning_rate": 0.00012554052403153517, + "loss": 2.2269, + "step": 1878 + }, + { + "epoch": 0.8104377830493854, + "grad_norm": 0.1658940315246582, + "learning_rate": 0.00012551530347128964, + "loss": 2.1327, + "step": 1879 + }, + { + "epoch": 0.8108690963985336, + "grad_norm": 0.15693336725234985, + "learning_rate": 0.0001254900724512981, + "loss": 2.2625, + "step": 1880 + }, + { + "epoch": 0.8113004097476817, + "grad_norm": 0.19481512904167175, + "learning_rate": 0.0001254648309767849, + "loss": 1.8088, + "step": 1881 + }, + { + "epoch": 0.8117317230968298, + "grad_norm": 0.1702975630760193, + "learning_rate": 0.00012543957905297658, + "loss": 2.2031, + "step": 1882 + }, + { + "epoch": 0.812163036445978, + "grad_norm": 0.17250478267669678, + "learning_rate": 0.0001254143166851018, + "loss": 2.2121, + "step": 1883 + }, + { + "epoch": 0.8125943497951261, + "grad_norm": 0.17849695682525635, + "learning_rate": 0.00012538904387839145, + "loss": 2.1903, + "step": 1884 + }, + { + "epoch": 0.8130256631442743, + "grad_norm": 0.1863728165626526, + "learning_rate": 0.00012536376063807854, + "loss": 2.2404, + "step": 1885 + }, + { + "epoch": 0.8134569764934225, + "grad_norm": 168.6988067626953, + "learning_rate": 0.00012533846696939817, + "loss": 2.2611, + "step": 1886 + }, + { + "epoch": 0.8138882898425707, + "grad_norm": 0.17046821117401123, + "learning_rate": 0.00012531316287758777, + "loss": 2.319, + "step": 1887 + }, + { + "epoch": 0.8143196031917188, + "grad_norm": 0.1797439604997635, + "learning_rate": 0.00012528784836788676, + "loss": 2.3515, + "step": 1888 + }, + { + "epoch": 0.814750916540867, + "grad_norm": 0.18330460786819458, + "learning_rate": 0.00012526252344553681, + "loss": 2.1221, + "step": 1889 + }, + { + "epoch": 0.8151822298900151, + "grad_norm": 0.18982136249542236, + "learning_rate": 0.0001252371881157817, + "loss": 2.3772, + "step": 1890 + }, + { + "epoch": 0.8156135432391632, + "grad_norm": 0.17775148153305054, + "learning_rate": 0.00012521184238386743, + "loss": 2.1501, + "step": 1891 + }, + { + "epoch": 0.8160448565883114, + "grad_norm": 0.16533853113651276, + "learning_rate": 0.00012518648625504207, + "loss": 2.0584, + "step": 1892 + }, + { + "epoch": 0.8164761699374595, + "grad_norm": 0.17575526237487793, + "learning_rate": 0.0001251611197345559, + "loss": 2.1697, + "step": 1893 + }, + { + "epoch": 0.8169074832866077, + "grad_norm": 0.17966797947883606, + "learning_rate": 0.00012513574282766138, + "loss": 2.2837, + "step": 1894 + }, + { + "epoch": 0.8173387966357559, + "grad_norm": 0.19567516446113586, + "learning_rate": 0.000125110355539613, + "loss": 2.1884, + "step": 1895 + }, + { + "epoch": 0.8177701099849041, + "grad_norm": 0.20565064251422882, + "learning_rate": 0.00012508495787566748, + "loss": 2.2495, + "step": 1896 + }, + { + "epoch": 0.8182014233340522, + "grad_norm": 0.15445753931999207, + "learning_rate": 0.00012505954984108375, + "loss": 2.1746, + "step": 1897 + }, + { + "epoch": 0.8186327366832004, + "grad_norm": 0.17870637774467468, + "learning_rate": 0.00012503413144112277, + "loss": 2.3098, + "step": 1898 + }, + { + "epoch": 0.8190640500323485, + "grad_norm": 0.1896657943725586, + "learning_rate": 0.0001250087026810477, + "loss": 2.1974, + "step": 1899 + }, + { + "epoch": 0.8194953633814966, + "grad_norm": 0.17491237819194794, + "learning_rate": 0.00012498326356612385, + "loss": 2.2314, + "step": 1900 + }, + { + "epoch": 0.8194953633814966, + "eval_loss": 2.111462354660034, + "eval_runtime": 199.5209, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 1900 + }, + { + "epoch": 0.8199266767306448, + "grad_norm": 0.1932830959558487, + "learning_rate": 0.00012495781410161872, + "loss": 2.3258, + "step": 1901 + }, + { + "epoch": 0.8203579900797929, + "grad_norm": 0.1864049881696701, + "learning_rate": 0.0001249323542928018, + "loss": 2.2642, + "step": 1902 + }, + { + "epoch": 0.8207893034289411, + "grad_norm": 0.1658972203731537, + "learning_rate": 0.0001249068841449449, + "loss": 2.0475, + "step": 1903 + }, + { + "epoch": 0.8212206167780893, + "grad_norm": 0.16217941045761108, + "learning_rate": 0.00012488140366332184, + "loss": 2.294, + "step": 1904 + }, + { + "epoch": 0.8216519301272375, + "grad_norm": 0.1949874609708786, + "learning_rate": 0.00012485591285320868, + "loss": 2.2252, + "step": 1905 + }, + { + "epoch": 0.8220832434763856, + "grad_norm": 0.2481028139591217, + "learning_rate": 0.00012483041171988354, + "loss": 2.136, + "step": 1906 + }, + { + "epoch": 0.8225145568255338, + "grad_norm": 0.15578563511371613, + "learning_rate": 0.00012480490026862672, + "loss": 2.0641, + "step": 1907 + }, + { + "epoch": 0.8229458701746819, + "grad_norm": 0.17702731490135193, + "learning_rate": 0.00012477937850472067, + "loss": 2.2588, + "step": 1908 + }, + { + "epoch": 0.82337718352383, + "grad_norm": 0.27015769481658936, + "learning_rate": 0.00012475384643344988, + "loss": 1.9521, + "step": 1909 + }, + { + "epoch": 0.8238084968729782, + "grad_norm": 0.18160024285316467, + "learning_rate": 0.0001247283040601011, + "loss": 2.3358, + "step": 1910 + }, + { + "epoch": 0.8242398102221263, + "grad_norm": 0.18464002013206482, + "learning_rate": 0.00012470275138996313, + "loss": 2.2393, + "step": 1911 + }, + { + "epoch": 0.8246711235712745, + "grad_norm": 0.1753547638654709, + "learning_rate": 0.00012467718842832696, + "loss": 2.2028, + "step": 1912 + }, + { + "epoch": 0.8251024369204227, + "grad_norm": 0.9519688487052917, + "learning_rate": 0.00012465161518048566, + "loss": 2.1816, + "step": 1913 + }, + { + "epoch": 0.8255337502695709, + "grad_norm": 0.1543843299150467, + "learning_rate": 0.0001246260316517345, + "loss": 2.0178, + "step": 1914 + }, + { + "epoch": 0.825965063618719, + "grad_norm": 0.1509864628314972, + "learning_rate": 0.0001246004378473707, + "loss": 2.2426, + "step": 1915 + }, + { + "epoch": 0.8263963769678672, + "grad_norm": 0.15210863947868347, + "learning_rate": 0.00012457483377269392, + "loss": 2.2984, + "step": 1916 + }, + { + "epoch": 0.8268276903170153, + "grad_norm": 0.18767836689949036, + "learning_rate": 0.0001245492194330056, + "loss": 2.2096, + "step": 1917 + }, + { + "epoch": 0.8272590036661635, + "grad_norm": 0.1672392636537552, + "learning_rate": 0.00012452359483360956, + "loss": 2.1577, + "step": 1918 + }, + { + "epoch": 0.8276903170153116, + "grad_norm": 0.1950211524963379, + "learning_rate": 0.00012449795997981162, + "loss": 2.1059, + "step": 1919 + }, + { + "epoch": 0.8281216303644597, + "grad_norm": 0.15297383069992065, + "learning_rate": 0.00012447231487691974, + "loss": 2.1318, + "step": 1920 + }, + { + "epoch": 0.8285529437136079, + "grad_norm": 0.16627302765846252, + "learning_rate": 0.0001244466595302441, + "loss": 2.2125, + "step": 1921 + }, + { + "epoch": 0.8289842570627561, + "grad_norm": 0.1622379869222641, + "learning_rate": 0.00012442099394509683, + "loss": 2.1177, + "step": 1922 + }, + { + "epoch": 0.8294155704119043, + "grad_norm": 0.1775934398174286, + "learning_rate": 0.0001243953181267923, + "loss": 2.1536, + "step": 1923 + }, + { + "epoch": 0.8298468837610524, + "grad_norm": 0.20350416004657745, + "learning_rate": 0.00012436963208064698, + "loss": 2.0901, + "step": 1924 + }, + { + "epoch": 0.8302781971102006, + "grad_norm": 0.16868053376674652, + "learning_rate": 0.00012434393581197943, + "loss": 2.0385, + "step": 1925 + }, + { + "epoch": 0.8302781971102006, + "eval_loss": 2.1118338108062744, + "eval_runtime": 199.066, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 1925 + }, + { + "epoch": 0.8307095104593487, + "grad_norm": 0.1899476796388626, + "learning_rate": 0.0001243182293261104, + "loss": 2.0417, + "step": 1926 + }, + { + "epoch": 0.8311408238084969, + "grad_norm": 0.1900355964899063, + "learning_rate": 0.0001242925126283626, + "loss": 2.2387, + "step": 1927 + }, + { + "epoch": 0.831572137157645, + "grad_norm": 0.16706283390522003, + "learning_rate": 0.00012426678572406106, + "loss": 2.2561, + "step": 1928 + }, + { + "epoch": 0.8320034505067931, + "grad_norm": 0.1549728959798813, + "learning_rate": 0.0001242410486185327, + "loss": 2.0268, + "step": 1929 + }, + { + "epoch": 0.8324347638559414, + "grad_norm": 0.19593092799186707, + "learning_rate": 0.00012421530131710677, + "loss": 2.1991, + "step": 1930 + }, + { + "epoch": 0.8328660772050895, + "grad_norm": 1.2008687257766724, + "learning_rate": 0.0001241895438251145, + "loss": 2.0635, + "step": 1931 + }, + { + "epoch": 0.8332973905542377, + "grad_norm": 0.19961075484752655, + "learning_rate": 0.00012416377614788923, + "loss": 2.2329, + "step": 1932 + }, + { + "epoch": 0.8337287039033858, + "grad_norm": 0.17825818061828613, + "learning_rate": 0.00012413799829076648, + "loss": 2.2581, + "step": 1933 + }, + { + "epoch": 0.834160017252534, + "grad_norm": 0.17735421657562256, + "learning_rate": 0.0001241122102590838, + "loss": 2.2634, + "step": 1934 + }, + { + "epoch": 0.8345913306016821, + "grad_norm": 0.15418070554733276, + "learning_rate": 0.00012408641205818093, + "loss": 2.2577, + "step": 1935 + }, + { + "epoch": 0.8350226439508303, + "grad_norm": 0.17420464754104614, + "learning_rate": 0.0001240606036933996, + "loss": 2.2465, + "step": 1936 + }, + { + "epoch": 0.8354539572999784, + "grad_norm": 0.17787966132164001, + "learning_rate": 0.00012403478517008382, + "loss": 2.2066, + "step": 1937 + }, + { + "epoch": 0.8358852706491265, + "grad_norm": 0.19807268679141998, + "learning_rate": 0.00012400895649357949, + "loss": 2.1749, + "step": 1938 + }, + { + "epoch": 0.8363165839982748, + "grad_norm": 0.17994818091392517, + "learning_rate": 0.00012398311766923477, + "loss": 2.1722, + "step": 1939 + }, + { + "epoch": 0.8367478973474229, + "grad_norm": 0.17680570483207703, + "learning_rate": 0.00012395726870239988, + "loss": 2.2587, + "step": 1940 + }, + { + "epoch": 0.8371792106965711, + "grad_norm": 0.19101424515247345, + "learning_rate": 0.00012393140959842712, + "loss": 2.1778, + "step": 1941 + }, + { + "epoch": 0.8376105240457192, + "grad_norm": 0.17281894385814667, + "learning_rate": 0.00012390554036267087, + "loss": 2.1408, + "step": 1942 + }, + { + "epoch": 0.8380418373948674, + "grad_norm": 0.18398477137088776, + "learning_rate": 0.00012387966100048771, + "loss": 2.3198, + "step": 1943 + }, + { + "epoch": 0.8384731507440155, + "grad_norm": 0.16956676542758942, + "learning_rate": 0.00012385377151723616, + "loss": 2.3611, + "step": 1944 + }, + { + "epoch": 0.8389044640931637, + "grad_norm": 0.17998960614204407, + "learning_rate": 0.00012382787191827696, + "loss": 2.1598, + "step": 1945 + }, + { + "epoch": 0.8393357774423118, + "grad_norm": 0.1674444079399109, + "learning_rate": 0.00012380196220897294, + "loss": 2.1449, + "step": 1946 + }, + { + "epoch": 0.83976709079146, + "grad_norm": 0.1496683955192566, + "learning_rate": 0.00012377604239468894, + "loss": 2.1855, + "step": 1947 + }, + { + "epoch": 0.8401984041406082, + "grad_norm": 0.17284353077411652, + "learning_rate": 0.00012375011248079196, + "loss": 2.1736, + "step": 1948 + }, + { + "epoch": 0.8406297174897563, + "grad_norm": 0.18436162173748016, + "learning_rate": 0.00012372417247265107, + "loss": 2.3424, + "step": 1949 + }, + { + "epoch": 0.8410610308389045, + "grad_norm": 0.19764438271522522, + "learning_rate": 0.0001236982223756374, + "loss": 2.2667, + "step": 1950 + }, + { + "epoch": 0.8410610308389045, + "eval_loss": 2.1127099990844727, + "eval_runtime": 199.1108, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 1950 + }, + { + "epoch": 0.8414923441880526, + "grad_norm": 0.19147905707359314, + "learning_rate": 0.00012367226219512427, + "loss": 2.1682, + "step": 1951 + }, + { + "epoch": 0.8419236575372008, + "grad_norm": 0.1907227337360382, + "learning_rate": 0.000123646291936487, + "loss": 2.2614, + "step": 1952 + }, + { + "epoch": 0.8423549708863489, + "grad_norm": 0.19571653008460999, + "learning_rate": 0.00012362031160510295, + "loss": 2.2148, + "step": 1953 + }, + { + "epoch": 0.8427862842354971, + "grad_norm": 0.17038114368915558, + "learning_rate": 0.0001235943212063517, + "loss": 2.1184, + "step": 1954 + }, + { + "epoch": 0.8432175975846452, + "grad_norm": 0.1803564429283142, + "learning_rate": 0.0001235683207456148, + "loss": 2.25, + "step": 1955 + }, + { + "epoch": 0.8436489109337934, + "grad_norm": 0.16071362793445587, + "learning_rate": 0.000123542310228276, + "loss": 2.2377, + "step": 1956 + }, + { + "epoch": 0.8440802242829416, + "grad_norm": 0.16572268307209015, + "learning_rate": 0.000123516289659721, + "loss": 2.4341, + "step": 1957 + }, + { + "epoch": 0.8445115376320897, + "grad_norm": 0.19113953411579132, + "learning_rate": 0.00012349025904533767, + "loss": 2.1922, + "step": 1958 + }, + { + "epoch": 0.8449428509812379, + "grad_norm": 0.22091901302337646, + "learning_rate": 0.00012346421839051587, + "loss": 2.4147, + "step": 1959 + }, + { + "epoch": 0.845374164330386, + "grad_norm": 0.18341030180454254, + "learning_rate": 0.00012343816770064771, + "loss": 2.3162, + "step": 1960 + }, + { + "epoch": 0.8458054776795342, + "grad_norm": 0.1765439510345459, + "learning_rate": 0.0001234121069811272, + "loss": 2.279, + "step": 1961 + }, + { + "epoch": 0.8462367910286823, + "grad_norm": 0.29950448870658875, + "learning_rate": 0.0001233860362373505, + "loss": 2.2917, + "step": 1962 + }, + { + "epoch": 0.8466681043778305, + "grad_norm": 0.18259882926940918, + "learning_rate": 0.0001233599554747159, + "loss": 2.1786, + "step": 1963 + }, + { + "epoch": 0.8470994177269786, + "grad_norm": 0.1746501475572586, + "learning_rate": 0.00012333386469862362, + "loss": 2.308, + "step": 1964 + }, + { + "epoch": 0.8475307310761268, + "grad_norm": 0.17510280013084412, + "learning_rate": 0.00012330776391447613, + "loss": 2.5226, + "step": 1965 + }, + { + "epoch": 0.847962044425275, + "grad_norm": 0.16180452704429626, + "learning_rate": 0.00012328165312767777, + "loss": 2.0962, + "step": 1966 + }, + { + "epoch": 0.8483933577744232, + "grad_norm": 0.166132852435112, + "learning_rate": 0.0001232555323436352, + "loss": 2.2975, + "step": 1967 + }, + { + "epoch": 0.8488246711235713, + "grad_norm": 0.17394210398197174, + "learning_rate": 0.0001232294015677569, + "loss": 2.2925, + "step": 1968 + }, + { + "epoch": 0.8492559844727194, + "grad_norm": 0.168202206492424, + "learning_rate": 0.0001232032608054536, + "loss": 2.0749, + "step": 1969 + }, + { + "epoch": 0.8496872978218676, + "grad_norm": 0.17130744457244873, + "learning_rate": 0.000123177110062138, + "loss": 2.2784, + "step": 1970 + }, + { + "epoch": 0.8501186111710157, + "grad_norm": 0.16785354912281036, + "learning_rate": 0.0001231509493432249, + "loss": 2.2474, + "step": 1971 + }, + { + "epoch": 0.8505499245201639, + "grad_norm": 0.18395166099071503, + "learning_rate": 0.00012312477865413117, + "loss": 2.1545, + "step": 1972 + }, + { + "epoch": 0.850981237869312, + "grad_norm": 0.16726191341876984, + "learning_rate": 0.00012309859800027574, + "loss": 2.1333, + "step": 1973 + }, + { + "epoch": 0.8514125512184602, + "grad_norm": 0.17289185523986816, + "learning_rate": 0.0001230724073870796, + "loss": 2.1889, + "step": 1974 + }, + { + "epoch": 0.8518438645676084, + "grad_norm": 0.16984088718891144, + "learning_rate": 0.00012304620681996578, + "loss": 2.2826, + "step": 1975 + }, + { + "epoch": 0.8518438645676084, + "eval_loss": 2.1102747917175293, + "eval_runtime": 198.868, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 1975 + }, + { + "epoch": 0.8522751779167566, + "grad_norm": 0.17141634225845337, + "learning_rate": 0.00012301999630435944, + "loss": 2.1729, + "step": 1976 + }, + { + "epoch": 0.8527064912659047, + "grad_norm": 0.17726927995681763, + "learning_rate": 0.00012299377584568772, + "loss": 2.1873, + "step": 1977 + }, + { + "epoch": 0.8531378046150528, + "grad_norm": 0.16618146002292633, + "learning_rate": 0.00012296754544937984, + "loss": 2.1897, + "step": 1978 + }, + { + "epoch": 0.853569117964201, + "grad_norm": 0.17402474582195282, + "learning_rate": 0.00012294130512086716, + "loss": 2.2492, + "step": 1979 + }, + { + "epoch": 0.8540004313133491, + "grad_norm": 0.16003389656543732, + "learning_rate": 0.00012291505486558291, + "loss": 2.232, + "step": 1980 + }, + { + "epoch": 0.8544317446624973, + "grad_norm": 0.17695653438568115, + "learning_rate": 0.00012288879468896262, + "loss": 2.1035, + "step": 1981 + }, + { + "epoch": 0.8548630580116454, + "grad_norm": 0.17198093235492706, + "learning_rate": 0.00012286252459644365, + "loss": 2.1935, + "step": 1982 + }, + { + "epoch": 0.8552943713607937, + "grad_norm": 0.1726846843957901, + "learning_rate": 0.00012283624459346557, + "loss": 2.1505, + "step": 1983 + }, + { + "epoch": 0.8557256847099418, + "grad_norm": 0.19706712663173676, + "learning_rate": 0.0001228099546854699, + "loss": 2.3153, + "step": 1984 + }, + { + "epoch": 0.85615699805909, + "grad_norm": 0.16633062064647675, + "learning_rate": 0.00012278365487790026, + "loss": 2.3088, + "step": 1985 + }, + { + "epoch": 0.8565883114082381, + "grad_norm": 0.1573697030544281, + "learning_rate": 0.00012275734517620235, + "loss": 2.1636, + "step": 1986 + }, + { + "epoch": 0.8570196247573862, + "grad_norm": 0.23323068022727966, + "learning_rate": 0.00012273102558582385, + "loss": 2.0344, + "step": 1987 + }, + { + "epoch": 0.8574509381065344, + "grad_norm": 0.22128650546073914, + "learning_rate": 0.00012270469611221454, + "loss": 2.297, + "step": 1988 + }, + { + "epoch": 0.8578822514556825, + "grad_norm": 0.16853579878807068, + "learning_rate": 0.00012267835676082616, + "loss": 2.2457, + "step": 1989 + }, + { + "epoch": 0.8583135648048307, + "grad_norm": 0.16581346094608307, + "learning_rate": 0.00012265200753711268, + "loss": 2.3109, + "step": 1990 + }, + { + "epoch": 0.8587448781539788, + "grad_norm": 0.19087955355644226, + "learning_rate": 0.00012262564844652988, + "loss": 2.332, + "step": 1991 + }, + { + "epoch": 0.8591761915031271, + "grad_norm": 0.19937820732593536, + "learning_rate": 0.00012259927949453576, + "loss": 2.2953, + "step": 1992 + }, + { + "epoch": 0.8596075048522752, + "grad_norm": 0.1905757039785385, + "learning_rate": 0.00012257290068659027, + "loss": 2.2003, + "step": 1993 + }, + { + "epoch": 0.8600388182014234, + "grad_norm": 0.15634717047214508, + "learning_rate": 0.00012254651202815543, + "loss": 2.1735, + "step": 1994 + }, + { + "epoch": 0.8604701315505715, + "grad_norm": 0.16686885058879852, + "learning_rate": 0.00012252011352469535, + "loss": 2.3553, + "step": 1995 + }, + { + "epoch": 0.8609014448997196, + "grad_norm": 0.17841187119483948, + "learning_rate": 0.00012249370518167607, + "loss": 2.1238, + "step": 1996 + }, + { + "epoch": 0.8613327582488678, + "grad_norm": 0.49504896998405457, + "learning_rate": 0.00012246728700456578, + "loss": 1.7593, + "step": 1997 + }, + { + "epoch": 0.8617640715980159, + "grad_norm": 0.17972888052463531, + "learning_rate": 0.0001224408589988346, + "loss": 2.1113, + "step": 1998 + }, + { + "epoch": 0.8621953849471641, + "grad_norm": 0.18801777064800262, + "learning_rate": 0.00012241442116995476, + "loss": 2.2476, + "step": 1999 + }, + { + "epoch": 0.8626266982963122, + "grad_norm": 0.18260492384433746, + "learning_rate": 0.00012238797352340052, + "loss": 2.34, + "step": 2000 + }, + { + "epoch": 0.8626266982963122, + "eval_loss": 2.109143018722534, + "eval_runtime": 200.0437, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 2000 + }, + { + "epoch": 0.8630580116454605, + "grad_norm": 0.16812659800052643, + "learning_rate": 0.0001223615160646481, + "loss": 2.2668, + "step": 2001 + }, + { + "epoch": 0.8634893249946086, + "grad_norm": 0.17180365324020386, + "learning_rate": 0.00012233504879917588, + "loss": 2.1786, + "step": 2002 + }, + { + "epoch": 0.8639206383437568, + "grad_norm": 0.16643165051937103, + "learning_rate": 0.00012230857173246412, + "loss": 2.1149, + "step": 2003 + }, + { + "epoch": 0.8643519516929049, + "grad_norm": 0.18042191863059998, + "learning_rate": 0.00012228208486999526, + "loss": 1.8244, + "step": 2004 + }, + { + "epoch": 0.864783265042053, + "grad_norm": 0.18224523961544037, + "learning_rate": 0.00012225558821725362, + "loss": 2.2373, + "step": 2005 + }, + { + "epoch": 0.8652145783912012, + "grad_norm": 0.17160971462726593, + "learning_rate": 0.0001222290817797257, + "loss": 2.1969, + "step": 2006 + }, + { + "epoch": 0.8656458917403493, + "grad_norm": 0.15949644148349762, + "learning_rate": 0.0001222025655628999, + "loss": 2.0821, + "step": 2007 + }, + { + "epoch": 0.8660772050894975, + "grad_norm": 0.20599953830242157, + "learning_rate": 0.00012217603957226668, + "loss": 2.4422, + "step": 2008 + }, + { + "epoch": 0.8665085184386456, + "grad_norm": 0.17190849781036377, + "learning_rate": 0.0001221495038133186, + "loss": 2.2824, + "step": 2009 + }, + { + "epoch": 0.8669398317877939, + "grad_norm": 0.17235378921031952, + "learning_rate": 0.0001221229582915501, + "loss": 2.1513, + "step": 2010 + }, + { + "epoch": 0.867371145136942, + "grad_norm": 0.18004624545574188, + "learning_rate": 0.00012209640301245775, + "loss": 2.3513, + "step": 2011 + }, + { + "epoch": 0.8678024584860902, + "grad_norm": 0.16808605194091797, + "learning_rate": 0.00012206983798154016, + "loss": 2.2083, + "step": 2012 + }, + { + "epoch": 0.8682337718352383, + "grad_norm": 0.1684333235025406, + "learning_rate": 0.00012204326320429784, + "loss": 2.0846, + "step": 2013 + }, + { + "epoch": 0.8686650851843865, + "grad_norm": 0.17363044619560242, + "learning_rate": 0.00012201667868623343, + "loss": 2.2877, + "step": 2014 + }, + { + "epoch": 0.8690963985335346, + "grad_norm": 0.15960228443145752, + "learning_rate": 0.00012199008443285151, + "loss": 2.0895, + "step": 2015 + }, + { + "epoch": 0.8695277118826827, + "grad_norm": 0.16592063009738922, + "learning_rate": 0.00012196348044965876, + "loss": 2.164, + "step": 2016 + }, + { + "epoch": 0.8699590252318309, + "grad_norm": 0.18730241060256958, + "learning_rate": 0.00012193686674216378, + "loss": 2.2974, + "step": 2017 + }, + { + "epoch": 0.870390338580979, + "grad_norm": 0.17575466632843018, + "learning_rate": 0.00012191024331587727, + "loss": 2.0559, + "step": 2018 + }, + { + "epoch": 0.8708216519301273, + "grad_norm": 0.1630849540233612, + "learning_rate": 0.00012188361017631185, + "loss": 2.2679, + "step": 2019 + }, + { + "epoch": 0.8712529652792754, + "grad_norm": 0.16414892673492432, + "learning_rate": 0.00012185696732898226, + "loss": 2.1962, + "step": 2020 + }, + { + "epoch": 0.8716842786284236, + "grad_norm": 0.1663442999124527, + "learning_rate": 0.00012183031477940516, + "loss": 2.2859, + "step": 2021 + }, + { + "epoch": 0.8721155919775717, + "grad_norm": 0.16488784551620483, + "learning_rate": 0.00012180365253309924, + "loss": 2.2866, + "step": 2022 + }, + { + "epoch": 0.8725469053267199, + "grad_norm": 0.19378119707107544, + "learning_rate": 0.00012177698059558524, + "loss": 2.211, + "step": 2023 + }, + { + "epoch": 0.872978218675868, + "grad_norm": 0.1780066192150116, + "learning_rate": 0.00012175029897238585, + "loss": 2.3226, + "step": 2024 + }, + { + "epoch": 0.8734095320250161, + "grad_norm": 0.1780560314655304, + "learning_rate": 0.00012172360766902583, + "loss": 2.2509, + "step": 2025 + }, + { + "epoch": 0.8734095320250161, + "eval_loss": 2.1096911430358887, + "eval_runtime": 200.3092, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 2025 + }, + { + "epoch": 0.8738408453741643, + "grad_norm": 0.17343544960021973, + "learning_rate": 0.00012169690669103187, + "loss": 2.303, + "step": 2026 + }, + { + "epoch": 0.8742721587233124, + "grad_norm": 0.18043680489063263, + "learning_rate": 0.00012167019604393272, + "loss": 2.2922, + "step": 2027 + }, + { + "epoch": 0.8747034720724607, + "grad_norm": 0.17327257990837097, + "learning_rate": 0.00012164347573325909, + "loss": 2.1745, + "step": 2028 + }, + { + "epoch": 0.8751347854216088, + "grad_norm": 0.19838827848434448, + "learning_rate": 0.00012161674576454372, + "loss": 2.1947, + "step": 2029 + }, + { + "epoch": 0.875566098770757, + "grad_norm": 0.1931244134902954, + "learning_rate": 0.00012159000614332135, + "loss": 2.2768, + "step": 2030 + }, + { + "epoch": 0.8759974121199051, + "grad_norm": 0.20393235981464386, + "learning_rate": 0.00012156325687512871, + "loss": 2.233, + "step": 2031 + }, + { + "epoch": 0.8764287254690533, + "grad_norm": 0.18255698680877686, + "learning_rate": 0.00012153649796550453, + "loss": 2.0384, + "step": 2032 + }, + { + "epoch": 0.8768600388182014, + "grad_norm": 0.17171481251716614, + "learning_rate": 0.00012150972941998953, + "loss": 2.1993, + "step": 2033 + }, + { + "epoch": 0.8772913521673495, + "grad_norm": 0.1640217900276184, + "learning_rate": 0.00012148295124412645, + "loss": 2.0965, + "step": 2034 + }, + { + "epoch": 0.8777226655164977, + "grad_norm": 0.17884527146816254, + "learning_rate": 0.00012145616344345996, + "loss": 2.365, + "step": 2035 + }, + { + "epoch": 0.878153978865646, + "grad_norm": 0.15875819325447083, + "learning_rate": 0.00012142936602353679, + "loss": 2.0765, + "step": 2036 + }, + { + "epoch": 0.8785852922147941, + "grad_norm": 0.17063796520233154, + "learning_rate": 0.00012140255898990565, + "loss": 2.2196, + "step": 2037 + }, + { + "epoch": 0.8790166055639422, + "grad_norm": 0.1654708832502365, + "learning_rate": 0.00012137574234811722, + "loss": 2.1843, + "step": 2038 + }, + { + "epoch": 0.8794479189130904, + "grad_norm": 0.15875574946403503, + "learning_rate": 0.0001213489161037242, + "loss": 2.2028, + "step": 2039 + }, + { + "epoch": 0.8798792322622385, + "grad_norm": 0.16864518821239471, + "learning_rate": 0.00012132208026228122, + "loss": 2.1658, + "step": 2040 + }, + { + "epoch": 0.8803105456113867, + "grad_norm": 0.18385015428066254, + "learning_rate": 0.00012129523482934496, + "loss": 2.2669, + "step": 2041 + }, + { + "epoch": 0.8807418589605348, + "grad_norm": 0.17301145195960999, + "learning_rate": 0.00012126837981047405, + "loss": 2.1845, + "step": 2042 + }, + { + "epoch": 0.881173172309683, + "grad_norm": 0.17012512683868408, + "learning_rate": 0.00012124151521122911, + "loss": 2.2641, + "step": 2043 + }, + { + "epoch": 0.8816044856588311, + "grad_norm": 0.1657698154449463, + "learning_rate": 0.00012121464103717276, + "loss": 2.2242, + "step": 2044 + }, + { + "epoch": 0.8820357990079793, + "grad_norm": 0.19457809627056122, + "learning_rate": 0.0001211877572938696, + "loss": 1.8951, + "step": 2045 + }, + { + "epoch": 0.8824671123571275, + "grad_norm": 0.1696557104587555, + "learning_rate": 0.00012116086398688617, + "loss": 2.1189, + "step": 2046 + }, + { + "epoch": 0.8828984257062756, + "grad_norm": 0.19344045221805573, + "learning_rate": 0.00012113396112179105, + "loss": 2.0996, + "step": 2047 + }, + { + "epoch": 0.8833297390554238, + "grad_norm": 0.1660347431898117, + "learning_rate": 0.0001211070487041548, + "loss": 2.2841, + "step": 2048 + }, + { + "epoch": 0.8837610524045719, + "grad_norm": 0.18354149162769318, + "learning_rate": 0.00012108012673954987, + "loss": 2.1996, + "step": 2049 + }, + { + "epoch": 0.8841923657537201, + "grad_norm": 0.19460587203502655, + "learning_rate": 0.00012105319523355076, + "loss": 2.3914, + "step": 2050 + }, + { + "epoch": 0.8841923657537201, + "eval_loss": 2.1087186336517334, + "eval_runtime": 199.8968, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 2050 + }, + { + "epoch": 0.8846236791028682, + "grad_norm": 0.1793028861284256, + "learning_rate": 0.00012102625419173396, + "loss": 2.1765, + "step": 2051 + }, + { + "epoch": 0.8850549924520164, + "grad_norm": 0.17265355587005615, + "learning_rate": 0.00012099930361967791, + "loss": 2.2571, + "step": 2052 + }, + { + "epoch": 0.8854863058011645, + "grad_norm": 0.1725883036851883, + "learning_rate": 0.00012097234352296297, + "loss": 2.324, + "step": 2053 + }, + { + "epoch": 0.8859176191503128, + "grad_norm": 0.19926157593727112, + "learning_rate": 0.00012094537390717161, + "loss": 2.14, + "step": 2054 + }, + { + "epoch": 0.8863489324994609, + "grad_norm": 0.17119097709655762, + "learning_rate": 0.0001209183947778881, + "loss": 2.2903, + "step": 2055 + }, + { + "epoch": 0.886780245848609, + "grad_norm": 0.1708480715751648, + "learning_rate": 0.00012089140614069881, + "loss": 2.2251, + "step": 2056 + }, + { + "epoch": 0.8872115591977572, + "grad_norm": 0.17137430608272552, + "learning_rate": 0.00012086440800119201, + "loss": 2.2214, + "step": 2057 + }, + { + "epoch": 0.8876428725469053, + "grad_norm": 0.16498853266239166, + "learning_rate": 0.00012083740036495796, + "loss": 1.9813, + "step": 2058 + }, + { + "epoch": 0.8880741858960535, + "grad_norm": 0.15586769580841064, + "learning_rate": 0.0001208103832375889, + "loss": 2.2625, + "step": 2059 + }, + { + "epoch": 0.8885054992452016, + "grad_norm": 0.19154292345046997, + "learning_rate": 0.00012078335662467903, + "loss": 2.235, + "step": 2060 + }, + { + "epoch": 0.8889368125943498, + "grad_norm": 0.1716182976961136, + "learning_rate": 0.00012075632053182447, + "loss": 2.1366, + "step": 2061 + }, + { + "epoch": 0.8893681259434979, + "grad_norm": 0.16451750695705414, + "learning_rate": 0.00012072927496462338, + "loss": 2.3751, + "step": 2062 + }, + { + "epoch": 0.8897994392926462, + "grad_norm": 0.16332577168941498, + "learning_rate": 0.00012070221992867581, + "loss": 2.1399, + "step": 2063 + }, + { + "epoch": 0.8902307526417943, + "grad_norm": 0.18729786574840546, + "learning_rate": 0.00012067515542958383, + "loss": 2.3277, + "step": 2064 + }, + { + "epoch": 0.8906620659909424, + "grad_norm": 0.15769799053668976, + "learning_rate": 0.00012064808147295142, + "loss": 2.2074, + "step": 2065 + }, + { + "epoch": 0.8910933793400906, + "grad_norm": 0.17490021884441376, + "learning_rate": 0.00012062099806438454, + "loss": 2.0525, + "step": 2066 + }, + { + "epoch": 0.8915246926892387, + "grad_norm": 0.16716617345809937, + "learning_rate": 0.00012059390520949108, + "loss": 2.2348, + "step": 2067 + }, + { + "epoch": 0.8919560060383869, + "grad_norm": 0.15750347077846527, + "learning_rate": 0.00012056680291388097, + "loss": 2.3384, + "step": 2068 + }, + { + "epoch": 0.892387319387535, + "grad_norm": 0.16338370740413666, + "learning_rate": 0.00012053969118316599, + "loss": 2.3072, + "step": 2069 + }, + { + "epoch": 0.8928186327366832, + "grad_norm": 0.17863647639751434, + "learning_rate": 0.00012051257002295996, + "loss": 2.2572, + "step": 2070 + }, + { + "epoch": 0.8932499460858313, + "grad_norm": 0.17093229293823242, + "learning_rate": 0.00012048543943887858, + "loss": 2.0799, + "step": 2071 + }, + { + "epoch": 0.8936812594349796, + "grad_norm": 0.16067644953727722, + "learning_rate": 0.00012045829943653953, + "loss": 2.1233, + "step": 2072 + }, + { + "epoch": 0.8941125727841277, + "grad_norm": 0.1740393340587616, + "learning_rate": 0.00012043115002156247, + "loss": 1.9874, + "step": 2073 + }, + { + "epoch": 0.8945438861332758, + "grad_norm": 0.223533496260643, + "learning_rate": 0.00012040399119956896, + "loss": 2.2838, + "step": 2074 + }, + { + "epoch": 0.894975199482424, + "grad_norm": 0.17270897328853607, + "learning_rate": 0.00012037682297618256, + "loss": 2.1209, + "step": 2075 + }, + { + "epoch": 0.894975199482424, + "eval_loss": 2.1082229614257812, + "eval_runtime": 205.4679, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 2075 + }, + { + "epoch": 0.8954065128315721, + "grad_norm": 0.177972212433815, + "learning_rate": 0.00012034964535702873, + "loss": 2.1077, + "step": 2076 + }, + { + "epoch": 0.8958378261807203, + "grad_norm": 0.16358588635921478, + "learning_rate": 0.00012032245834773488, + "loss": 2.1419, + "step": 2077 + }, + { + "epoch": 0.8962691395298684, + "grad_norm": 0.17186710238456726, + "learning_rate": 0.00012029526195393039, + "loss": 2.3137, + "step": 2078 + }, + { + "epoch": 0.8967004528790166, + "grad_norm": 0.19588297605514526, + "learning_rate": 0.00012026805618124658, + "loss": 2.3633, + "step": 2079 + }, + { + "epoch": 0.8971317662281647, + "grad_norm": 0.18950992822647095, + "learning_rate": 0.0001202408410353167, + "loss": 2.2519, + "step": 2080 + }, + { + "epoch": 0.897563079577313, + "grad_norm": 0.14333821833133698, + "learning_rate": 0.00012021361652177593, + "loss": 2.2314, + "step": 2081 + }, + { + "epoch": 0.8979943929264611, + "grad_norm": 0.16006465256214142, + "learning_rate": 0.00012018638264626143, + "loss": 2.309, + "step": 2082 + }, + { + "epoch": 0.8984257062756092, + "grad_norm": 0.1798335760831833, + "learning_rate": 0.00012015913941441222, + "loss": 2.0984, + "step": 2083 + }, + { + "epoch": 0.8988570196247574, + "grad_norm": 0.17910893261432648, + "learning_rate": 0.00012013188683186934, + "loss": 2.1577, + "step": 2084 + }, + { + "epoch": 0.8992883329739055, + "grad_norm": 0.16627615690231323, + "learning_rate": 0.00012010462490427577, + "loss": 2.1262, + "step": 2085 + }, + { + "epoch": 0.8997196463230537, + "grad_norm": 0.18889717757701874, + "learning_rate": 0.00012007735363727633, + "loss": 2.3386, + "step": 2086 + }, + { + "epoch": 0.9001509596722018, + "grad_norm": 0.15616723895072937, + "learning_rate": 0.00012005007303651789, + "loss": 2.2993, + "step": 2087 + }, + { + "epoch": 0.90058227302135, + "grad_norm": 0.1719571053981781, + "learning_rate": 0.00012002278310764913, + "loss": 2.5332, + "step": 2088 + }, + { + "epoch": 0.9010135863704982, + "grad_norm": 0.18990328907966614, + "learning_rate": 0.0001199954838563208, + "loss": 2.4486, + "step": 2089 + }, + { + "epoch": 0.9014448997196464, + "grad_norm": 0.16839830577373505, + "learning_rate": 0.00011996817528818547, + "loss": 2.0224, + "step": 2090 + }, + { + "epoch": 0.9018762130687945, + "grad_norm": 0.15824031829833984, + "learning_rate": 0.00011994085740889769, + "loss": 2.2087, + "step": 2091 + }, + { + "epoch": 0.9023075264179427, + "grad_norm": 0.169529989361763, + "learning_rate": 0.00011991353022411388, + "loss": 2.1402, + "step": 2092 + }, + { + "epoch": 0.9027388397670908, + "grad_norm": 0.16205726563930511, + "learning_rate": 0.0001198861937394925, + "loss": 2.3329, + "step": 2093 + }, + { + "epoch": 0.9031701531162389, + "grad_norm": 0.21548905968666077, + "learning_rate": 0.00011985884796069387, + "loss": 2.3336, + "step": 2094 + }, + { + "epoch": 0.9036014664653871, + "grad_norm": 0.1582777351140976, + "learning_rate": 0.00011983149289338015, + "loss": 2.2334, + "step": 2095 + }, + { + "epoch": 0.9040327798145352, + "grad_norm": 0.17056939005851746, + "learning_rate": 0.00011980412854321559, + "loss": 2.2711, + "step": 2096 + }, + { + "epoch": 0.9044640931636834, + "grad_norm": 0.17518271505832672, + "learning_rate": 0.00011977675491586621, + "loss": 2.0697, + "step": 2097 + }, + { + "epoch": 0.9048954065128316, + "grad_norm": 0.16619044542312622, + "learning_rate": 0.00011974937201700009, + "loss": 2.3312, + "step": 2098 + }, + { + "epoch": 0.9053267198619798, + "grad_norm": 0.18111206591129303, + "learning_rate": 0.0001197219798522871, + "loss": 2.1856, + "step": 2099 + }, + { + "epoch": 0.9057580332111279, + "grad_norm": 0.1699441820383072, + "learning_rate": 0.00011969457842739911, + "loss": 2.2673, + "step": 2100 + }, + { + "epoch": 0.9057580332111279, + "eval_loss": 2.1081807613372803, + "eval_runtime": 201.1293, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 2100 + }, + { + "epoch": 0.906189346560276, + "grad_norm": 0.1888381838798523, + "learning_rate": 0.00011966716774800989, + "loss": 2.422, + "step": 2101 + }, + { + "epoch": 0.9066206599094242, + "grad_norm": 0.186203271150589, + "learning_rate": 0.00011963974781979511, + "loss": 2.0778, + "step": 2102 + }, + { + "epoch": 0.9070519732585723, + "grad_norm": 0.17752569913864136, + "learning_rate": 0.00011961231864843241, + "loss": 2.3665, + "step": 2103 + }, + { + "epoch": 0.9074832866077205, + "grad_norm": 0.1868693083524704, + "learning_rate": 0.00011958488023960123, + "loss": 2.0341, + "step": 2104 + }, + { + "epoch": 0.9079145999568686, + "grad_norm": 0.17515556514263153, + "learning_rate": 0.00011955743259898302, + "loss": 2.1787, + "step": 2105 + }, + { + "epoch": 0.9083459133060168, + "grad_norm": 0.15530408918857574, + "learning_rate": 0.00011952997573226113, + "loss": 2.1034, + "step": 2106 + }, + { + "epoch": 0.908777226655165, + "grad_norm": 0.1727735847234726, + "learning_rate": 0.0001195025096451208, + "loss": 2.2956, + "step": 2107 + }, + { + "epoch": 0.9092085400043132, + "grad_norm": 0.1671399176120758, + "learning_rate": 0.00011947503434324917, + "loss": 2.3519, + "step": 2108 + }, + { + "epoch": 0.9096398533534613, + "grad_norm": 0.16980357468128204, + "learning_rate": 0.0001194475498323353, + "loss": 2.1494, + "step": 2109 + }, + { + "epoch": 0.9100711667026095, + "grad_norm": 0.16269958019256592, + "learning_rate": 0.00011942005611807017, + "loss": 2.1425, + "step": 2110 + }, + { + "epoch": 0.9105024800517576, + "grad_norm": 0.20038245618343353, + "learning_rate": 0.00011939255320614663, + "loss": 2.3136, + "step": 2111 + }, + { + "epoch": 0.9109337934009057, + "grad_norm": 0.1741410195827484, + "learning_rate": 0.00011936504110225953, + "loss": 2.2522, + "step": 2112 + }, + { + "epoch": 0.9113651067500539, + "grad_norm": 0.17310325801372528, + "learning_rate": 0.00011933751981210548, + "loss": 2.1328, + "step": 2113 + }, + { + "epoch": 0.911796420099202, + "grad_norm": 0.19136282801628113, + "learning_rate": 0.00011930998934138306, + "loss": 2.2303, + "step": 2114 + }, + { + "epoch": 0.9122277334483502, + "grad_norm": 0.18183456361293793, + "learning_rate": 0.00011928244969579281, + "loss": 2.3831, + "step": 2115 + }, + { + "epoch": 0.9126590467974984, + "grad_norm": 0.1687455177307129, + "learning_rate": 0.00011925490088103708, + "loss": 2.2036, + "step": 2116 + }, + { + "epoch": 0.9130903601466466, + "grad_norm": 0.16309991478919983, + "learning_rate": 0.00011922734290282017, + "loss": 2.1591, + "step": 2117 + }, + { + "epoch": 0.9135216734957947, + "grad_norm": 0.16328085958957672, + "learning_rate": 0.00011919977576684822, + "loss": 2.2273, + "step": 2118 + }, + { + "epoch": 0.9139529868449429, + "grad_norm": 0.21833649277687073, + "learning_rate": 0.00011917219947882936, + "loss": 2.3208, + "step": 2119 + }, + { + "epoch": 0.914384300194091, + "grad_norm": 0.19529038667678833, + "learning_rate": 0.00011914461404447352, + "loss": 2.2636, + "step": 2120 + }, + { + "epoch": 0.9148156135432391, + "grad_norm": 0.16871854662895203, + "learning_rate": 0.00011911701946949259, + "loss": 2.2352, + "step": 2121 + }, + { + "epoch": 0.9152469268923873, + "grad_norm": 0.19284936785697937, + "learning_rate": 0.00011908941575960034, + "loss": 2.3068, + "step": 2122 + }, + { + "epoch": 0.9156782402415354, + "grad_norm": 0.1705864518880844, + "learning_rate": 0.0001190618029205124, + "loss": 2.1109, + "step": 2123 + }, + { + "epoch": 0.9161095535906836, + "grad_norm": 0.19411548972129822, + "learning_rate": 0.00011903418095794631, + "loss": 2.3403, + "step": 2124 + }, + { + "epoch": 0.9165408669398318, + "grad_norm": 0.16737037897109985, + "learning_rate": 0.00011900654987762148, + "loss": 2.3417, + "step": 2125 + }, + { + "epoch": 0.9165408669398318, + "eval_loss": 2.1069459915161133, + "eval_runtime": 201.2256, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 2125 + }, + { + "epoch": 0.91697218028898, + "grad_norm": 0.1877131313085556, + "learning_rate": 0.00011897890968525929, + "loss": 2.2791, + "step": 2126 + }, + { + "epoch": 0.9174034936381281, + "grad_norm": 0.17505024373531342, + "learning_rate": 0.00011895126038658288, + "loss": 2.2123, + "step": 2127 + }, + { + "epoch": 0.9178348069872763, + "grad_norm": 0.1770326942205429, + "learning_rate": 0.00011892360198731738, + "loss": 2.3355, + "step": 2128 + }, + { + "epoch": 0.9182661203364244, + "grad_norm": 0.1650460958480835, + "learning_rate": 0.00011889593449318973, + "loss": 2.2143, + "step": 2129 + }, + { + "epoch": 0.9186974336855726, + "grad_norm": 0.18563257157802582, + "learning_rate": 0.00011886825790992882, + "loss": 2.1454, + "step": 2130 + }, + { + "epoch": 0.9191287470347207, + "grad_norm": 0.16880924999713898, + "learning_rate": 0.00011884057224326537, + "loss": 2.1986, + "step": 2131 + }, + { + "epoch": 0.9195600603838688, + "grad_norm": 0.15574918687343597, + "learning_rate": 0.000118812877498932, + "loss": 2.3588, + "step": 2132 + }, + { + "epoch": 0.919991373733017, + "grad_norm": 0.18395072221755981, + "learning_rate": 0.00011878517368266322, + "loss": 2.1824, + "step": 2133 + }, + { + "epoch": 0.9204226870821652, + "grad_norm": 0.17193692922592163, + "learning_rate": 0.00011875746080019538, + "loss": 2.2512, + "step": 2134 + }, + { + "epoch": 0.9208540004313134, + "grad_norm": 0.16654035449028015, + "learning_rate": 0.0001187297388572668, + "loss": 2.1752, + "step": 2135 + }, + { + "epoch": 0.9212853137804615, + "grad_norm": 0.14945533871650696, + "learning_rate": 0.00011870200785961754, + "loss": 2.1153, + "step": 2136 + }, + { + "epoch": 0.9217166271296097, + "grad_norm": 0.174434095621109, + "learning_rate": 0.00011867426781298962, + "loss": 2.1908, + "step": 2137 + }, + { + "epoch": 0.9221479404787578, + "grad_norm": 0.1762343943119049, + "learning_rate": 0.00011864651872312697, + "loss": 2.482, + "step": 2138 + }, + { + "epoch": 0.922579253827906, + "grad_norm": 0.2071942687034607, + "learning_rate": 0.0001186187605957753, + "loss": 2.175, + "step": 2139 + }, + { + "epoch": 0.9230105671770541, + "grad_norm": 0.16858017444610596, + "learning_rate": 0.00011859099343668223, + "loss": 2.136, + "step": 2140 + }, + { + "epoch": 0.9234418805262022, + "grad_norm": 0.17030109465122223, + "learning_rate": 0.00011856321725159727, + "loss": 2.2699, + "step": 2141 + }, + { + "epoch": 0.9238731938753505, + "grad_norm": 0.183243066072464, + "learning_rate": 0.00011853543204627178, + "loss": 2.135, + "step": 2142 + }, + { + "epoch": 0.9243045072244986, + "grad_norm": 0.18901191651821136, + "learning_rate": 0.00011850763782645899, + "loss": 2.1073, + "step": 2143 + }, + { + "epoch": 0.9247358205736468, + "grad_norm": 0.2096487283706665, + "learning_rate": 0.000118479834597914, + "loss": 2.363, + "step": 2144 + }, + { + "epoch": 0.9251671339227949, + "grad_norm": 0.16720110177993774, + "learning_rate": 0.00011845202236639378, + "loss": 2.0968, + "step": 2145 + }, + { + "epoch": 0.9255984472719431, + "grad_norm": 0.1674879938364029, + "learning_rate": 0.00011842420113765712, + "loss": 2.2427, + "step": 2146 + }, + { + "epoch": 0.9260297606210912, + "grad_norm": 0.18135453760623932, + "learning_rate": 0.00011839637091746476, + "loss": 2.2825, + "step": 2147 + }, + { + "epoch": 0.9264610739702394, + "grad_norm": 0.21567225456237793, + "learning_rate": 0.0001183685317115792, + "loss": 2.2642, + "step": 2148 + }, + { + "epoch": 0.9268923873193875, + "grad_norm": 0.18386003375053406, + "learning_rate": 0.0001183406835257649, + "loss": 2.1699, + "step": 2149 + }, + { + "epoch": 0.9273237006685356, + "grad_norm": 0.17996805906295776, + "learning_rate": 0.0001183128263657881, + "loss": 2.1956, + "step": 2150 + }, + { + "epoch": 0.9273237006685356, + "eval_loss": 2.106332302093506, + "eval_runtime": 201.7299, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 2150 + }, + { + "epoch": 0.9277550140176839, + "grad_norm": 0.1627648025751114, + "learning_rate": 0.00011828496023741696, + "loss": 2.0021, + "step": 2151 + }, + { + "epoch": 0.928186327366832, + "grad_norm": 0.17289264500141144, + "learning_rate": 0.00011825708514642141, + "loss": 2.0195, + "step": 2152 + }, + { + "epoch": 0.9286176407159802, + "grad_norm": 0.1960097998380661, + "learning_rate": 0.00011822920109857335, + "loss": 2.1659, + "step": 2153 + }, + { + "epoch": 0.9290489540651283, + "grad_norm": 0.16027681529521942, + "learning_rate": 0.00011820130809964646, + "loss": 2.2256, + "step": 2154 + }, + { + "epoch": 0.9294802674142765, + "grad_norm": 0.18063746392726898, + "learning_rate": 0.00011817340615541629, + "loss": 2.1708, + "step": 2155 + }, + { + "epoch": 0.9299115807634246, + "grad_norm": 0.1574932336807251, + "learning_rate": 0.00011814549527166022, + "loss": 1.9852, + "step": 2156 + }, + { + "epoch": 0.9303428941125728, + "grad_norm": 0.16979452967643738, + "learning_rate": 0.00011811757545415755, + "loss": 2.1747, + "step": 2157 + }, + { + "epoch": 0.9307742074617209, + "grad_norm": 0.16161035001277924, + "learning_rate": 0.00011808964670868934, + "loss": 2.2419, + "step": 2158 + }, + { + "epoch": 0.931205520810869, + "grad_norm": 0.17380760610103607, + "learning_rate": 0.00011806170904103855, + "loss": 2.0742, + "step": 2159 + }, + { + "epoch": 0.9316368341600173, + "grad_norm": 0.1760004758834839, + "learning_rate": 0.00011803376245699, + "loss": 2.0483, + "step": 2160 + }, + { + "epoch": 0.9320681475091654, + "grad_norm": 0.1811494380235672, + "learning_rate": 0.00011800580696233032, + "loss": 1.969, + "step": 2161 + }, + { + "epoch": 0.9324994608583136, + "grad_norm": 0.1914118230342865, + "learning_rate": 0.000117977842562848, + "loss": 2.1789, + "step": 2162 + }, + { + "epoch": 0.9329307742074617, + "grad_norm": 0.17660760879516602, + "learning_rate": 0.0001179498692643334, + "loss": 1.9431, + "step": 2163 + }, + { + "epoch": 0.9333620875566099, + "grad_norm": 0.20114588737487793, + "learning_rate": 0.00011792188707257865, + "loss": 2.3242, + "step": 2164 + }, + { + "epoch": 0.933793400905758, + "grad_norm": 0.18631604313850403, + "learning_rate": 0.0001178938959933778, + "loss": 2.2464, + "step": 2165 + }, + { + "epoch": 0.9342247142549062, + "grad_norm": 0.18342387676239014, + "learning_rate": 0.00011786589603252672, + "loss": 2.148, + "step": 2166 + }, + { + "epoch": 0.9346560276040543, + "grad_norm": 0.18505950272083282, + "learning_rate": 0.00011783788719582309, + "loss": 2.1848, + "step": 2167 + }, + { + "epoch": 0.9350873409532025, + "grad_norm": 0.16741490364074707, + "learning_rate": 0.00011780986948906645, + "loss": 2.3077, + "step": 2168 + }, + { + "epoch": 0.9355186543023507, + "grad_norm": 0.19790899753570557, + "learning_rate": 0.00011778184291805816, + "loss": 2.2925, + "step": 2169 + }, + { + "epoch": 0.9359499676514988, + "grad_norm": 0.18894536793231964, + "learning_rate": 0.00011775380748860145, + "loss": 2.3668, + "step": 2170 + }, + { + "epoch": 0.936381281000647, + "grad_norm": 0.19968891143798828, + "learning_rate": 0.00011772576320650135, + "loss": 2.2531, + "step": 2171 + }, + { + "epoch": 0.9368125943497951, + "grad_norm": 0.18062502145767212, + "learning_rate": 0.00011769771007756472, + "loss": 2.1539, + "step": 2172 + }, + { + "epoch": 0.9372439076989433, + "grad_norm": 0.1887185275554657, + "learning_rate": 0.0001176696481076003, + "loss": 2.3336, + "step": 2173 + }, + { + "epoch": 0.9376752210480914, + "grad_norm": 0.1643417924642563, + "learning_rate": 0.00011764157730241863, + "loss": 2.2023, + "step": 2174 + }, + { + "epoch": 0.9381065343972396, + "grad_norm": 0.17743735015392303, + "learning_rate": 0.00011761349766783202, + "loss": 2.3152, + "step": 2175 + }, + { + "epoch": 0.9381065343972396, + "eval_loss": 2.1060848236083984, + "eval_runtime": 202.8585, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 2175 + }, + { + "epoch": 0.9385378477463877, + "grad_norm": 0.15262340009212494, + "learning_rate": 0.00011758540920965472, + "loss": 2.1931, + "step": 2176 + }, + { + "epoch": 0.9389691610955359, + "grad_norm": 0.17489151656627655, + "learning_rate": 0.00011755731193370274, + "loss": 2.1653, + "step": 2177 + }, + { + "epoch": 0.9394004744446841, + "grad_norm": 0.16846604645252228, + "learning_rate": 0.00011752920584579393, + "loss": 2.3961, + "step": 2178 + }, + { + "epoch": 0.9398317877938323, + "grad_norm": 0.16123254597187042, + "learning_rate": 0.00011750109095174798, + "loss": 2.0953, + "step": 2179 + }, + { + "epoch": 0.9402631011429804, + "grad_norm": 0.16449014842510223, + "learning_rate": 0.0001174729672573863, + "loss": 2.1383, + "step": 2180 + }, + { + "epoch": 0.9406944144921285, + "grad_norm": 0.23591160774230957, + "learning_rate": 0.00011744483476853234, + "loss": 2.3589, + "step": 2181 + }, + { + "epoch": 0.9411257278412767, + "grad_norm": 0.16046296060085297, + "learning_rate": 0.00011741669349101113, + "loss": 2.1155, + "step": 2182 + }, + { + "epoch": 0.9415570411904248, + "grad_norm": 0.15319305658340454, + "learning_rate": 0.00011738854343064969, + "loss": 2.2793, + "step": 2183 + }, + { + "epoch": 0.941988354539573, + "grad_norm": 0.17600052058696747, + "learning_rate": 0.00011736038459327678, + "loss": 2.1299, + "step": 2184 + }, + { + "epoch": 0.9424196678887211, + "grad_norm": 0.17410261929035187, + "learning_rate": 0.00011733221698472299, + "loss": 2.2609, + "step": 2185 + }, + { + "epoch": 0.9428509812378693, + "grad_norm": 0.16323557496070862, + "learning_rate": 0.00011730404061082072, + "loss": 2.0666, + "step": 2186 + }, + { + "epoch": 0.9432822945870175, + "grad_norm": 0.16431361436843872, + "learning_rate": 0.00011727585547740425, + "loss": 2.1692, + "step": 2187 + }, + { + "epoch": 0.9437136079361657, + "grad_norm": 0.1787741631269455, + "learning_rate": 0.00011724766159030955, + "loss": 2.1846, + "step": 2188 + }, + { + "epoch": 0.9441449212853138, + "grad_norm": 0.17085330188274384, + "learning_rate": 0.00011721945895537452, + "loss": 2.3124, + "step": 2189 + }, + { + "epoch": 0.9445762346344619, + "grad_norm": 0.170313760638237, + "learning_rate": 0.0001171912475784388, + "loss": 2.3532, + "step": 2190 + }, + { + "epoch": 0.9450075479836101, + "grad_norm": 0.15803568065166473, + "learning_rate": 0.0001171630274653439, + "loss": 2.1966, + "step": 2191 + }, + { + "epoch": 0.9454388613327582, + "grad_norm": 0.17301326990127563, + "learning_rate": 0.00011713479862193304, + "loss": 2.1362, + "step": 2192 + }, + { + "epoch": 0.9458701746819064, + "grad_norm": 0.1860528141260147, + "learning_rate": 0.00011710656105405136, + "loss": 2.365, + "step": 2193 + }, + { + "epoch": 0.9463014880310545, + "grad_norm": 0.16941936314105988, + "learning_rate": 0.00011707831476754573, + "loss": 2.1095, + "step": 2194 + }, + { + "epoch": 0.9467328013802028, + "grad_norm": 0.1830940544605255, + "learning_rate": 0.0001170500597682649, + "loss": 2.2742, + "step": 2195 + }, + { + "epoch": 0.9471641147293509, + "grad_norm": 0.16178883612155914, + "learning_rate": 0.00011702179606205932, + "loss": 2.1781, + "step": 2196 + }, + { + "epoch": 0.9475954280784991, + "grad_norm": 0.18592123687267303, + "learning_rate": 0.00011699352365478133, + "loss": 2.1072, + "step": 2197 + }, + { + "epoch": 0.9480267414276472, + "grad_norm": 0.17916928231716156, + "learning_rate": 0.00011696524255228504, + "loss": 2.1772, + "step": 2198 + }, + { + "epoch": 0.9484580547767953, + "grad_norm": 0.1986900418996811, + "learning_rate": 0.00011693695276042633, + "loss": 2.2692, + "step": 2199 + }, + { + "epoch": 0.9488893681259435, + "grad_norm": 0.18639887869358063, + "learning_rate": 0.00011690865428506296, + "loss": 2.4306, + "step": 2200 + }, + { + "epoch": 0.9488893681259435, + "eval_loss": 2.1058454513549805, + "eval_runtime": 201.9377, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 2200 + }, + { + "epoch": 0.9493206814750916, + "grad_norm": 0.1623578667640686, + "learning_rate": 0.0001168803471320544, + "loss": 2.2295, + "step": 2201 + }, + { + "epoch": 0.9497519948242398, + "grad_norm": 0.16612379252910614, + "learning_rate": 0.00011685203130726201, + "loss": 2.4427, + "step": 2202 + }, + { + "epoch": 0.9501833081733879, + "grad_norm": 0.16849513351917267, + "learning_rate": 0.00011682370681654879, + "loss": 2.2249, + "step": 2203 + }, + { + "epoch": 0.9506146215225362, + "grad_norm": 0.18068940937519073, + "learning_rate": 0.00011679537366577975, + "loss": 2.329, + "step": 2204 + }, + { + "epoch": 0.9510459348716843, + "grad_norm": 0.15872099995613098, + "learning_rate": 0.00011676703186082149, + "loss": 2.256, + "step": 2205 + }, + { + "epoch": 0.9514772482208325, + "grad_norm": 0.202578604221344, + "learning_rate": 0.00011673868140754254, + "loss": 2.2509, + "step": 2206 + }, + { + "epoch": 0.9519085615699806, + "grad_norm": 0.15152516961097717, + "learning_rate": 0.00011671032231181317, + "loss": 2.2613, + "step": 2207 + }, + { + "epoch": 0.9523398749191287, + "grad_norm": 0.16518807411193848, + "learning_rate": 0.00011668195457950541, + "loss": 2.2611, + "step": 2208 + }, + { + "epoch": 0.9527711882682769, + "grad_norm": 0.16880130767822266, + "learning_rate": 0.00011665357821649314, + "loss": 2.1909, + "step": 2209 + }, + { + "epoch": 0.953202501617425, + "grad_norm": 0.18211935460567474, + "learning_rate": 0.00011662519322865198, + "loss": 2.2239, + "step": 2210 + }, + { + "epoch": 0.9536338149665732, + "grad_norm": 0.17535841464996338, + "learning_rate": 0.00011659679962185937, + "loss": 2.2193, + "step": 2211 + }, + { + "epoch": 0.9540651283157213, + "grad_norm": 0.16001532971858978, + "learning_rate": 0.00011656839740199448, + "loss": 1.8185, + "step": 2212 + }, + { + "epoch": 0.9544964416648696, + "grad_norm": 0.16617374122142792, + "learning_rate": 0.00011653998657493834, + "loss": 2.1581, + "step": 2213 + }, + { + "epoch": 0.9549277550140177, + "grad_norm": 0.18469476699829102, + "learning_rate": 0.00011651156714657369, + "loss": 2.2726, + "step": 2214 + }, + { + "epoch": 0.9553590683631659, + "grad_norm": 0.16976144909858704, + "learning_rate": 0.00011648313912278511, + "loss": 2.3153, + "step": 2215 + }, + { + "epoch": 0.955790381712314, + "grad_norm": 0.17235201597213745, + "learning_rate": 0.00011645470250945892, + "loss": 2.0512, + "step": 2216 + }, + { + "epoch": 0.9562216950614622, + "grad_norm": 17.926816940307617, + "learning_rate": 0.00011642625731248325, + "loss": 2.3241, + "step": 2217 + }, + { + "epoch": 0.9566530084106103, + "grad_norm": 0.19533559679985046, + "learning_rate": 0.00011639780353774797, + "loss": 2.2302, + "step": 2218 + }, + { + "epoch": 0.9570843217597584, + "grad_norm": 0.1967902034521103, + "learning_rate": 0.00011636934119114472, + "loss": 2.3553, + "step": 2219 + }, + { + "epoch": 0.9575156351089066, + "grad_norm": 0.17256464064121246, + "learning_rate": 0.00011634087027856697, + "loss": 2.3912, + "step": 2220 + }, + { + "epoch": 0.9579469484580547, + "grad_norm": 0.17740106582641602, + "learning_rate": 0.00011631239080590998, + "loss": 2.2265, + "step": 2221 + }, + { + "epoch": 0.958378261807203, + "grad_norm": 0.1855895072221756, + "learning_rate": 0.00011628390277907065, + "loss": 2.1366, + "step": 2222 + }, + { + "epoch": 0.9588095751563511, + "grad_norm": 0.21072974801063538, + "learning_rate": 0.00011625540620394779, + "loss": 2.0472, + "step": 2223 + }, + { + "epoch": 0.9592408885054993, + "grad_norm": 0.3419710695743561, + "learning_rate": 0.00011622690108644192, + "loss": 2.3737, + "step": 2224 + }, + { + "epoch": 0.9596722018546474, + "grad_norm": 0.17139427363872528, + "learning_rate": 0.00011619838743245531, + "loss": 2.1824, + "step": 2225 + }, + { + "epoch": 0.9596722018546474, + "eval_loss": 2.106443405151367, + "eval_runtime": 206.0581, + "eval_samples_per_second": 0.155, + "eval_steps_per_second": 0.155, + "step": 2225 + }, + { + "epoch": 0.9601035152037956, + "grad_norm": 0.1631966233253479, + "learning_rate": 0.00011616986524789204, + "loss": 2.1654, + "step": 2226 + }, + { + "epoch": 0.9605348285529437, + "grad_norm": 0.16243740916252136, + "learning_rate": 0.00011614133453865798, + "loss": 2.3102, + "step": 2227 + }, + { + "epoch": 0.9609661419020918, + "grad_norm": 0.18797215819358826, + "learning_rate": 0.00011611279531066066, + "loss": 2.2347, + "step": 2228 + }, + { + "epoch": 0.96139745525124, + "grad_norm": 0.1728636771440506, + "learning_rate": 0.00011608424756980947, + "loss": 2.2274, + "step": 2229 + }, + { + "epoch": 0.9618287686003881, + "grad_norm": 0.20025914907455444, + "learning_rate": 0.00011605569132201557, + "loss": 2.1984, + "step": 2230 + }, + { + "epoch": 0.9622600819495364, + "grad_norm": 0.16293105483055115, + "learning_rate": 0.00011602712657319174, + "loss": 2.2789, + "step": 2231 + }, + { + "epoch": 0.9626913952986845, + "grad_norm": 0.17379815876483917, + "learning_rate": 0.00011599855332925275, + "loss": 2.3766, + "step": 2232 + }, + { + "epoch": 0.9631227086478327, + "grad_norm": 10.470320701599121, + "learning_rate": 0.0001159699715961149, + "loss": 1.8858, + "step": 2233 + }, + { + "epoch": 0.9635540219969808, + "grad_norm": 0.19438959658145905, + "learning_rate": 0.00011594138137969641, + "loss": 2.368, + "step": 2234 + }, + { + "epoch": 0.963985335346129, + "grad_norm": 0.19522325694561005, + "learning_rate": 0.00011591278268591717, + "loss": 2.2328, + "step": 2235 + }, + { + "epoch": 0.9644166486952771, + "grad_norm": 0.2502066493034363, + "learning_rate": 0.00011588417552069885, + "loss": 2.4032, + "step": 2236 + }, + { + "epoch": 0.9648479620444252, + "grad_norm": 0.2284168004989624, + "learning_rate": 0.00011585555988996491, + "loss": 2.3211, + "step": 2237 + }, + { + "epoch": 0.9652792753935734, + "grad_norm": 0.2279599905014038, + "learning_rate": 0.00011582693579964048, + "loss": 2.3949, + "step": 2238 + }, + { + "epoch": 0.9657105887427215, + "grad_norm": 94.24360656738281, + "learning_rate": 0.00011579830325565254, + "loss": 2.2135, + "step": 2239 + }, + { + "epoch": 0.9661419020918698, + "grad_norm": 0.17028680443763733, + "learning_rate": 0.00011576966226392974, + "loss": 2.2827, + "step": 2240 + }, + { + "epoch": 0.9665732154410179, + "grad_norm": 0.8349494338035583, + "learning_rate": 0.00011574101283040251, + "loss": 2.224, + "step": 2241 + }, + { + "epoch": 0.9670045287901661, + "grad_norm": 0.18213754892349243, + "learning_rate": 0.00011571235496100303, + "loss": 2.2549, + "step": 2242 + }, + { + "epoch": 0.9674358421393142, + "grad_norm": 0.21162238717079163, + "learning_rate": 0.00011568368866166525, + "loss": 2.3134, + "step": 2243 + }, + { + "epoch": 0.9678671554884624, + "grad_norm": 0.18910057842731476, + "learning_rate": 0.00011565501393832484, + "loss": 2.3069, + "step": 2244 + }, + { + "epoch": 0.9682984688376105, + "grad_norm": 0.2546372413635254, + "learning_rate": 0.00011562633079691917, + "loss": 2.1317, + "step": 2245 + }, + { + "epoch": 0.9687297821867586, + "grad_norm": 0.17593315243721008, + "learning_rate": 0.00011559763924338746, + "loss": 2.0852, + "step": 2246 + }, + { + "epoch": 0.9691610955359068, + "grad_norm": 0.17111103236675262, + "learning_rate": 0.00011556893928367052, + "loss": 2.3776, + "step": 2247 + }, + { + "epoch": 0.969592408885055, + "grad_norm": 0.1876457780599594, + "learning_rate": 0.0001155402309237111, + "loss": 2.2422, + "step": 2248 + }, + { + "epoch": 0.9700237222342032, + "grad_norm": 0.16125169396400452, + "learning_rate": 0.0001155115141694535, + "loss": 2.1283, + "step": 2249 + }, + { + "epoch": 0.9704550355833513, + "grad_norm": 0.16092783212661743, + "learning_rate": 0.00011548278902684386, + "loss": 2.2404, + "step": 2250 + }, + { + "epoch": 0.9704550355833513, + "eval_loss": 2.107475757598877, + "eval_runtime": 201.252, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 2250 + }, + { + "epoch": 0.9708863489324995, + "grad_norm": 0.17660339176654816, + "learning_rate": 0.00011545405550183006, + "loss": 2.3106, + "step": 2251 + }, + { + "epoch": 0.9713176622816476, + "grad_norm": 0.21650633215904236, + "learning_rate": 0.00011542531360036165, + "loss": 2.2065, + "step": 2252 + }, + { + "epoch": 0.9717489756307958, + "grad_norm": 0.18377810716629028, + "learning_rate": 0.00011539656332838998, + "loss": 2.1903, + "step": 2253 + }, + { + "epoch": 0.9721802889799439, + "grad_norm": 0.164555162191391, + "learning_rate": 0.00011536780469186812, + "loss": 2.1216, + "step": 2254 + }, + { + "epoch": 0.972611602329092, + "grad_norm": 0.16842810809612274, + "learning_rate": 0.00011533903769675082, + "loss": 2.0297, + "step": 2255 + }, + { + "epoch": 0.9730429156782402, + "grad_norm": 0.17207801342010498, + "learning_rate": 0.00011531026234899464, + "loss": 2.1705, + "step": 2256 + }, + { + "epoch": 0.9734742290273884, + "grad_norm": 0.17921404540538788, + "learning_rate": 0.00011528147865455781, + "loss": 2.247, + "step": 2257 + }, + { + "epoch": 0.9739055423765366, + "grad_norm": 0.17994186282157898, + "learning_rate": 0.00011525268661940033, + "loss": 2.0532, + "step": 2258 + }, + { + "epoch": 0.9743368557256847, + "grad_norm": 0.19421452283859253, + "learning_rate": 0.00011522388624948388, + "loss": 2.0998, + "step": 2259 + }, + { + "epoch": 0.9747681690748329, + "grad_norm": 0.1837734878063202, + "learning_rate": 0.00011519507755077189, + "loss": 2.3626, + "step": 2260 + }, + { + "epoch": 0.975199482423981, + "grad_norm": 0.18781611323356628, + "learning_rate": 0.00011516626052922953, + "loss": 2.3344, + "step": 2261 + }, + { + "epoch": 0.9756307957731292, + "grad_norm": 0.167991504073143, + "learning_rate": 0.00011513743519082369, + "loss": 2.0942, + "step": 2262 + }, + { + "epoch": 0.9760621091222773, + "grad_norm": 0.1679423600435257, + "learning_rate": 0.00011510860154152294, + "loss": 2.1936, + "step": 2263 + }, + { + "epoch": 0.9764934224714255, + "grad_norm": 0.16400964558124542, + "learning_rate": 0.00011507975958729763, + "loss": 2.0745, + "step": 2264 + }, + { + "epoch": 0.9769247358205736, + "grad_norm": 0.18759959936141968, + "learning_rate": 0.0001150509093341198, + "loss": 2.3091, + "step": 2265 + }, + { + "epoch": 0.9773560491697219, + "grad_norm": 0.16005435585975647, + "learning_rate": 0.00011502205078796317, + "loss": 2.2465, + "step": 2266 + }, + { + "epoch": 0.97778736251887, + "grad_norm": 0.16512760519981384, + "learning_rate": 0.0001149931839548033, + "loss": 2.225, + "step": 2267 + }, + { + "epoch": 0.9782186758680181, + "grad_norm": 0.1739596426486969, + "learning_rate": 0.0001149643088406173, + "loss": 2.3453, + "step": 2268 + }, + { + "epoch": 0.9786499892171663, + "grad_norm": 0.17262277007102966, + "learning_rate": 0.00011493542545138411, + "loss": 2.0907, + "step": 2269 + }, + { + "epoch": 0.9790813025663144, + "grad_norm": 0.1671365201473236, + "learning_rate": 0.00011490653379308439, + "loss": 2.3431, + "step": 2270 + }, + { + "epoch": 0.9795126159154626, + "grad_norm": 0.16103728115558624, + "learning_rate": 0.0001148776338717004, + "loss": 2.2239, + "step": 2271 + }, + { + "epoch": 0.9799439292646107, + "grad_norm": 0.16833123564720154, + "learning_rate": 0.00011484872569321624, + "loss": 2.0763, + "step": 2272 + }, + { + "epoch": 0.9803752426137589, + "grad_norm": 0.1860317438840866, + "learning_rate": 0.00011481980926361765, + "loss": 2.0637, + "step": 2273 + }, + { + "epoch": 0.980806555962907, + "grad_norm": 0.1666116863489151, + "learning_rate": 0.00011479088458889208, + "loss": 2.149, + "step": 2274 + }, + { + "epoch": 0.9812378693120553, + "grad_norm": 0.22403891384601593, + "learning_rate": 0.0001147619516750287, + "loss": 2.3037, + "step": 2275 + }, + { + "epoch": 0.9812378693120553, + "eval_loss": 2.106496572494507, + "eval_runtime": 198.6172, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 2275 + }, + { + "epoch": 0.9816691826612034, + "grad_norm": 0.17930032312870026, + "learning_rate": 0.00011473301052801843, + "loss": 2.218, + "step": 2276 + }, + { + "epoch": 0.9821004960103515, + "grad_norm": 0.2308432161808014, + "learning_rate": 0.00011470406115385378, + "loss": 1.9874, + "step": 2277 + }, + { + "epoch": 0.9825318093594997, + "grad_norm": 0.18067146837711334, + "learning_rate": 0.0001146751035585291, + "loss": 2.104, + "step": 2278 + }, + { + "epoch": 0.9829631227086478, + "grad_norm": 0.17085899412631989, + "learning_rate": 0.00011464613774804034, + "loss": 2.1093, + "step": 2279 + }, + { + "epoch": 0.983394436057796, + "grad_norm": 0.19405338168144226, + "learning_rate": 0.00011461716372838518, + "loss": 2.1491, + "step": 2280 + }, + { + "epoch": 0.9838257494069441, + "grad_norm": 0.17247186601161957, + "learning_rate": 0.00011458818150556304, + "loss": 2.1054, + "step": 2281 + }, + { + "epoch": 0.9842570627560923, + "grad_norm": 0.15420669317245483, + "learning_rate": 0.00011455919108557499, + "loss": 2.0272, + "step": 2282 + }, + { + "epoch": 0.9846883761052404, + "grad_norm": 0.18149659037590027, + "learning_rate": 0.0001145301924744238, + "loss": 2.2578, + "step": 2283 + }, + { + "epoch": 0.9851196894543887, + "grad_norm": 0.1740005910396576, + "learning_rate": 0.00011450118567811398, + "loss": 2.2191, + "step": 2284 + }, + { + "epoch": 0.9855510028035368, + "grad_norm": 0.16811902821063995, + "learning_rate": 0.00011447217070265168, + "loss": 2.0406, + "step": 2285 + }, + { + "epoch": 0.985982316152685, + "grad_norm": 0.17773064970970154, + "learning_rate": 0.00011444314755404478, + "loss": 2.4005, + "step": 2286 + }, + { + "epoch": 0.9864136295018331, + "grad_norm": 0.18015626072883606, + "learning_rate": 0.00011441411623830284, + "loss": 2.3781, + "step": 2287 + }, + { + "epoch": 0.9868449428509812, + "grad_norm": 0.1616867482662201, + "learning_rate": 0.0001143850767614371, + "loss": 2.1957, + "step": 2288 + }, + { + "epoch": 0.9872762562001294, + "grad_norm": 0.17316801846027374, + "learning_rate": 0.00011435602912946051, + "loss": 2.225, + "step": 2289 + }, + { + "epoch": 0.9877075695492775, + "grad_norm": 0.16571733355522156, + "learning_rate": 0.00011432697334838769, + "loss": 2.2129, + "step": 2290 + }, + { + "epoch": 0.9881388828984257, + "grad_norm": 0.19395996630191803, + "learning_rate": 0.00011429790942423497, + "loss": 2.198, + "step": 2291 + }, + { + "epoch": 0.9885701962475738, + "grad_norm": 0.18082626163959503, + "learning_rate": 0.00011426883736302036, + "loss": 2.2387, + "step": 2292 + }, + { + "epoch": 0.9890015095967221, + "grad_norm": 0.17537705600261688, + "learning_rate": 0.00011423975717076354, + "loss": 2.1682, + "step": 2293 + }, + { + "epoch": 0.9894328229458702, + "grad_norm": 0.15734168887138367, + "learning_rate": 0.00011421066885348587, + "loss": 2.2326, + "step": 2294 + }, + { + "epoch": 0.9898641362950183, + "grad_norm": 0.1727844476699829, + "learning_rate": 0.00011418157241721042, + "loss": 2.2387, + "step": 2295 + }, + { + "epoch": 0.9902954496441665, + "grad_norm": 0.24999341368675232, + "learning_rate": 0.00011415246786796189, + "loss": 1.9656, + "step": 2296 + }, + { + "epoch": 0.9907267629933146, + "grad_norm": 0.36071884632110596, + "learning_rate": 0.00011412335521176677, + "loss": 2.1987, + "step": 2297 + }, + { + "epoch": 0.9911580763424628, + "grad_norm": 0.16273179650306702, + "learning_rate": 0.00011409423445465306, + "loss": 2.1133, + "step": 2298 + }, + { + "epoch": 0.9915893896916109, + "grad_norm": 0.17782722413539886, + "learning_rate": 0.0001140651056026506, + "loss": 2.2404, + "step": 2299 + }, + { + "epoch": 0.9920207030407591, + "grad_norm": 0.17763769626617432, + "learning_rate": 0.00011403596866179084, + "loss": 2.3585, + "step": 2300 + }, + { + "epoch": 0.9920207030407591, + "eval_loss": 2.1060075759887695, + "eval_runtime": 199.8504, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 2300 + }, + { + "epoch": 0.9924520163899073, + "grad_norm": 0.17662031948566437, + "learning_rate": 0.00011400682363810685, + "loss": 1.9006, + "step": 2301 + }, + { + "epoch": 0.9928833297390555, + "grad_norm": 0.17285652458667755, + "learning_rate": 0.00011397767053763346, + "loss": 1.9154, + "step": 2302 + }, + { + "epoch": 0.9933146430882036, + "grad_norm": 0.15654821693897247, + "learning_rate": 0.00011394850936640714, + "loss": 2.1394, + "step": 2303 + }, + { + "epoch": 0.9937459564373518, + "grad_norm": 0.15930558741092682, + "learning_rate": 0.000113919340130466, + "loss": 2.3887, + "step": 2304 + }, + { + "epoch": 0.9941772697864999, + "grad_norm": 0.23084445297718048, + "learning_rate": 0.00011389016283584987, + "loss": 2.2288, + "step": 2305 + }, + { + "epoch": 0.994608583135648, + "grad_norm": 0.16395895183086395, + "learning_rate": 0.00011386097748860025, + "loss": 2.3934, + "step": 2306 + }, + { + "epoch": 0.9950398964847962, + "grad_norm": 0.19965040683746338, + "learning_rate": 0.00011383178409476027, + "loss": 2.1694, + "step": 2307 + }, + { + "epoch": 0.9954712098339443, + "grad_norm": 0.17996490001678467, + "learning_rate": 0.00011380258266037472, + "loss": 2.0064, + "step": 2308 + }, + { + "epoch": 0.9959025231830925, + "grad_norm": 0.1590234488248825, + "learning_rate": 0.00011377337319149012, + "loss": 2.2363, + "step": 2309 + }, + { + "epoch": 0.9963338365322407, + "grad_norm": 0.17119617760181427, + "learning_rate": 0.00011374415569415455, + "loss": 2.3706, + "step": 2310 + }, + { + "epoch": 0.9967651498813889, + "grad_norm": 0.15185703337192535, + "learning_rate": 0.00011371493017441785, + "loss": 2.343, + "step": 2311 + }, + { + "epoch": 0.997196463230537, + "grad_norm": 0.18372780084609985, + "learning_rate": 0.00011368569663833149, + "loss": 2.2689, + "step": 2312 + }, + { + "epoch": 0.9976277765796852, + "grad_norm": 0.17568238079547882, + "learning_rate": 0.00011365645509194858, + "loss": 2.2873, + "step": 2313 + }, + { + "epoch": 0.9980590899288333, + "grad_norm": 0.17242491245269775, + "learning_rate": 0.00011362720554132387, + "loss": 2.1604, + "step": 2314 + }, + { + "epoch": 0.9984904032779814, + "grad_norm": 0.15822440385818481, + "learning_rate": 0.00011359794799251387, + "loss": 2.1958, + "step": 2315 + }, + { + "epoch": 0.9989217166271296, + "grad_norm": 0.17576847970485687, + "learning_rate": 0.00011356868245157665, + "loss": 2.0845, + "step": 2316 + }, + { + "epoch": 0.9993530299762777, + "grad_norm": 0.16132158041000366, + "learning_rate": 0.00011353940892457191, + "loss": 2.1926, + "step": 2317 + }, + { + "epoch": 0.9997843433254259, + "grad_norm": 0.16950838267803192, + "learning_rate": 0.00011351012741756111, + "loss": 2.3559, + "step": 2318 + }, + { + "epoch": 1.0004313133491483, + "grad_norm": 0.29467716813087463, + "learning_rate": 0.0001134808379366073, + "loss": 4.4302, + "step": 2319 + }, + { + "epoch": 1.0008626266982963, + "grad_norm": 0.15429019927978516, + "learning_rate": 0.00011345154048777516, + "loss": 2.2957, + "step": 2320 + }, + { + "epoch": 1.0012939400474445, + "grad_norm": 0.16172082722187042, + "learning_rate": 0.00011342223507713106, + "loss": 2.1032, + "step": 2321 + }, + { + "epoch": 1.0017252533965926, + "grad_norm": 0.16826793551445007, + "learning_rate": 0.00011339292171074302, + "loss": 2.2723, + "step": 2322 + }, + { + "epoch": 1.0021565667457408, + "grad_norm": 0.1786830723285675, + "learning_rate": 0.00011336360039468069, + "loss": 2.1119, + "step": 2323 + }, + { + "epoch": 1.0025878800948889, + "grad_norm": 0.15725389122962952, + "learning_rate": 0.00011333427113501533, + "loss": 1.8254, + "step": 2324 + }, + { + "epoch": 1.0030191934440371, + "grad_norm": 0.1785026639699936, + "learning_rate": 0.00011330493393781994, + "loss": 2.2904, + "step": 2325 + }, + { + "epoch": 1.0030191934440371, + "eval_loss": 2.1062262058258057, + "eval_runtime": 195.6116, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 2325 + }, + { + "epoch": 1.0034505067931851, + "grad_norm": 0.18080540001392365, + "learning_rate": 0.0001132755888091691, + "loss": 2.2291, + "step": 2326 + }, + { + "epoch": 1.0038818201423334, + "grad_norm": 0.21925707161426544, + "learning_rate": 0.000113246235755139, + "loss": 2.1652, + "step": 2327 + }, + { + "epoch": 1.0043131334914817, + "grad_norm": 0.1603548377752304, + "learning_rate": 0.0001132168747818075, + "loss": 2.1039, + "step": 2328 + }, + { + "epoch": 1.0047444468406297, + "grad_norm": 0.15417563915252686, + "learning_rate": 0.00011318750589525418, + "loss": 2.0317, + "step": 2329 + }, + { + "epoch": 1.005175760189778, + "grad_norm": 0.18038475513458252, + "learning_rate": 0.00011315812910156015, + "loss": 2.1561, + "step": 2330 + }, + { + "epoch": 1.005607073538926, + "grad_norm": 0.19016197323799133, + "learning_rate": 0.0001131287444068082, + "loss": 2.2455, + "step": 2331 + }, + { + "epoch": 1.0060383868880742, + "grad_norm": 0.17915791273117065, + "learning_rate": 0.00011309935181708275, + "loss": 2.1837, + "step": 2332 + }, + { + "epoch": 1.0064697002372223, + "grad_norm": 0.1723218560218811, + "learning_rate": 0.00011306995133846986, + "loss": 2.1696, + "step": 2333 + }, + { + "epoch": 1.0069010135863705, + "grad_norm": 0.17618072032928467, + "learning_rate": 0.00011304054297705721, + "loss": 2.1869, + "step": 2334 + }, + { + "epoch": 1.0073323269355186, + "grad_norm": 0.1933479905128479, + "learning_rate": 0.00011301112673893413, + "loss": 2.1879, + "step": 2335 + }, + { + "epoch": 1.0077636402846668, + "grad_norm": 0.17005442082881927, + "learning_rate": 0.0001129817026301916, + "loss": 2.1525, + "step": 2336 + }, + { + "epoch": 1.008194953633815, + "grad_norm": 0.17357569932937622, + "learning_rate": 0.00011295227065692217, + "loss": 2.2398, + "step": 2337 + }, + { + "epoch": 1.008626266982963, + "grad_norm": 0.16071292757987976, + "learning_rate": 0.00011292283082522005, + "loss": 2.1212, + "step": 2338 + }, + { + "epoch": 1.0090575803321113, + "grad_norm": 0.18172769248485565, + "learning_rate": 0.0001128933831411811, + "loss": 2.1368, + "step": 2339 + }, + { + "epoch": 1.0094888936812594, + "grad_norm": 0.18058297038078308, + "learning_rate": 0.00011286392761090278, + "loss": 2.1471, + "step": 2340 + }, + { + "epoch": 1.0099202070304076, + "grad_norm": 0.17868855595588684, + "learning_rate": 0.00011283446424048417, + "loss": 2.2774, + "step": 2341 + }, + { + "epoch": 1.0103515203795557, + "grad_norm": 0.18230393528938293, + "learning_rate": 0.000112804993036026, + "loss": 2.366, + "step": 2342 + }, + { + "epoch": 1.010782833728704, + "grad_norm": 0.1880486011505127, + "learning_rate": 0.00011277551400363057, + "loss": 2.1481, + "step": 2343 + }, + { + "epoch": 1.011214147077852, + "grad_norm": 0.16729071736335754, + "learning_rate": 0.00011274602714940186, + "loss": 2.2148, + "step": 2344 + }, + { + "epoch": 1.0116454604270002, + "grad_norm": 0.16809020936489105, + "learning_rate": 0.00011271653247944545, + "loss": 2.0783, + "step": 2345 + }, + { + "epoch": 1.0120767737761485, + "grad_norm": 0.2248852401971817, + "learning_rate": 0.00011268702999986854, + "loss": 2.2623, + "step": 2346 + }, + { + "epoch": 1.0125080871252965, + "grad_norm": 0.18641109764575958, + "learning_rate": 0.00011265751971677994, + "loss": 2.25, + "step": 2347 + }, + { + "epoch": 1.0129394004744448, + "grad_norm": 0.17987805604934692, + "learning_rate": 0.00011262800163629003, + "loss": 2.1126, + "step": 2348 + }, + { + "epoch": 1.0133707138235928, + "grad_norm": 0.18353773653507233, + "learning_rate": 0.00011259847576451093, + "loss": 2.3219, + "step": 2349 + }, + { + "epoch": 1.013802027172741, + "grad_norm": 0.1718786507844925, + "learning_rate": 0.00011256894210755622, + "loss": 2.2882, + "step": 2350 + }, + { + "epoch": 1.013802027172741, + "eval_loss": 2.1062445640563965, + "eval_runtime": 200.3454, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 2350 + }, + { + "epoch": 1.014233340521889, + "grad_norm": 0.19025760889053345, + "learning_rate": 0.0001125394006715412, + "loss": 2.1701, + "step": 2351 + }, + { + "epoch": 1.0146646538710373, + "grad_norm": 0.1575254201889038, + "learning_rate": 0.00011250985146258276, + "loss": 2.4227, + "step": 2352 + }, + { + "epoch": 1.0150959672201854, + "grad_norm": 0.18695998191833496, + "learning_rate": 0.00011248029448679937, + "loss": 2.2756, + "step": 2353 + }, + { + "epoch": 1.0155272805693336, + "grad_norm": 0.1914224773645401, + "learning_rate": 0.00011245072975031115, + "loss": 2.1814, + "step": 2354 + }, + { + "epoch": 1.0159585939184819, + "grad_norm": 0.20052509009838104, + "learning_rate": 0.00011242115725923976, + "loss": 2.2445, + "step": 2355 + }, + { + "epoch": 1.01638990726763, + "grad_norm": 0.15901760756969452, + "learning_rate": 0.00011239157701970852, + "loss": 2.1222, + "step": 2356 + }, + { + "epoch": 1.0168212206167782, + "grad_norm": 0.1845453679561615, + "learning_rate": 0.00011236198903784236, + "loss": 2.1006, + "step": 2357 + }, + { + "epoch": 1.0172525339659262, + "grad_norm": 0.1753232330083847, + "learning_rate": 0.00011233239331976778, + "loss": 2.2954, + "step": 2358 + }, + { + "epoch": 1.0176838473150744, + "grad_norm": 0.15701685845851898, + "learning_rate": 0.00011230278987161294, + "loss": 2.2849, + "step": 2359 + }, + { + "epoch": 1.0181151606642225, + "grad_norm": 0.16856098175048828, + "learning_rate": 0.00011227317869950748, + "loss": 2.1087, + "step": 2360 + }, + { + "epoch": 1.0185464740133707, + "grad_norm": 0.19807349145412445, + "learning_rate": 0.00011224355980958277, + "loss": 2.2045, + "step": 2361 + }, + { + "epoch": 1.0189777873625188, + "grad_norm": 0.17893430590629578, + "learning_rate": 0.0001122139332079717, + "loss": 2.2461, + "step": 2362 + }, + { + "epoch": 1.019409100711667, + "grad_norm": 0.2081046998500824, + "learning_rate": 0.00011218429890080882, + "loss": 2.2196, + "step": 2363 + }, + { + "epoch": 1.0198404140608153, + "grad_norm": 0.16133612394332886, + "learning_rate": 0.0001121546568942302, + "loss": 2.153, + "step": 2364 + }, + { + "epoch": 1.0202717274099633, + "grad_norm": 0.16959582269191742, + "learning_rate": 0.00011212500719437354, + "loss": 2.1781, + "step": 2365 + }, + { + "epoch": 1.0207030407591116, + "grad_norm": 0.17506343126296997, + "learning_rate": 0.00011209534980737816, + "loss": 2.0979, + "step": 2366 + }, + { + "epoch": 1.0211343541082596, + "grad_norm": 0.1758505403995514, + "learning_rate": 0.00011206568473938494, + "loss": 2.3522, + "step": 2367 + }, + { + "epoch": 1.0215656674574078, + "grad_norm": 0.17813731729984283, + "learning_rate": 0.00011203601199653634, + "loss": 2.1947, + "step": 2368 + }, + { + "epoch": 1.0219969808065559, + "grad_norm": 0.17696397006511688, + "learning_rate": 0.00011200633158497645, + "loss": 2.0171, + "step": 2369 + }, + { + "epoch": 1.0224282941557041, + "grad_norm": 0.2053089290857315, + "learning_rate": 0.00011197664351085092, + "loss": 2.1748, + "step": 2370 + }, + { + "epoch": 1.0228596075048522, + "grad_norm": 0.1562858372926712, + "learning_rate": 0.00011194694778030696, + "loss": 2.049, + "step": 2371 + }, + { + "epoch": 1.0232909208540004, + "grad_norm": 0.18027403950691223, + "learning_rate": 0.0001119172443994934, + "loss": 2.2517, + "step": 2372 + }, + { + "epoch": 1.0237222342031487, + "grad_norm": 0.16071996092796326, + "learning_rate": 0.0001118875333745607, + "loss": 2.3241, + "step": 2373 + }, + { + "epoch": 1.0241535475522967, + "grad_norm": 0.18705935776233673, + "learning_rate": 0.00011185781471166082, + "loss": 2.1256, + "step": 2374 + }, + { + "epoch": 1.024584860901445, + "grad_norm": 0.1858339160680771, + "learning_rate": 0.00011182808841694732, + "loss": 2.0979, + "step": 2375 + }, + { + "epoch": 1.024584860901445, + "eval_loss": 2.1061363220214844, + "eval_runtime": 199.9588, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 2375 + }, + { + "epoch": 1.025016174250593, + "grad_norm": 0.1685781031847, + "learning_rate": 0.00011179835449657537, + "loss": 2.1224, + "step": 2376 + }, + { + "epoch": 1.0254474875997412, + "grad_norm": 0.15608049929141998, + "learning_rate": 0.00011176861295670169, + "loss": 2.1423, + "step": 2377 + }, + { + "epoch": 1.0258788009488893, + "grad_norm": 0.18338270485401154, + "learning_rate": 0.00011173886380348461, + "loss": 2.1042, + "step": 2378 + }, + { + "epoch": 1.0263101142980375, + "grad_norm": 0.18867897987365723, + "learning_rate": 0.00011170910704308403, + "loss": 2.009, + "step": 2379 + }, + { + "epoch": 1.0267414276471858, + "grad_norm": 0.16375763714313507, + "learning_rate": 0.0001116793426816614, + "loss": 2.1258, + "step": 2380 + }, + { + "epoch": 1.0271727409963338, + "grad_norm": 0.17954808473587036, + "learning_rate": 0.0001116495707253797, + "loss": 2.2215, + "step": 2381 + }, + { + "epoch": 1.027604054345482, + "grad_norm": 0.1598980873823166, + "learning_rate": 0.00011161979118040363, + "loss": 2.0067, + "step": 2382 + }, + { + "epoch": 1.02803536769463, + "grad_norm": 0.17175449430942535, + "learning_rate": 0.00011159000405289931, + "loss": 2.2334, + "step": 2383 + }, + { + "epoch": 1.0284666810437784, + "grad_norm": 0.15543760359287262, + "learning_rate": 0.0001115602093490345, + "loss": 2.1728, + "step": 2384 + }, + { + "epoch": 1.0288979943929264, + "grad_norm": 0.19172172248363495, + "learning_rate": 0.00011153040707497855, + "loss": 2.2146, + "step": 2385 + }, + { + "epoch": 1.0293293077420747, + "grad_norm": 0.19131535291671753, + "learning_rate": 0.00011150059723690232, + "loss": 2.279, + "step": 2386 + }, + { + "epoch": 1.0297606210912227, + "grad_norm": 0.16584806144237518, + "learning_rate": 0.00011147077984097827, + "loss": 2.1, + "step": 2387 + }, + { + "epoch": 1.030191934440371, + "grad_norm": 0.18306796252727509, + "learning_rate": 0.0001114409548933804, + "loss": 2.1369, + "step": 2388 + }, + { + "epoch": 1.0306232477895192, + "grad_norm": 0.16278353333473206, + "learning_rate": 0.00011141112240028432, + "loss": 2.2203, + "step": 2389 + }, + { + "epoch": 1.0310545611386672, + "grad_norm": 0.17837879061698914, + "learning_rate": 0.00011138128236786716, + "loss": 2.1123, + "step": 2390 + }, + { + "epoch": 1.0314858744878155, + "grad_norm": 0.17795895040035248, + "learning_rate": 0.00011135143480230763, + "loss": 2.0468, + "step": 2391 + }, + { + "epoch": 1.0319171878369635, + "grad_norm": 0.17439667880535126, + "learning_rate": 0.00011132157970978598, + "loss": 2.202, + "step": 2392 + }, + { + "epoch": 1.0323485011861118, + "grad_norm": 0.17241206765174866, + "learning_rate": 0.00011129171709648407, + "loss": 2.2526, + "step": 2393 + }, + { + "epoch": 1.0327798145352598, + "grad_norm": 0.17300650477409363, + "learning_rate": 0.00011126184696858524, + "loss": 2.1694, + "step": 2394 + }, + { + "epoch": 1.033211127884408, + "grad_norm": 0.17314332723617554, + "learning_rate": 0.00011123196933227445, + "loss": 2.1224, + "step": 2395 + }, + { + "epoch": 1.033642441233556, + "grad_norm": 0.17457662522792816, + "learning_rate": 0.00011120208419373817, + "loss": 2.2439, + "step": 2396 + }, + { + "epoch": 1.0340737545827043, + "grad_norm": 0.1702902466058731, + "learning_rate": 0.00011117219155916444, + "loss": 2.1204, + "step": 2397 + }, + { + "epoch": 1.0345050679318526, + "grad_norm": 0.1870027780532837, + "learning_rate": 0.00011114229143474293, + "loss": 2.2655, + "step": 2398 + }, + { + "epoch": 1.0349363812810006, + "grad_norm": 0.18020227551460266, + "learning_rate": 0.0001111123838266647, + "loss": 2.3726, + "step": 2399 + }, + { + "epoch": 1.0353676946301489, + "grad_norm": 0.17877480387687683, + "learning_rate": 0.00011108246874112247, + "loss": 2.0178, + "step": 2400 + }, + { + "epoch": 1.0353676946301489, + "eval_loss": 2.1056740283966064, + "eval_runtime": 200.7673, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 2400 + }, + { + "epoch": 1.035799007979297, + "grad_norm": 0.1728811115026474, + "learning_rate": 0.00011105254618431053, + "loss": 2.1762, + "step": 2401 + }, + { + "epoch": 1.0362303213284452, + "grad_norm": 0.17834728956222534, + "learning_rate": 0.00011102261616242461, + "loss": 2.3268, + "step": 2402 + }, + { + "epoch": 1.0366616346775932, + "grad_norm": 0.17231158912181854, + "learning_rate": 0.0001109926786816621, + "loss": 2.3475, + "step": 2403 + }, + { + "epoch": 1.0370929480267415, + "grad_norm": 0.17313973605632782, + "learning_rate": 0.00011096273374822182, + "loss": 2.2619, + "step": 2404 + }, + { + "epoch": 1.0375242613758895, + "grad_norm": 0.16596461832523346, + "learning_rate": 0.00011093278136830428, + "loss": 2.2721, + "step": 2405 + }, + { + "epoch": 1.0379555747250377, + "grad_norm": 0.19695128500461578, + "learning_rate": 0.00011090282154811138, + "loss": 2.1435, + "step": 2406 + }, + { + "epoch": 1.038386888074186, + "grad_norm": 0.16759416460990906, + "learning_rate": 0.00011087285429384665, + "loss": 2.1911, + "step": 2407 + }, + { + "epoch": 1.038818201423334, + "grad_norm": 0.1748712956905365, + "learning_rate": 0.00011084287961171514, + "loss": 2.1256, + "step": 2408 + }, + { + "epoch": 1.0392495147724823, + "grad_norm": 0.1840677261352539, + "learning_rate": 0.00011081289750792343, + "loss": 2.3436, + "step": 2409 + }, + { + "epoch": 1.0396808281216303, + "grad_norm": 0.19527612626552582, + "learning_rate": 0.00011078290798867967, + "loss": 2.1762, + "step": 2410 + }, + { + "epoch": 1.0401121414707786, + "grad_norm": 0.1802646368741989, + "learning_rate": 0.00011075291106019349, + "loss": 2.4244, + "step": 2411 + }, + { + "epoch": 1.0405434548199266, + "grad_norm": 0.17762580513954163, + "learning_rate": 0.00011072290672867607, + "loss": 2.2971, + "step": 2412 + }, + { + "epoch": 1.0409747681690749, + "grad_norm": 0.1760055422782898, + "learning_rate": 0.00011069289500034016, + "loss": 2.2714, + "step": 2413 + }, + { + "epoch": 1.041406081518223, + "grad_norm": 0.21784080564975739, + "learning_rate": 0.00011066287588140002, + "loss": 2.2483, + "step": 2414 + }, + { + "epoch": 1.0418373948673711, + "grad_norm": 0.17253689467906952, + "learning_rate": 0.00011063284937807142, + "loss": 2.0145, + "step": 2415 + }, + { + "epoch": 1.0422687082165194, + "grad_norm": 0.1615447849035263, + "learning_rate": 0.00011060281549657169, + "loss": 1.9895, + "step": 2416 + }, + { + "epoch": 1.0427000215656674, + "grad_norm": 0.16580533981323242, + "learning_rate": 0.00011057277424311965, + "loss": 2.2238, + "step": 2417 + }, + { + "epoch": 1.0431313349148157, + "grad_norm": 0.16984115540981293, + "learning_rate": 0.0001105427256239357, + "loss": 2.0539, + "step": 2418 + }, + { + "epoch": 1.0435626482639637, + "grad_norm": 0.17555032670497894, + "learning_rate": 0.00011051266964524171, + "loss": 2.2437, + "step": 2419 + }, + { + "epoch": 1.043993961613112, + "grad_norm": 0.17240315675735474, + "learning_rate": 0.0001104826063132611, + "loss": 2.2337, + "step": 2420 + }, + { + "epoch": 1.04442527496226, + "grad_norm": 0.18791107833385468, + "learning_rate": 0.00011045253563421886, + "loss": 2.1752, + "step": 2421 + }, + { + "epoch": 1.0448565883114083, + "grad_norm": 0.17155669629573822, + "learning_rate": 0.00011042245761434138, + "loss": 1.9039, + "step": 2422 + }, + { + "epoch": 1.0452879016605563, + "grad_norm": 0.1768328845500946, + "learning_rate": 0.00011039237225985667, + "loss": 2.2537, + "step": 2423 + }, + { + "epoch": 1.0457192150097046, + "grad_norm": 0.19201205670833588, + "learning_rate": 0.00011036227957699428, + "loss": 2.295, + "step": 2424 + }, + { + "epoch": 1.0461505283588528, + "grad_norm": 0.1698884516954422, + "learning_rate": 0.00011033217957198516, + "loss": 2.3538, + "step": 2425 + }, + { + "epoch": 1.0461505283588528, + "eval_loss": 2.1056013107299805, + "eval_runtime": 203.8965, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 2425 + }, + { + "epoch": 1.0465818417080008, + "grad_norm": 0.1858002245426178, + "learning_rate": 0.00011030207225106188, + "loss": 1.9397, + "step": 2426 + }, + { + "epoch": 1.047013155057149, + "grad_norm": 0.17983387410640717, + "learning_rate": 0.00011027195762045848, + "loss": 2.3015, + "step": 2427 + }, + { + "epoch": 1.0474444684062971, + "grad_norm": 0.1639018952846527, + "learning_rate": 0.00011024183568641054, + "loss": 2.1261, + "step": 2428 + }, + { + "epoch": 1.0478757817554454, + "grad_norm": 0.16954176127910614, + "learning_rate": 0.0001102117064551551, + "loss": 2.312, + "step": 2429 + }, + { + "epoch": 1.0483070951045934, + "grad_norm": 0.1672070026397705, + "learning_rate": 0.00011018156993293077, + "loss": 2.3034, + "step": 2430 + }, + { + "epoch": 1.0487384084537417, + "grad_norm": 0.18573078513145447, + "learning_rate": 0.00011015142612597765, + "loss": 2.2772, + "step": 2431 + }, + { + "epoch": 1.0491697218028897, + "grad_norm": 0.1702294945716858, + "learning_rate": 0.00011012127504053737, + "loss": 2.1996, + "step": 2432 + }, + { + "epoch": 1.049601035152038, + "grad_norm": 0.1983357071876526, + "learning_rate": 0.000110091116682853, + "loss": 2.2025, + "step": 2433 + }, + { + "epoch": 1.0500323485011862, + "grad_norm": 0.16793139278888702, + "learning_rate": 0.00011006095105916915, + "loss": 2.3513, + "step": 2434 + }, + { + "epoch": 1.0504636618503342, + "grad_norm": 0.1658577173948288, + "learning_rate": 0.00011003077817573198, + "loss": 2.0776, + "step": 2435 + }, + { + "epoch": 1.0508949751994825, + "grad_norm": 0.1889333873987198, + "learning_rate": 0.0001100005980387891, + "loss": 2.2737, + "step": 2436 + }, + { + "epoch": 1.0513262885486305, + "grad_norm": 0.17015792429447174, + "learning_rate": 0.00010997041065458963, + "loss": 2.1531, + "step": 2437 + }, + { + "epoch": 1.0517576018977788, + "grad_norm": 0.1794874370098114, + "learning_rate": 0.00010994021602938421, + "loss": 2.1614, + "step": 2438 + }, + { + "epoch": 1.0521889152469268, + "grad_norm": 0.17228356003761292, + "learning_rate": 0.00010991001416942495, + "loss": 2.3999, + "step": 2439 + }, + { + "epoch": 1.052620228596075, + "grad_norm": 0.18232309818267822, + "learning_rate": 0.0001098798050809655, + "loss": 2.0694, + "step": 2440 + }, + { + "epoch": 1.053051541945223, + "grad_norm": 0.16556480526924133, + "learning_rate": 0.00010984958877026095, + "loss": 2.186, + "step": 2441 + }, + { + "epoch": 1.0534828552943714, + "grad_norm": 0.16921713948249817, + "learning_rate": 0.00010981936524356795, + "loss": 1.9789, + "step": 2442 + }, + { + "epoch": 1.0539141686435196, + "grad_norm": 0.18567490577697754, + "learning_rate": 0.00010978913450714459, + "loss": 2.1184, + "step": 2443 + }, + { + "epoch": 1.0543454819926676, + "grad_norm": 0.16858486831188202, + "learning_rate": 0.00010975889656725051, + "loss": 2.1877, + "step": 2444 + }, + { + "epoch": 1.054776795341816, + "grad_norm": 0.19666415452957153, + "learning_rate": 0.00010972865143014674, + "loss": 2.329, + "step": 2445 + }, + { + "epoch": 1.055208108690964, + "grad_norm": 0.16206754744052887, + "learning_rate": 0.00010969839910209593, + "loss": 2.1894, + "step": 2446 + }, + { + "epoch": 1.0556394220401122, + "grad_norm": 0.17717215418815613, + "learning_rate": 0.00010966813958936213, + "loss": 1.9643, + "step": 2447 + }, + { + "epoch": 1.0560707353892602, + "grad_norm": 0.18330061435699463, + "learning_rate": 0.0001096378728982109, + "loss": 2.1257, + "step": 2448 + }, + { + "epoch": 1.0565020487384085, + "grad_norm": 0.18426138162612915, + "learning_rate": 0.00010960759903490928, + "loss": 2.2492, + "step": 2449 + }, + { + "epoch": 1.0569333620875565, + "grad_norm": 0.16594518721103668, + "learning_rate": 0.00010957731800572581, + "loss": 2.3716, + "step": 2450 + }, + { + "epoch": 1.0569333620875565, + "eval_loss": 2.1055049896240234, + "eval_runtime": 200.9772, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 2450 + }, + { + "epoch": 1.0573646754367048, + "grad_norm": 0.1678372025489807, + "learning_rate": 0.00010954702981693055, + "loss": 2.1933, + "step": 2451 + }, + { + "epoch": 1.057795988785853, + "grad_norm": 0.17151105403900146, + "learning_rate": 0.00010951673447479495, + "loss": 2.1858, + "step": 2452 + }, + { + "epoch": 1.058227302135001, + "grad_norm": 0.17021241784095764, + "learning_rate": 0.00010948643198559201, + "loss": 2.3021, + "step": 2453 + }, + { + "epoch": 1.0586586154841493, + "grad_norm": 0.19027872383594513, + "learning_rate": 0.00010945612235559619, + "loss": 2.1876, + "step": 2454 + }, + { + "epoch": 1.0590899288332973, + "grad_norm": 0.1739296168088913, + "learning_rate": 0.00010942580559108344, + "loss": 2.2537, + "step": 2455 + }, + { + "epoch": 1.0595212421824456, + "grad_norm": 0.17704711854457855, + "learning_rate": 0.00010939548169833116, + "loss": 2.376, + "step": 2456 + }, + { + "epoch": 1.0599525555315936, + "grad_norm": 0.174795001745224, + "learning_rate": 0.00010936515068361822, + "loss": 2.1534, + "step": 2457 + }, + { + "epoch": 1.0603838688807419, + "grad_norm": 0.16426482796669006, + "learning_rate": 0.00010933481255322507, + "loss": 2.0385, + "step": 2458 + }, + { + "epoch": 1.06081518222989, + "grad_norm": 0.1784992516040802, + "learning_rate": 0.00010930446731343347, + "loss": 2.2563, + "step": 2459 + }, + { + "epoch": 1.0612464955790382, + "grad_norm": 0.17638806998729706, + "learning_rate": 0.00010927411497052679, + "loss": 2.145, + "step": 2460 + }, + { + "epoch": 1.0616778089281864, + "grad_norm": 0.17074482142925262, + "learning_rate": 0.00010924375553078977, + "loss": 2.2673, + "step": 2461 + }, + { + "epoch": 1.0621091222773344, + "grad_norm": 0.16658513247966766, + "learning_rate": 0.00010921338900050868, + "loss": 2.0698, + "step": 2462 + }, + { + "epoch": 1.0625404356264827, + "grad_norm": 0.16458797454833984, + "learning_rate": 0.00010918301538597127, + "loss": 2.1205, + "step": 2463 + }, + { + "epoch": 1.0629717489756307, + "grad_norm": 0.1653198003768921, + "learning_rate": 0.00010915263469346669, + "loss": 2.2109, + "step": 2464 + }, + { + "epoch": 1.063403062324779, + "grad_norm": 0.17059695720672607, + "learning_rate": 0.00010912224692928562, + "loss": 2.3215, + "step": 2465 + }, + { + "epoch": 1.063834375673927, + "grad_norm": 0.17046011984348297, + "learning_rate": 0.00010909185209972015, + "loss": 1.9422, + "step": 2466 + }, + { + "epoch": 1.0642656890230753, + "grad_norm": 0.16733916103839874, + "learning_rate": 0.00010906145021106391, + "loss": 2.3387, + "step": 2467 + }, + { + "epoch": 1.0646970023722235, + "grad_norm": 0.19165734946727753, + "learning_rate": 0.00010903104126961188, + "loss": 2.4835, + "step": 2468 + }, + { + "epoch": 1.0651283157213716, + "grad_norm": 0.17968103289604187, + "learning_rate": 0.0001090006252816606, + "loss": 2.2243, + "step": 2469 + }, + { + "epoch": 1.0655596290705198, + "grad_norm": 0.17020921409130096, + "learning_rate": 0.00010897020225350806, + "loss": 2.2879, + "step": 2470 + }, + { + "epoch": 1.0659909424196679, + "grad_norm": 0.17306166887283325, + "learning_rate": 0.00010893977219145362, + "loss": 2.3269, + "step": 2471 + }, + { + "epoch": 1.066422255768816, + "grad_norm": 0.18344846367835999, + "learning_rate": 0.00010890933510179822, + "loss": 2.2255, + "step": 2472 + }, + { + "epoch": 1.0668535691179641, + "grad_norm": 0.1960878223180771, + "learning_rate": 0.00010887889099084413, + "loss": 2.3315, + "step": 2473 + }, + { + "epoch": 1.0672848824671124, + "grad_norm": 0.16954079270362854, + "learning_rate": 0.00010884843986489518, + "loss": 2.1443, + "step": 2474 + }, + { + "epoch": 1.0677161958162604, + "grad_norm": 0.16392044723033905, + "learning_rate": 0.0001088179817302566, + "loss": 2.136, + "step": 2475 + }, + { + "epoch": 1.0677161958162604, + "eval_loss": 2.1050617694854736, + "eval_runtime": 200.7502, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 2475 + }, + { + "epoch": 1.0681475091654087, + "grad_norm": 0.15941034257411957, + "learning_rate": 0.00010878751659323507, + "loss": 2.2209, + "step": 2476 + }, + { + "epoch": 1.068578822514557, + "grad_norm": 0.1872348040342331, + "learning_rate": 0.00010875704446013875, + "loss": 2.2573, + "step": 2477 + }, + { + "epoch": 1.069010135863705, + "grad_norm": 0.18529799580574036, + "learning_rate": 0.00010872656533727721, + "loss": 2.2827, + "step": 2478 + }, + { + "epoch": 1.0694414492128532, + "grad_norm": 0.1721513569355011, + "learning_rate": 0.00010869607923096148, + "loss": 2.0848, + "step": 2479 + }, + { + "epoch": 1.0698727625620013, + "grad_norm": 0.16441528499126434, + "learning_rate": 0.00010866558614750403, + "loss": 2.2571, + "step": 2480 + }, + { + "epoch": 1.0703040759111495, + "grad_norm": 0.1793617308139801, + "learning_rate": 0.00010863508609321885, + "loss": 2.1694, + "step": 2481 + }, + { + "epoch": 1.0707353892602975, + "grad_norm": 0.16509655117988586, + "learning_rate": 0.00010860457907442123, + "loss": 2.0505, + "step": 2482 + }, + { + "epoch": 1.0711667026094458, + "grad_norm": 0.19956058263778687, + "learning_rate": 0.00010857406509742807, + "loss": 2.2348, + "step": 2483 + }, + { + "epoch": 1.0715980159585938, + "grad_norm": 0.15869128704071045, + "learning_rate": 0.00010854354416855751, + "loss": 2.2292, + "step": 2484 + }, + { + "epoch": 1.072029329307742, + "grad_norm": 0.16309665143489838, + "learning_rate": 0.00010851301629412933, + "loss": 2.0799, + "step": 2485 + }, + { + "epoch": 1.0724606426568903, + "grad_norm": 0.1662570834159851, + "learning_rate": 0.00010848248148046464, + "loss": 2.0386, + "step": 2486 + }, + { + "epoch": 1.0728919560060384, + "grad_norm": 0.1775583028793335, + "learning_rate": 0.000108451939733886, + "loss": 2.3047, + "step": 2487 + }, + { + "epoch": 1.0733232693551866, + "grad_norm": 0.1622440665960312, + "learning_rate": 0.00010842139106071742, + "loss": 2.2266, + "step": 2488 + }, + { + "epoch": 1.0737545827043347, + "grad_norm": 0.18213559687137604, + "learning_rate": 0.00010839083546728431, + "loss": 2.0705, + "step": 2489 + }, + { + "epoch": 1.074185896053483, + "grad_norm": 0.179693803191185, + "learning_rate": 0.00010836027295991358, + "loss": 2.1326, + "step": 2490 + }, + { + "epoch": 1.074617209402631, + "grad_norm": 0.1911303848028183, + "learning_rate": 0.0001083297035449335, + "loss": 2.3379, + "step": 2491 + }, + { + "epoch": 1.0750485227517792, + "grad_norm": 0.16474899649620056, + "learning_rate": 0.00010829912722867383, + "loss": 1.9679, + "step": 2492 + }, + { + "epoch": 1.0754798361009272, + "grad_norm": 0.15132348239421844, + "learning_rate": 0.0001082685440174657, + "loss": 2.0117, + "step": 2493 + }, + { + "epoch": 1.0759111494500755, + "grad_norm": 0.21260079741477966, + "learning_rate": 0.00010823795391764172, + "loss": 2.2502, + "step": 2494 + }, + { + "epoch": 1.0763424627992237, + "grad_norm": 0.1629190742969513, + "learning_rate": 0.00010820735693553589, + "loss": 2.3575, + "step": 2495 + }, + { + "epoch": 1.0767737761483718, + "grad_norm": 0.21407712996006012, + "learning_rate": 0.00010817675307748365, + "loss": 2.1997, + "step": 2496 + }, + { + "epoch": 1.07720508949752, + "grad_norm": 0.18867595493793488, + "learning_rate": 0.00010814614234982189, + "loss": 2.1327, + "step": 2497 + }, + { + "epoch": 1.077636402846668, + "grad_norm": 0.1845402717590332, + "learning_rate": 0.00010811552475888885, + "loss": 2.0834, + "step": 2498 + }, + { + "epoch": 1.0780677161958163, + "grad_norm": 0.16826646029949188, + "learning_rate": 0.0001080849003110243, + "loss": 2.1984, + "step": 2499 + }, + { + "epoch": 1.0784990295449643, + "grad_norm": 0.15099520981311798, + "learning_rate": 0.00010805426901256929, + "loss": 1.981, + "step": 2500 + }, + { + "epoch": 1.0784990295449643, + "eval_loss": 2.1044869422912598, + "eval_runtime": 200.3953, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 2500 + }, + { + "epoch": 1.0789303428941126, + "grad_norm": 0.16792334616184235, + "learning_rate": 0.00010802363086986643, + "loss": 2.3156, + "step": 2501 + }, + { + "epoch": 1.0793616562432606, + "grad_norm": 0.1842220574617386, + "learning_rate": 0.00010799298588925967, + "loss": 2.2968, + "step": 2502 + }, + { + "epoch": 1.079792969592409, + "grad_norm": 0.43980124592781067, + "learning_rate": 0.00010796233407709434, + "loss": 2.232, + "step": 2503 + }, + { + "epoch": 1.0802242829415571, + "grad_norm": 0.20864452421665192, + "learning_rate": 0.00010793167543971731, + "loss": 1.8391, + "step": 2504 + }, + { + "epoch": 1.0806555962907052, + "grad_norm": 0.16943299770355225, + "learning_rate": 0.00010790100998347671, + "loss": 2.2827, + "step": 2505 + }, + { + "epoch": 1.0810869096398534, + "grad_norm": 0.19129234552383423, + "learning_rate": 0.00010787033771472222, + "loss": 2.3839, + "step": 2506 + }, + { + "epoch": 1.0815182229890015, + "grad_norm": 0.1737593412399292, + "learning_rate": 0.00010783965863980483, + "loss": 2.0071, + "step": 2507 + }, + { + "epoch": 1.0819495363381497, + "grad_norm": 0.16698093712329865, + "learning_rate": 0.00010780897276507698, + "loss": 2.1422, + "step": 2508 + }, + { + "epoch": 1.0823808496872978, + "grad_norm": 0.16467595100402832, + "learning_rate": 0.00010777828009689255, + "loss": 2.1051, + "step": 2509 + }, + { + "epoch": 1.082812163036446, + "grad_norm": 0.18091897666454315, + "learning_rate": 0.00010774758064160676, + "loss": 2.3142, + "step": 2510 + }, + { + "epoch": 1.083243476385594, + "grad_norm": 0.18090017139911652, + "learning_rate": 0.00010771687440557628, + "loss": 2.2106, + "step": 2511 + }, + { + "epoch": 1.0836747897347423, + "grad_norm": 0.16148926317691803, + "learning_rate": 0.00010768616139515913, + "loss": 2.058, + "step": 2512 + }, + { + "epoch": 1.0841061030838905, + "grad_norm": 0.17057783901691437, + "learning_rate": 0.00010765544161671486, + "loss": 2.2992, + "step": 2513 + }, + { + "epoch": 1.0845374164330386, + "grad_norm": 0.1756206899881363, + "learning_rate": 0.00010762471507660426, + "loss": 2.1542, + "step": 2514 + }, + { + "epoch": 1.0849687297821868, + "grad_norm": 0.17915412783622742, + "learning_rate": 0.00010759398178118964, + "loss": 2.0949, + "step": 2515 + }, + { + "epoch": 1.0854000431313349, + "grad_norm": 0.1853836476802826, + "learning_rate": 0.00010756324173683465, + "loss": 2.1546, + "step": 2516 + }, + { + "epoch": 1.0858313564804831, + "grad_norm": 0.19392795860767365, + "learning_rate": 0.00010753249494990436, + "loss": 2.4471, + "step": 2517 + }, + { + "epoch": 1.0862626698296312, + "grad_norm": 0.1694418042898178, + "learning_rate": 0.00010750174142676521, + "loss": 2.137, + "step": 2518 + }, + { + "epoch": 1.0866939831787794, + "grad_norm": 0.17123928666114807, + "learning_rate": 0.00010747098117378507, + "loss": 2.26, + "step": 2519 + }, + { + "epoch": 1.0871252965279274, + "grad_norm": 0.18081289529800415, + "learning_rate": 0.00010744021419733318, + "loss": 2.1713, + "step": 2520 + }, + { + "epoch": 1.0875566098770757, + "grad_norm": 0.16639788448810577, + "learning_rate": 0.00010740944050378018, + "loss": 2.233, + "step": 2521 + }, + { + "epoch": 1.087987923226224, + "grad_norm": 0.1688801646232605, + "learning_rate": 0.00010737866009949811, + "loss": 2.1852, + "step": 2522 + }, + { + "epoch": 1.088419236575372, + "grad_norm": 0.1901995688676834, + "learning_rate": 0.00010734787299086036, + "loss": 2.1676, + "step": 2523 + }, + { + "epoch": 1.0888505499245202, + "grad_norm": 0.19242532551288605, + "learning_rate": 0.00010731707918424177, + "loss": 2.3821, + "step": 2524 + }, + { + "epoch": 1.0892818632736683, + "grad_norm": 0.17025871574878693, + "learning_rate": 0.00010728627868601852, + "loss": 2.0999, + "step": 2525 + }, + { + "epoch": 1.0892818632736683, + "eval_loss": 2.1046197414398193, + "eval_runtime": 200.6002, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 2525 + }, + { + "epoch": 1.0897131766228165, + "grad_norm": 0.17762216925621033, + "learning_rate": 0.00010725547150256818, + "loss": 2.4327, + "step": 2526 + }, + { + "epoch": 1.0901444899719646, + "grad_norm": 0.16256682574748993, + "learning_rate": 0.00010722465764026974, + "loss": 2.1386, + "step": 2527 + }, + { + "epoch": 1.0905758033211128, + "grad_norm": 0.16706998646259308, + "learning_rate": 0.00010719383710550351, + "loss": 2.1637, + "step": 2528 + }, + { + "epoch": 1.0910071166702608, + "grad_norm": 0.2084912806749344, + "learning_rate": 0.00010716300990465128, + "loss": 2.2047, + "step": 2529 + }, + { + "epoch": 1.091438430019409, + "grad_norm": 0.17731738090515137, + "learning_rate": 0.00010713217604409609, + "loss": 2.0935, + "step": 2530 + }, + { + "epoch": 1.0918697433685574, + "grad_norm": 0.16511079668998718, + "learning_rate": 0.00010710133553022247, + "loss": 2.2436, + "step": 2531 + }, + { + "epoch": 1.0923010567177054, + "grad_norm": 0.17809303104877472, + "learning_rate": 0.00010707048836941627, + "loss": 2.3086, + "step": 2532 + }, + { + "epoch": 1.0927323700668536, + "grad_norm": 35.525386810302734, + "learning_rate": 0.00010703963456806475, + "loss": 1.9948, + "step": 2533 + }, + { + "epoch": 1.0931636834160017, + "grad_norm": 0.197676882147789, + "learning_rate": 0.0001070087741325565, + "loss": 2.2399, + "step": 2534 + }, + { + "epoch": 1.09359499676515, + "grad_norm": 0.19233070313930511, + "learning_rate": 0.0001069779070692815, + "loss": 2.1493, + "step": 2535 + }, + { + "epoch": 1.094026310114298, + "grad_norm": 0.17287050187587738, + "learning_rate": 0.00010694703338463118, + "loss": 2.1806, + "step": 2536 + }, + { + "epoch": 1.0944576234634462, + "grad_norm": 0.1617845892906189, + "learning_rate": 0.00010691615308499821, + "loss": 2.0614, + "step": 2537 + }, + { + "epoch": 1.0948889368125942, + "grad_norm": 0.16862034797668457, + "learning_rate": 0.00010688526617677671, + "loss": 2.0655, + "step": 2538 + }, + { + "epoch": 1.0953202501617425, + "grad_norm": 0.16461607813835144, + "learning_rate": 0.00010685437266636217, + "loss": 2.1918, + "step": 2539 + }, + { + "epoch": 1.0957515635108908, + "grad_norm": 0.16629716753959656, + "learning_rate": 0.00010682347256015141, + "loss": 2.1462, + "step": 2540 + }, + { + "epoch": 1.0961828768600388, + "grad_norm": 0.17277218401432037, + "learning_rate": 0.00010679256586454265, + "loss": 2.2636, + "step": 2541 + }, + { + "epoch": 1.096614190209187, + "grad_norm": 0.18575602769851685, + "learning_rate": 0.00010676165258593542, + "loss": 2.169, + "step": 2542 + }, + { + "epoch": 1.097045503558335, + "grad_norm": 0.17312225699424744, + "learning_rate": 0.00010673073273073072, + "loss": 2.1672, + "step": 2543 + }, + { + "epoch": 1.0974768169074833, + "grad_norm": 0.17106004059314728, + "learning_rate": 0.00010669980630533077, + "loss": 2.268, + "step": 2544 + }, + { + "epoch": 1.0979081302566314, + "grad_norm": 0.17834289371967316, + "learning_rate": 0.00010666887331613931, + "loss": 2.1809, + "step": 2545 + }, + { + "epoch": 1.0983394436057796, + "grad_norm": 0.17766167223453522, + "learning_rate": 0.00010663793376956126, + "loss": 2.3147, + "step": 2546 + }, + { + "epoch": 1.0987707569549277, + "grad_norm": 0.16469496488571167, + "learning_rate": 0.00010660698767200306, + "loss": 2.1159, + "step": 2547 + }, + { + "epoch": 1.099202070304076, + "grad_norm": 0.20470421016216278, + "learning_rate": 0.0001065760350298724, + "loss": 2.386, + "step": 2548 + }, + { + "epoch": 1.0996333836532242, + "grad_norm": 0.187876895070076, + "learning_rate": 0.00010654507584957837, + "loss": 2.3584, + "step": 2549 + }, + { + "epoch": 1.1000646970023722, + "grad_norm": 0.1793707013130188, + "learning_rate": 0.00010651411013753145, + "loss": 2.2252, + "step": 2550 + }, + { + "epoch": 1.1000646970023722, + "eval_loss": 2.103973865509033, + "eval_runtime": 196.7781, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 2550 + }, + { + "epoch": 1.1004960103515204, + "grad_norm": 10.771757125854492, + "learning_rate": 0.00010648313790014336, + "loss": 2.1051, + "step": 2551 + }, + { + "epoch": 1.1009273237006685, + "grad_norm": 0.20772556960582733, + "learning_rate": 0.0001064521591438273, + "loss": 2.2376, + "step": 2552 + }, + { + "epoch": 1.1013586370498167, + "grad_norm": 0.2005288153886795, + "learning_rate": 0.0001064211738749977, + "loss": 1.88, + "step": 2553 + }, + { + "epoch": 1.1017899503989648, + "grad_norm": 0.17820362746715546, + "learning_rate": 0.00010639018210007047, + "loss": 2.3887, + "step": 2554 + }, + { + "epoch": 1.102221263748113, + "grad_norm": 0.17765012383460999, + "learning_rate": 0.00010635918382546275, + "loss": 2.3142, + "step": 2555 + }, + { + "epoch": 1.102652577097261, + "grad_norm": 0.2070050984621048, + "learning_rate": 0.00010632817905759306, + "loss": 2.0402, + "step": 2556 + }, + { + "epoch": 1.1030838904464093, + "grad_norm": 0.1713756024837494, + "learning_rate": 0.0001062971678028813, + "loss": 2.2322, + "step": 2557 + }, + { + "epoch": 1.1035152037955576, + "grad_norm": 0.1667434275150299, + "learning_rate": 0.00010626615006774869, + "loss": 2.2085, + "step": 2558 + }, + { + "epoch": 1.1039465171447056, + "grad_norm": 0.18005375564098358, + "learning_rate": 0.00010623512585861783, + "loss": 2.2999, + "step": 2559 + }, + { + "epoch": 1.1043778304938539, + "grad_norm": 0.18545310199260712, + "learning_rate": 0.00010620409518191252, + "loss": 2.183, + "step": 2560 + }, + { + "epoch": 1.1048091438430019, + "grad_norm": 0.1672646850347519, + "learning_rate": 0.0001061730580440581, + "loss": 2.1464, + "step": 2561 + }, + { + "epoch": 1.1052404571921501, + "grad_norm": 0.17547766864299774, + "learning_rate": 0.00010614201445148108, + "loss": 2.4801, + "step": 2562 + }, + { + "epoch": 1.1056717705412982, + "grad_norm": 0.18772274255752563, + "learning_rate": 0.00010611096441060942, + "loss": 2.2, + "step": 2563 + }, + { + "epoch": 1.1061030838904464, + "grad_norm": 0.19832868874073029, + "learning_rate": 0.00010607990792787236, + "loss": 2.1522, + "step": 2564 + }, + { + "epoch": 1.1065343972395945, + "grad_norm": 0.18317139148712158, + "learning_rate": 0.00010604884500970045, + "loss": 2.3309, + "step": 2565 + }, + { + "epoch": 1.1069657105887427, + "grad_norm": 0.1834000051021576, + "learning_rate": 0.00010601777566252568, + "loss": 2.2633, + "step": 2566 + }, + { + "epoch": 1.107397023937891, + "grad_norm": 0.19017237424850464, + "learning_rate": 0.00010598669989278121, + "loss": 2.2537, + "step": 2567 + }, + { + "epoch": 1.107828337287039, + "grad_norm": 0.18654869496822357, + "learning_rate": 0.00010595561770690171, + "loss": 2.1133, + "step": 2568 + }, + { + "epoch": 1.1082596506361873, + "grad_norm": 0.1633051633834839, + "learning_rate": 0.000105924529111323, + "loss": 2.2248, + "step": 2569 + }, + { + "epoch": 1.1086909639853353, + "grad_norm": 0.19110850989818573, + "learning_rate": 0.00010589343411248237, + "loss": 2.243, + "step": 2570 + }, + { + "epoch": 1.1091222773344835, + "grad_norm": 0.19738002121448517, + "learning_rate": 0.00010586233271681837, + "loss": 1.9526, + "step": 2571 + }, + { + "epoch": 1.1095535906836316, + "grad_norm": 0.1846143901348114, + "learning_rate": 0.00010583122493077084, + "loss": 2.107, + "step": 2572 + }, + { + "epoch": 1.1099849040327798, + "grad_norm": 0.1838165521621704, + "learning_rate": 0.00010580011076078108, + "loss": 2.1531, + "step": 2573 + }, + { + "epoch": 1.1104162173819279, + "grad_norm": 0.17969593405723572, + "learning_rate": 0.00010576899021329151, + "loss": 2.2307, + "step": 2574 + }, + { + "epoch": 1.1108475307310761, + "grad_norm": 0.49395978450775146, + "learning_rate": 0.00010573786329474607, + "loss": 2.063, + "step": 2575 + }, + { + "epoch": 1.1108475307310761, + "eval_loss": 2.1042685508728027, + "eval_runtime": 195.0821, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 2575 + }, + { + "epoch": 1.1112788440802244, + "grad_norm": 0.17557857930660248, + "learning_rate": 0.00010570673001158985, + "loss": 1.9164, + "step": 2576 + }, + { + "epoch": 1.1117101574293724, + "grad_norm": 0.188553586602211, + "learning_rate": 0.00010567559037026942, + "loss": 2.3004, + "step": 2577 + }, + { + "epoch": 1.1121414707785207, + "grad_norm": 0.19646279513835907, + "learning_rate": 0.00010564444437723251, + "loss": 2.1013, + "step": 2578 + }, + { + "epoch": 1.1125727841276687, + "grad_norm": 0.1875109225511551, + "learning_rate": 0.0001056132920389283, + "loss": 2.4232, + "step": 2579 + }, + { + "epoch": 1.113004097476817, + "grad_norm": 0.21454770863056183, + "learning_rate": 0.00010558213336180716, + "loss": 2.2497, + "step": 2580 + }, + { + "epoch": 1.113435410825965, + "grad_norm": 0.1954226791858673, + "learning_rate": 0.00010555096835232089, + "loss": 2.1853, + "step": 2581 + }, + { + "epoch": 1.1138667241751132, + "grad_norm": 0.15640024840831757, + "learning_rate": 0.00010551979701692252, + "loss": 2.0693, + "step": 2582 + }, + { + "epoch": 1.1142980375242613, + "grad_norm": 0.17734797298908234, + "learning_rate": 0.0001054886193620664, + "loss": 2.2395, + "step": 2583 + }, + { + "epoch": 1.1147293508734095, + "grad_norm": 0.19922398030757904, + "learning_rate": 0.00010545743539420827, + "loss": 2.2398, + "step": 2584 + }, + { + "epoch": 1.1151606642225578, + "grad_norm": 0.18507863581180573, + "learning_rate": 0.00010542624511980499, + "loss": 2.2141, + "step": 2585 + }, + { + "epoch": 1.1155919775717058, + "grad_norm": 0.17060115933418274, + "learning_rate": 0.00010539504854531497, + "loss": 2.195, + "step": 2586 + }, + { + "epoch": 1.116023290920854, + "grad_norm": 0.19707359373569489, + "learning_rate": 0.00010536384567719777, + "loss": 2.1652, + "step": 2587 + }, + { + "epoch": 1.116454604270002, + "grad_norm": 0.15684722363948822, + "learning_rate": 0.00010533263652191422, + "loss": 2.0567, + "step": 2588 + }, + { + "epoch": 1.1168859176191503, + "grad_norm": 0.17799372971057892, + "learning_rate": 0.00010530142108592661, + "loss": 2.2559, + "step": 2589 + }, + { + "epoch": 1.1173172309682984, + "grad_norm": 0.16498693823814392, + "learning_rate": 0.00010527019937569838, + "loss": 2.231, + "step": 2590 + }, + { + "epoch": 1.1177485443174466, + "grad_norm": 0.1831119954586029, + "learning_rate": 0.00010523897139769435, + "loss": 2.1352, + "step": 2591 + }, + { + "epoch": 1.1181798576665947, + "grad_norm": 0.20394060015678406, + "learning_rate": 0.00010520773715838057, + "loss": 2.2722, + "step": 2592 + }, + { + "epoch": 1.118611171015743, + "grad_norm": 0.17225882411003113, + "learning_rate": 0.00010517649666422449, + "loss": 2.1723, + "step": 2593 + }, + { + "epoch": 1.1190424843648912, + "grad_norm": 0.20260009169578552, + "learning_rate": 0.00010514524992169477, + "loss": 2.3081, + "step": 2594 + }, + { + "epoch": 1.1194737977140392, + "grad_norm": 0.17399710416793823, + "learning_rate": 0.00010511399693726136, + "loss": 2.3719, + "step": 2595 + }, + { + "epoch": 1.1199051110631875, + "grad_norm": 0.18588174879550934, + "learning_rate": 0.00010508273771739563, + "loss": 2.2116, + "step": 2596 + }, + { + "epoch": 1.1203364244123355, + "grad_norm": 0.20267854630947113, + "learning_rate": 0.00010505147226857002, + "loss": 2.131, + "step": 2597 + }, + { + "epoch": 1.1207677377614838, + "grad_norm": 0.16706599295139313, + "learning_rate": 0.00010502020059725847, + "loss": 2.2558, + "step": 2598 + }, + { + "epoch": 1.1211990511106318, + "grad_norm": 0.19617630541324615, + "learning_rate": 0.00010498892270993606, + "loss": 2.1837, + "step": 2599 + }, + { + "epoch": 1.12163036445978, + "grad_norm": 0.20756325125694275, + "learning_rate": 0.00010495763861307929, + "loss": 2.1954, + "step": 2600 + }, + { + "epoch": 1.12163036445978, + "eval_loss": 2.1039798259735107, + "eval_runtime": 195.4933, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 2600 + }, + { + "epoch": 1.122061677808928, + "grad_norm": 0.20020046830177307, + "learning_rate": 0.0001049263483131658, + "loss": 2.2906, + "step": 2601 + }, + { + "epoch": 1.1224929911580763, + "grad_norm": 0.2032448649406433, + "learning_rate": 0.00010489505181667465, + "loss": 2.2178, + "step": 2602 + }, + { + "epoch": 1.1229243045072246, + "grad_norm": 0.18019424378871918, + "learning_rate": 0.00010486374913008606, + "loss": 2.177, + "step": 2603 + }, + { + "epoch": 1.1233556178563726, + "grad_norm": 0.16137458384037018, + "learning_rate": 0.00010483244025988164, + "loss": 2.2681, + "step": 2604 + }, + { + "epoch": 1.1237869312055209, + "grad_norm": 0.18528112769126892, + "learning_rate": 0.00010480112521254425, + "loss": 2.3586, + "step": 2605 + }, + { + "epoch": 1.124218244554669, + "grad_norm": 0.1861640065908432, + "learning_rate": 0.00010476980399455795, + "loss": 2.0436, + "step": 2606 + }, + { + "epoch": 1.1246495579038172, + "grad_norm": 0.1724652796983719, + "learning_rate": 0.0001047384766124082, + "loss": 2.2836, + "step": 2607 + }, + { + "epoch": 1.1250808712529652, + "grad_norm": 0.16126412153244019, + "learning_rate": 0.00010470714307258164, + "loss": 2.2284, + "step": 2608 + }, + { + "epoch": 1.1255121846021134, + "grad_norm": 0.17227625846862793, + "learning_rate": 0.00010467580338156624, + "loss": 2.1648, + "step": 2609 + }, + { + "epoch": 1.1259434979512615, + "grad_norm": 0.17861312627792358, + "learning_rate": 0.00010464445754585122, + "loss": 2.1608, + "step": 2610 + }, + { + "epoch": 1.1263748113004097, + "grad_norm": 0.1870555728673935, + "learning_rate": 0.00010461310557192706, + "loss": 2.1918, + "step": 2611 + }, + { + "epoch": 1.126806124649558, + "grad_norm": 0.16503466665744781, + "learning_rate": 0.00010458174746628558, + "loss": 2.024, + "step": 2612 + }, + { + "epoch": 1.127237437998706, + "grad_norm": 0.1710997223854065, + "learning_rate": 0.00010455038323541976, + "loss": 2.112, + "step": 2613 + }, + { + "epoch": 1.1276687513478543, + "grad_norm": 0.1710243970155716, + "learning_rate": 0.00010451901288582397, + "loss": 2.3523, + "step": 2614 + }, + { + "epoch": 1.1281000646970023, + "grad_norm": 0.17720620334148407, + "learning_rate": 0.00010448763642399371, + "loss": 2.1211, + "step": 2615 + }, + { + "epoch": 1.1285313780461506, + "grad_norm": 0.17885632812976837, + "learning_rate": 0.00010445625385642586, + "loss": 2.2591, + "step": 2616 + }, + { + "epoch": 1.1289626913952986, + "grad_norm": 0.17788010835647583, + "learning_rate": 0.00010442486518961855, + "loss": 2.0312, + "step": 2617 + }, + { + "epoch": 1.1293940047444468, + "grad_norm": 0.15931367874145508, + "learning_rate": 0.0001043934704300711, + "loss": 2.4051, + "step": 2618 + }, + { + "epoch": 1.1298253180935949, + "grad_norm": 0.20527784526348114, + "learning_rate": 0.00010436206958428415, + "loss": 2.183, + "step": 2619 + }, + { + "epoch": 1.1302566314427431, + "grad_norm": 0.19679832458496094, + "learning_rate": 0.00010433066265875958, + "loss": 2.0836, + "step": 2620 + }, + { + "epoch": 1.1306879447918914, + "grad_norm": 0.1598224639892578, + "learning_rate": 0.0001042992496600006, + "loss": 2.1045, + "step": 2621 + }, + { + "epoch": 1.1311192581410394, + "grad_norm": 0.16047848761081696, + "learning_rate": 0.00010426783059451151, + "loss": 2.2245, + "step": 2622 + }, + { + "epoch": 1.1315505714901877, + "grad_norm": 0.1957550346851349, + "learning_rate": 0.00010423640546879807, + "loss": 2.3832, + "step": 2623 + }, + { + "epoch": 1.1319818848393357, + "grad_norm": 0.1612965166568756, + "learning_rate": 0.00010420497428936713, + "loss": 2.1053, + "step": 2624 + }, + { + "epoch": 1.132413198188484, + "grad_norm": 0.16824392974376678, + "learning_rate": 0.0001041735370627269, + "loss": 2.3027, + "step": 2625 + }, + { + "epoch": 1.132413198188484, + "eval_loss": 2.1031532287597656, + "eval_runtime": 195.1556, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 2625 + }, + { + "epoch": 1.132844511537632, + "grad_norm": 0.17558202147483826, + "learning_rate": 0.00010414209379538679, + "loss": 2.1126, + "step": 2626 + }, + { + "epoch": 1.1332758248867802, + "grad_norm": 0.17710520327091217, + "learning_rate": 0.00010411064449385743, + "loss": 2.2288, + "step": 2627 + }, + { + "epoch": 1.1337071382359283, + "grad_norm": 0.16382630169391632, + "learning_rate": 0.00010407918916465083, + "loss": 2.2314, + "step": 2628 + }, + { + "epoch": 1.1341384515850765, + "grad_norm": 0.1876552700996399, + "learning_rate": 0.00010404772781428006, + "loss": 2.3205, + "step": 2629 + }, + { + "epoch": 1.1345697649342248, + "grad_norm": 0.1882568895816803, + "learning_rate": 0.00010401626044925963, + "loss": 2.2529, + "step": 2630 + }, + { + "epoch": 1.1350010782833728, + "grad_norm": 0.18390049040317535, + "learning_rate": 0.00010398478707610511, + "loss": 2.0675, + "step": 2631 + }, + { + "epoch": 1.135432391632521, + "grad_norm": 0.17786632478237152, + "learning_rate": 0.00010395330770133349, + "loss": 2.1582, + "step": 2632 + }, + { + "epoch": 1.135863704981669, + "grad_norm": 0.5382328033447266, + "learning_rate": 0.00010392182233146286, + "loss": 1.9208, + "step": 2633 + }, + { + "epoch": 1.1362950183308174, + "grad_norm": 0.18467487394809723, + "learning_rate": 0.00010389033097301264, + "loss": 2.1614, + "step": 2634 + }, + { + "epoch": 1.1367263316799656, + "grad_norm": 0.1702251136302948, + "learning_rate": 0.00010385883363250348, + "loss": 2.2288, + "step": 2635 + }, + { + "epoch": 1.1371576450291137, + "grad_norm": 0.17580457031726837, + "learning_rate": 0.00010382733031645718, + "loss": 2.0447, + "step": 2636 + }, + { + "epoch": 1.1375889583782617, + "grad_norm": 0.1831868290901184, + "learning_rate": 0.00010379582103139691, + "loss": 2.4478, + "step": 2637 + }, + { + "epoch": 1.13802027172741, + "grad_norm": 0.1700078547000885, + "learning_rate": 0.00010376430578384699, + "loss": 2.3375, + "step": 2638 + }, + { + "epoch": 1.1384515850765582, + "grad_norm": 0.17241442203521729, + "learning_rate": 0.000103732784580333, + "loss": 2.3493, + "step": 2639 + }, + { + "epoch": 1.1388828984257062, + "grad_norm": 0.17868413031101227, + "learning_rate": 0.00010370125742738173, + "loss": 2.3758, + "step": 2640 + }, + { + "epoch": 1.1393142117748545, + "grad_norm": 0.1892899125814438, + "learning_rate": 0.00010366972433152125, + "loss": 2.286, + "step": 2641 + }, + { + "epoch": 1.1397455251240025, + "grad_norm": 0.18568749725818634, + "learning_rate": 0.0001036381852992808, + "loss": 2.2917, + "step": 2642 + }, + { + "epoch": 1.1401768384731508, + "grad_norm": 0.1770264059305191, + "learning_rate": 0.00010360664033719092, + "loss": 2.1133, + "step": 2643 + }, + { + "epoch": 1.140608151822299, + "grad_norm": 0.17140696942806244, + "learning_rate": 0.00010357508945178334, + "loss": 2.1425, + "step": 2644 + }, + { + "epoch": 1.141039465171447, + "grad_norm": 0.179879292845726, + "learning_rate": 0.00010354353264959096, + "loss": 2.0747, + "step": 2645 + }, + { + "epoch": 1.1414707785205953, + "grad_norm": 0.18175861239433289, + "learning_rate": 0.00010351196993714802, + "loss": 2.2599, + "step": 2646 + }, + { + "epoch": 1.1419020918697433, + "grad_norm": 0.2078048437833786, + "learning_rate": 0.00010348040132098989, + "loss": 2.0717, + "step": 2647 + }, + { + "epoch": 1.1423334052188916, + "grad_norm": 0.18393027782440186, + "learning_rate": 0.00010344882680765322, + "loss": 2.179, + "step": 2648 + }, + { + "epoch": 1.1427647185680396, + "grad_norm": 0.16221146285533905, + "learning_rate": 0.00010341724640367585, + "loss": 2.234, + "step": 2649 + }, + { + "epoch": 1.1431960319171879, + "grad_norm": 0.17851437628269196, + "learning_rate": 0.00010338566011559683, + "loss": 1.992, + "step": 2650 + }, + { + "epoch": 1.1431960319171879, + "eval_loss": 2.1033437252044678, + "eval_runtime": 195.2141, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 2650 + }, + { + "epoch": 1.143627345266336, + "grad_norm": 0.200714111328125, + "learning_rate": 0.0001033540679499565, + "loss": 2.3895, + "step": 2651 + }, + { + "epoch": 1.1440586586154842, + "grad_norm": 0.15753240883350372, + "learning_rate": 0.00010332246991329627, + "loss": 2.2215, + "step": 2652 + }, + { + "epoch": 1.1444899719646324, + "grad_norm": 0.17338408529758453, + "learning_rate": 0.00010329086601215898, + "loss": 2.2074, + "step": 2653 + }, + { + "epoch": 1.1449212853137805, + "grad_norm": 0.1783522218465805, + "learning_rate": 0.00010325925625308845, + "loss": 2.3323, + "step": 2654 + }, + { + "epoch": 1.1453525986629287, + "grad_norm": 0.1770525872707367, + "learning_rate": 0.0001032276406426299, + "loss": 2.0545, + "step": 2655 + }, + { + "epoch": 1.1457839120120767, + "grad_norm": 0.17786875367164612, + "learning_rate": 0.00010319601918732964, + "loss": 2.1926, + "step": 2656 + }, + { + "epoch": 1.146215225361225, + "grad_norm": 0.21019989252090454, + "learning_rate": 0.00010316439189373525, + "loss": 2.3016, + "step": 2657 + }, + { + "epoch": 1.146646538710373, + "grad_norm": 0.17054423689842224, + "learning_rate": 0.00010313275876839557, + "loss": 2.1123, + "step": 2658 + }, + { + "epoch": 1.1470778520595213, + "grad_norm": 0.16215083003044128, + "learning_rate": 0.00010310111981786048, + "loss": 2.1672, + "step": 2659 + }, + { + "epoch": 1.1475091654086693, + "grad_norm": 0.18057376146316528, + "learning_rate": 0.00010306947504868127, + "loss": 2.1375, + "step": 2660 + }, + { + "epoch": 1.1479404787578176, + "grad_norm": 0.18483436107635498, + "learning_rate": 0.00010303782446741026, + "loss": 2.1457, + "step": 2661 + }, + { + "epoch": 1.1483717921069658, + "grad_norm": 0.1700708121061325, + "learning_rate": 0.00010300616808060108, + "loss": 2.284, + "step": 2662 + }, + { + "epoch": 1.1488031054561139, + "grad_norm": 0.17855966091156006, + "learning_rate": 0.00010297450589480853, + "loss": 2.1609, + "step": 2663 + }, + { + "epoch": 1.1492344188052621, + "grad_norm": 0.16909952461719513, + "learning_rate": 0.0001029428379165886, + "loss": 2.2052, + "step": 2664 + }, + { + "epoch": 1.1496657321544101, + "grad_norm": 0.16772057116031647, + "learning_rate": 0.00010291116415249854, + "loss": 1.9577, + "step": 2665 + }, + { + "epoch": 1.1500970455035584, + "grad_norm": 0.17496979236602783, + "learning_rate": 0.00010287948460909667, + "loss": 2.1855, + "step": 2666 + }, + { + "epoch": 1.1505283588527064, + "grad_norm": 0.17731782793998718, + "learning_rate": 0.00010284779929294265, + "loss": 2.1574, + "step": 2667 + }, + { + "epoch": 1.1509596722018547, + "grad_norm": 0.1847413033246994, + "learning_rate": 0.00010281610821059722, + "loss": 2.4498, + "step": 2668 + }, + { + "epoch": 1.1513909855510027, + "grad_norm": 0.16173487901687622, + "learning_rate": 0.00010278441136862242, + "loss": 2.0799, + "step": 2669 + }, + { + "epoch": 1.151822298900151, + "grad_norm": 0.1768932342529297, + "learning_rate": 0.00010275270877358139, + "loss": 2.0765, + "step": 2670 + }, + { + "epoch": 1.1522536122492992, + "grad_norm": 0.20227771997451782, + "learning_rate": 0.00010272100043203851, + "loss": 2.1289, + "step": 2671 + }, + { + "epoch": 1.1526849255984473, + "grad_norm": 0.1585785299539566, + "learning_rate": 0.00010268928635055935, + "loss": 2.0325, + "step": 2672 + }, + { + "epoch": 1.1531162389475955, + "grad_norm": 0.17804929614067078, + "learning_rate": 0.00010265756653571063, + "loss": 2.1087, + "step": 2673 + }, + { + "epoch": 1.1535475522967436, + "grad_norm": 0.1796695739030838, + "learning_rate": 0.00010262584099406032, + "loss": 2.158, + "step": 2674 + }, + { + "epoch": 1.1539788656458918, + "grad_norm": 0.1786671131849289, + "learning_rate": 0.00010259410973217749, + "loss": 2.3043, + "step": 2675 + }, + { + "epoch": 1.1539788656458918, + "eval_loss": 2.102726697921753, + "eval_runtime": 195.1649, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 2675 + }, + { + "epoch": 1.1544101789950398, + "grad_norm": 0.1836918145418167, + "learning_rate": 0.00010256237275663253, + "loss": 2.3597, + "step": 2676 + }, + { + "epoch": 1.154841492344188, + "grad_norm": 0.20627878606319427, + "learning_rate": 0.00010253063007399681, + "loss": 2.2212, + "step": 2677 + }, + { + "epoch": 1.1552728056933361, + "grad_norm": 0.16205772757530212, + "learning_rate": 0.0001024988816908431, + "loss": 2.2527, + "step": 2678 + }, + { + "epoch": 1.1557041190424844, + "grad_norm": 0.17430098354816437, + "learning_rate": 0.00010246712761374522, + "loss": 2.1506, + "step": 2679 + }, + { + "epoch": 1.1561354323916326, + "grad_norm": 0.2250378578901291, + "learning_rate": 0.00010243536784927817, + "loss": 2.1987, + "step": 2680 + }, + { + "epoch": 1.1565667457407807, + "grad_norm": 1.8225398063659668, + "learning_rate": 0.00010240360240401819, + "loss": 2.1556, + "step": 2681 + }, + { + "epoch": 1.156998059089929, + "grad_norm": 0.16364197432994843, + "learning_rate": 0.00010237183128454264, + "loss": 2.1345, + "step": 2682 + }, + { + "epoch": 1.157429372439077, + "grad_norm": 0.17830027639865875, + "learning_rate": 0.00010234005449743011, + "loss": 2.1186, + "step": 2683 + }, + { + "epoch": 1.1578606857882252, + "grad_norm": 0.17624956369400024, + "learning_rate": 0.00010230827204926029, + "loss": 2.3228, + "step": 2684 + }, + { + "epoch": 1.1582919991373732, + "grad_norm": 0.16102442145347595, + "learning_rate": 0.00010227648394661413, + "loss": 2.145, + "step": 2685 + }, + { + "epoch": 1.1587233124865215, + "grad_norm": 0.1778397262096405, + "learning_rate": 0.00010224469019607364, + "loss": 2.1796, + "step": 2686 + }, + { + "epoch": 1.1591546258356695, + "grad_norm": 0.17264363169670105, + "learning_rate": 0.00010221289080422212, + "loss": 2.0441, + "step": 2687 + }, + { + "epoch": 1.1595859391848178, + "grad_norm": 0.18140749633312225, + "learning_rate": 0.00010218108577764397, + "loss": 2.15, + "step": 2688 + }, + { + "epoch": 1.160017252533966, + "grad_norm": 0.1673058271408081, + "learning_rate": 0.00010214927512292473, + "loss": 2.1175, + "step": 2689 + }, + { + "epoch": 1.160448565883114, + "grad_norm": 0.17649729549884796, + "learning_rate": 0.00010211745884665124, + "loss": 2.3064, + "step": 2690 + }, + { + "epoch": 1.1608798792322623, + "grad_norm": 0.17438872158527374, + "learning_rate": 0.00010208563695541129, + "loss": 2.0611, + "step": 2691 + }, + { + "epoch": 1.1613111925814104, + "grad_norm": 0.17215564846992493, + "learning_rate": 0.00010205380945579404, + "loss": 2.1821, + "step": 2692 + }, + { + "epoch": 1.1617425059305586, + "grad_norm": 0.14636805653572083, + "learning_rate": 0.00010202197635438966, + "loss": 2.0414, + "step": 2693 + }, + { + "epoch": 1.1621738192797066, + "grad_norm": 0.18306148052215576, + "learning_rate": 0.00010199013765778958, + "loss": 2.1533, + "step": 2694 + }, + { + "epoch": 1.162605132628855, + "grad_norm": 0.17002162337303162, + "learning_rate": 0.00010195829337258634, + "loss": 2.0679, + "step": 2695 + }, + { + "epoch": 1.163036445978003, + "grad_norm": 0.17379575967788696, + "learning_rate": 0.00010192644350537362, + "loss": 2.2311, + "step": 2696 + }, + { + "epoch": 1.1634677593271512, + "grad_norm": 0.22427932918071747, + "learning_rate": 0.00010189458806274636, + "loss": 2.2645, + "step": 2697 + }, + { + "epoch": 1.1638990726762994, + "grad_norm": 0.18266619741916656, + "learning_rate": 0.00010186272705130049, + "loss": 2.2333, + "step": 2698 + }, + { + "epoch": 1.1643303860254475, + "grad_norm": 0.17629382014274597, + "learning_rate": 0.00010183086047763324, + "loss": 2.3974, + "step": 2699 + }, + { + "epoch": 1.1647616993745957, + "grad_norm": 0.16827429831027985, + "learning_rate": 0.0001017989883483429, + "loss": 2.1682, + "step": 2700 + }, + { + "epoch": 1.1647616993745957, + "eval_loss": 2.1029937267303467, + "eval_runtime": 195.1429, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 2700 + }, + { + "epoch": 1.1651930127237438, + "grad_norm": 0.17276984453201294, + "learning_rate": 0.00010176711067002896, + "loss": 2.3793, + "step": 2701 + }, + { + "epoch": 1.165624326072892, + "grad_norm": 0.1839294582605362, + "learning_rate": 0.00010173522744929203, + "loss": 2.0978, + "step": 2702 + }, + { + "epoch": 1.16605563942204, + "grad_norm": 0.19586913287639618, + "learning_rate": 0.00010170333869273388, + "loss": 2.1481, + "step": 2703 + }, + { + "epoch": 1.1664869527711883, + "grad_norm": 0.1714116483926773, + "learning_rate": 0.00010167144440695744, + "loss": 2.1371, + "step": 2704 + }, + { + "epoch": 1.1669182661203363, + "grad_norm": 0.17770139873027802, + "learning_rate": 0.00010163954459856674, + "loss": 2.2928, + "step": 2705 + }, + { + "epoch": 1.1673495794694846, + "grad_norm": 0.16617412865161896, + "learning_rate": 0.00010160763927416704, + "loss": 2.2584, + "step": 2706 + }, + { + "epoch": 1.1677808928186328, + "grad_norm": 0.16294576227664948, + "learning_rate": 0.00010157572844036462, + "loss": 2.1495, + "step": 2707 + }, + { + "epoch": 1.1682122061677809, + "grad_norm": 0.1830517053604126, + "learning_rate": 0.00010154381210376703, + "loss": 1.9509, + "step": 2708 + }, + { + "epoch": 1.1686435195169291, + "grad_norm": 0.20103468000888824, + "learning_rate": 0.00010151189027098284, + "loss": 2.1433, + "step": 2709 + }, + { + "epoch": 1.1690748328660772, + "grad_norm": 0.18614795804023743, + "learning_rate": 0.00010147996294862185, + "loss": 2.2301, + "step": 2710 + }, + { + "epoch": 1.1695061462152254, + "grad_norm": 0.1958162486553192, + "learning_rate": 0.00010144803014329496, + "loss": 1.9723, + "step": 2711 + }, + { + "epoch": 1.1699374595643734, + "grad_norm": 0.1831211894750595, + "learning_rate": 0.00010141609186161417, + "loss": 2.4056, + "step": 2712 + }, + { + "epoch": 1.1703687729135217, + "grad_norm": 0.19260232150554657, + "learning_rate": 0.00010138414811019272, + "loss": 2.2492, + "step": 2713 + }, + { + "epoch": 1.1708000862626697, + "grad_norm": 0.17832225561141968, + "learning_rate": 0.00010135219889564484, + "loss": 2.3349, + "step": 2714 + }, + { + "epoch": 1.171231399611818, + "grad_norm": 0.19132240116596222, + "learning_rate": 0.00010132024422458602, + "loss": 2.2057, + "step": 2715 + }, + { + "epoch": 1.1716627129609662, + "grad_norm": 0.18465915322303772, + "learning_rate": 0.00010128828410363276, + "loss": 2.0808, + "step": 2716 + }, + { + "epoch": 1.1720940263101143, + "grad_norm": 0.15922652184963226, + "learning_rate": 0.00010125631853940283, + "loss": 2.1415, + "step": 2717 + }, + { + "epoch": 1.1725253396592625, + "grad_norm": 0.18575268983840942, + "learning_rate": 0.000101224347538515, + "loss": 2.287, + "step": 2718 + }, + { + "epoch": 1.1729566530084106, + "grad_norm": 0.17416925728321075, + "learning_rate": 0.00010119237110758921, + "loss": 2.0774, + "step": 2719 + }, + { + "epoch": 1.1733879663575588, + "grad_norm": 0.1689963936805725, + "learning_rate": 0.00010116038925324659, + "loss": 2.2118, + "step": 2720 + }, + { + "epoch": 1.1738192797067069, + "grad_norm": 0.18122926354408264, + "learning_rate": 0.00010112840198210925, + "loss": 2.2947, + "step": 2721 + }, + { + "epoch": 1.174250593055855, + "grad_norm": 0.17530053853988647, + "learning_rate": 0.00010109640930080058, + "loss": 2.1621, + "step": 2722 + }, + { + "epoch": 1.1746819064050031, + "grad_norm": 0.17392095923423767, + "learning_rate": 0.00010106441121594494, + "loss": 2.1327, + "step": 2723 + }, + { + "epoch": 1.1751132197541514, + "grad_norm": 0.18160422146320343, + "learning_rate": 0.00010103240773416795, + "loss": 2.2259, + "step": 2724 + }, + { + "epoch": 1.1755445331032996, + "grad_norm": 0.18598490953445435, + "learning_rate": 0.00010100039886209624, + "loss": 2.2135, + "step": 2725 + }, + { + "epoch": 1.1755445331032996, + "eval_loss": 2.1023378372192383, + "eval_runtime": 195.1753, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 2725 + }, + { + "epoch": 1.1759758464524477, + "grad_norm": 0.16121797263622284, + "learning_rate": 0.00010096838460635761, + "loss": 2.1434, + "step": 2726 + }, + { + "epoch": 1.176407159801596, + "grad_norm": 0.1671607792377472, + "learning_rate": 0.00010093636497358096, + "loss": 2.0611, + "step": 2727 + }, + { + "epoch": 1.176838473150744, + "grad_norm": 0.17500080168247223, + "learning_rate": 0.00010090433997039631, + "loss": 1.9025, + "step": 2728 + }, + { + "epoch": 1.1772697864998922, + "grad_norm": 0.18734723329544067, + "learning_rate": 0.00010087230960343481, + "loss": 2.2704, + "step": 2729 + }, + { + "epoch": 1.1777010998490403, + "grad_norm": 0.17753678560256958, + "learning_rate": 0.00010084027387932863, + "loss": 2.1605, + "step": 2730 + }, + { + "epoch": 1.1781324131981885, + "grad_norm": 0.16886518895626068, + "learning_rate": 0.00010080823280471119, + "loss": 2.3079, + "step": 2731 + }, + { + "epoch": 1.1785637265473365, + "grad_norm": 0.17611104249954224, + "learning_rate": 0.00010077618638621688, + "loss": 2.2272, + "step": 2732 + }, + { + "epoch": 1.1789950398964848, + "grad_norm": 0.182244673371315, + "learning_rate": 0.00010074413463048132, + "loss": 2.2693, + "step": 2733 + }, + { + "epoch": 1.179426353245633, + "grad_norm": 0.17081747949123383, + "learning_rate": 0.00010071207754414116, + "loss": 2.1961, + "step": 2734 + }, + { + "epoch": 1.179857666594781, + "grad_norm": 0.1862158477306366, + "learning_rate": 0.00010068001513383413, + "loss": 2.3049, + "step": 2735 + }, + { + "epoch": 1.1802889799439293, + "grad_norm": 0.35091134905815125, + "learning_rate": 0.00010064794740619917, + "loss": 2.201, + "step": 2736 + }, + { + "epoch": 1.1807202932930774, + "grad_norm": 0.1901334524154663, + "learning_rate": 0.00010061587436787621, + "loss": 2.2042, + "step": 2737 + }, + { + "epoch": 1.1811516066422256, + "grad_norm": 0.1805984228849411, + "learning_rate": 0.00010058379602550632, + "loss": 2.4089, + "step": 2738 + }, + { + "epoch": 1.1815829199913737, + "grad_norm": 0.19295565783977509, + "learning_rate": 0.00010055171238573168, + "loss": 2.1412, + "step": 2739 + }, + { + "epoch": 1.182014233340522, + "grad_norm": 0.18122532963752747, + "learning_rate": 0.00010051962345519557, + "loss": 2.0987, + "step": 2740 + }, + { + "epoch": 1.18244554668967, + "grad_norm": 0.16439107060432434, + "learning_rate": 0.00010048752924054236, + "loss": 2.0193, + "step": 2741 + }, + { + "epoch": 1.1828768600388182, + "grad_norm": 0.18041017651557922, + "learning_rate": 0.00010045542974841747, + "loss": 2.3687, + "step": 2742 + }, + { + "epoch": 1.1833081733879665, + "grad_norm": 0.17560729384422302, + "learning_rate": 0.00010042332498546748, + "loss": 2.1835, + "step": 2743 + }, + { + "epoch": 1.1837394867371145, + "grad_norm": 0.1784038096666336, + "learning_rate": 0.00010039121495834002, + "loss": 2.2537, + "step": 2744 + }, + { + "epoch": 1.1841708000862627, + "grad_norm": 0.16383419930934906, + "learning_rate": 0.00010035909967368386, + "loss": 2.1551, + "step": 2745 + }, + { + "epoch": 1.1846021134354108, + "grad_norm": 0.17294475436210632, + "learning_rate": 0.00010032697913814875, + "loss": 1.996, + "step": 2746 + }, + { + "epoch": 1.185033426784559, + "grad_norm": 0.2089913785457611, + "learning_rate": 0.00010029485335838565, + "loss": 2.2704, + "step": 2747 + }, + { + "epoch": 1.185464740133707, + "grad_norm": 0.1756969392299652, + "learning_rate": 0.00010026272234104657, + "loss": 2.1276, + "step": 2748 + }, + { + "epoch": 1.1858960534828553, + "grad_norm": 0.17700275778770447, + "learning_rate": 0.00010023058609278453, + "loss": 2.1297, + "step": 2749 + }, + { + "epoch": 1.1863273668320033, + "grad_norm": 0.1597864329814911, + "learning_rate": 0.00010019844462025375, + "loss": 2.1652, + "step": 2750 + }, + { + "epoch": 1.1863273668320033, + "eval_loss": 2.1022770404815674, + "eval_runtime": 194.8789, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 2750 + }, + { + "epoch": 1.1867586801811516, + "grad_norm": 0.18585540354251862, + "learning_rate": 0.0001001662979301094, + "loss": 2.2248, + "step": 2751 + }, + { + "epoch": 1.1871899935302999, + "grad_norm": 0.1518988460302353, + "learning_rate": 0.0001001341460290079, + "loss": 2.133, + "step": 2752 + }, + { + "epoch": 1.187621306879448, + "grad_norm": 0.20666302740573883, + "learning_rate": 0.00010010198892360657, + "loss": 2.1065, + "step": 2753 + }, + { + "epoch": 1.1880526202285961, + "grad_norm": 0.17307338118553162, + "learning_rate": 0.00010006982662056396, + "loss": 2.2438, + "step": 2754 + }, + { + "epoch": 1.1884839335777442, + "grad_norm": 1.0718460083007812, + "learning_rate": 0.00010003765912653954, + "loss": 2.2488, + "step": 2755 + }, + { + "epoch": 1.1889152469268924, + "grad_norm": 0.17475785315036774, + "learning_rate": 0.00010000548644819402, + "loss": 2.0289, + "step": 2756 + }, + { + "epoch": 1.1893465602760405, + "grad_norm": 0.1769273281097412, + "learning_rate": 9.997330859218906e-05, + "loss": 2.1434, + "step": 2757 + }, + { + "epoch": 1.1897778736251887, + "grad_norm": 0.16352666914463043, + "learning_rate": 9.994112556518746e-05, + "loss": 2.2072, + "step": 2758 + }, + { + "epoch": 1.1902091869743368, + "grad_norm": 0.16758856177330017, + "learning_rate": 9.990893737385306e-05, + "loss": 2.095, + "step": 2759 + }, + { + "epoch": 1.190640500323485, + "grad_norm": 0.19053834676742554, + "learning_rate": 9.987674402485078e-05, + "loss": 1.9014, + "step": 2760 + }, + { + "epoch": 1.1910718136726333, + "grad_norm": 0.19716694951057434, + "learning_rate": 9.984454552484662e-05, + "loss": 2.2309, + "step": 2761 + }, + { + "epoch": 1.1915031270217813, + "grad_norm": 0.17813928425312042, + "learning_rate": 9.981234188050757e-05, + "loss": 2.2592, + "step": 2762 + }, + { + "epoch": 1.1919344403709295, + "grad_norm": 0.18199707567691803, + "learning_rate": 9.978013309850183e-05, + "loss": 2.0669, + "step": 2763 + }, + { + "epoch": 1.1923657537200776, + "grad_norm": 0.17666861414909363, + "learning_rate": 9.974791918549853e-05, + "loss": 2.0754, + "step": 2764 + }, + { + "epoch": 1.1927970670692258, + "grad_norm": 0.15702718496322632, + "learning_rate": 9.971570014816792e-05, + "loss": 2.0688, + "step": 2765 + }, + { + "epoch": 1.1932283804183739, + "grad_norm": 0.2100336253643036, + "learning_rate": 9.968347599318131e-05, + "loss": 2.0483, + "step": 2766 + }, + { + "epoch": 1.1936596937675221, + "grad_norm": 0.18148039281368256, + "learning_rate": 9.965124672721105e-05, + "loss": 2.2175, + "step": 2767 + }, + { + "epoch": 1.1940910071166702, + "grad_norm": 0.19504286348819733, + "learning_rate": 9.96190123569306e-05, + "loss": 2.2361, + "step": 2768 + }, + { + "epoch": 1.1945223204658184, + "grad_norm": 0.1750347912311554, + "learning_rate": 9.958677288901438e-05, + "loss": 2.2762, + "step": 2769 + }, + { + "epoch": 1.1949536338149667, + "grad_norm": 0.25596529245376587, + "learning_rate": 9.955452833013798e-05, + "loss": 2.1682, + "step": 2770 + }, + { + "epoch": 1.1953849471641147, + "grad_norm": 0.18025095760822296, + "learning_rate": 9.952227868697797e-05, + "loss": 2.4384, + "step": 2771 + }, + { + "epoch": 1.195816260513263, + "grad_norm": 0.2068176120519638, + "learning_rate": 9.949002396621197e-05, + "loss": 2.2778, + "step": 2772 + }, + { + "epoch": 1.196247573862411, + "grad_norm": 0.17091801762580872, + "learning_rate": 9.945776417451869e-05, + "loss": 2.1037, + "step": 2773 + }, + { + "epoch": 1.1966788872115592, + "grad_norm": 0.1508743315935135, + "learning_rate": 9.942549931857786e-05, + "loss": 1.7748, + "step": 2774 + }, + { + "epoch": 1.1971102005607073, + "grad_norm": 0.15512053668498993, + "learning_rate": 9.939322940507033e-05, + "loss": 2.2604, + "step": 2775 + }, + { + "epoch": 1.1971102005607073, + "eval_loss": 2.1031556129455566, + "eval_runtime": 213.7409, + "eval_samples_per_second": 0.15, + "eval_steps_per_second": 0.15, + "step": 2775 + }, + { + "epoch": 1.1975415139098555, + "grad_norm": 0.16651402413845062, + "learning_rate": 9.936095444067783e-05, + "loss": 2.2371, + "step": 2776 + }, + { + "epoch": 1.1979728272590036, + "grad_norm": 0.18020476400852203, + "learning_rate": 9.932867443208338e-05, + "loss": 2.1046, + "step": 2777 + }, + { + "epoch": 1.1984041406081518, + "grad_norm": 0.16545698046684265, + "learning_rate": 9.929638938597077e-05, + "loss": 2.2312, + "step": 2778 + }, + { + "epoch": 1.1988354539573, + "grad_norm": 0.18139177560806274, + "learning_rate": 9.926409930902506e-05, + "loss": 2.319, + "step": 2779 + }, + { + "epoch": 1.199266767306448, + "grad_norm": 0.1793418824672699, + "learning_rate": 9.923180420793225e-05, + "loss": 2.1874, + "step": 2780 + }, + { + "epoch": 1.1996980806555964, + "grad_norm": 0.16347436606884003, + "learning_rate": 9.919950408937935e-05, + "loss": 1.9059, + "step": 2781 + }, + { + "epoch": 1.2001293940047444, + "grad_norm": 0.165296271443367, + "learning_rate": 9.916719896005453e-05, + "loss": 2.327, + "step": 2782 + }, + { + "epoch": 1.2005607073538926, + "grad_norm": 0.19593589007854462, + "learning_rate": 9.913488882664684e-05, + "loss": 2.2942, + "step": 2783 + }, + { + "epoch": 1.2009920207030407, + "grad_norm": 0.16539724171161652, + "learning_rate": 9.910257369584653e-05, + "loss": 2.2308, + "step": 2784 + }, + { + "epoch": 1.201423334052189, + "grad_norm": 0.17547844350337982, + "learning_rate": 9.907025357434471e-05, + "loss": 2.2597, + "step": 2785 + }, + { + "epoch": 1.201854647401337, + "grad_norm": 0.17845278978347778, + "learning_rate": 9.903792846883367e-05, + "loss": 2.0745, + "step": 2786 + }, + { + "epoch": 1.2022859607504852, + "grad_norm": 0.16618625819683075, + "learning_rate": 9.900559838600668e-05, + "loss": 2.2378, + "step": 2787 + }, + { + "epoch": 1.2027172740996335, + "grad_norm": 0.1826469749212265, + "learning_rate": 9.897326333255803e-05, + "loss": 2.0667, + "step": 2788 + }, + { + "epoch": 1.2031485874487815, + "grad_norm": 0.17142803966999054, + "learning_rate": 9.894092331518303e-05, + "loss": 2.1294, + "step": 2789 + }, + { + "epoch": 1.2035799007979298, + "grad_norm": 0.15849988162517548, + "learning_rate": 9.890857834057802e-05, + "loss": 2.079, + "step": 2790 + }, + { + "epoch": 1.2040112141470778, + "grad_norm": 0.17188440263271332, + "learning_rate": 9.887622841544045e-05, + "loss": 2.0135, + "step": 2791 + }, + { + "epoch": 1.204442527496226, + "grad_norm": 0.17471234500408173, + "learning_rate": 9.884387354646864e-05, + "loss": 2.092, + "step": 2792 + }, + { + "epoch": 1.204873840845374, + "grad_norm": 0.20161357522010803, + "learning_rate": 9.881151374036207e-05, + "loss": 2.168, + "step": 2793 + }, + { + "epoch": 1.2053051541945223, + "grad_norm": 0.16633059084415436, + "learning_rate": 9.87791490038212e-05, + "loss": 2.1851, + "step": 2794 + }, + { + "epoch": 1.2057364675436704, + "grad_norm": 0.1972508728504181, + "learning_rate": 9.874677934354748e-05, + "loss": 1.6806, + "step": 2795 + }, + { + "epoch": 1.2061677808928186, + "grad_norm": 0.18068282306194305, + "learning_rate": 9.87144047662434e-05, + "loss": 2.1382, + "step": 2796 + }, + { + "epoch": 1.2065990942419669, + "grad_norm": 0.18279866874217987, + "learning_rate": 9.868202527861244e-05, + "loss": 2.0858, + "step": 2797 + }, + { + "epoch": 1.207030407591115, + "grad_norm": 0.23289109766483307, + "learning_rate": 9.864964088735922e-05, + "loss": 1.9732, + "step": 2798 + }, + { + "epoch": 1.2074617209402632, + "grad_norm": 0.17867575585842133, + "learning_rate": 9.86172515991892e-05, + "loss": 2.2441, + "step": 2799 + }, + { + "epoch": 1.2078930342894112, + "grad_norm": 0.18418186902999878, + "learning_rate": 9.858485742080898e-05, + "loss": 2.2331, + "step": 2800 + }, + { + "epoch": 1.2078930342894112, + "eval_loss": 2.101106643676758, + "eval_runtime": 208.9884, + "eval_samples_per_second": 0.153, + "eval_steps_per_second": 0.153, + "step": 2800 + }, + { + "epoch": 1.2083243476385594, + "grad_norm": 0.16849951446056366, + "learning_rate": 9.855245835892609e-05, + "loss": 2.1511, + "step": 2801 + }, + { + "epoch": 1.2087556609877075, + "grad_norm": 0.18521761894226074, + "learning_rate": 9.852005442024914e-05, + "loss": 1.9106, + "step": 2802 + }, + { + "epoch": 1.2091869743368557, + "grad_norm": 0.18433886766433716, + "learning_rate": 9.848764561148773e-05, + "loss": 2.1457, + "step": 2803 + }, + { + "epoch": 1.2096182876860038, + "grad_norm": 0.19022980332374573, + "learning_rate": 9.845523193935241e-05, + "loss": 2.1276, + "step": 2804 + }, + { + "epoch": 1.210049601035152, + "grad_norm": 0.19845323264598846, + "learning_rate": 9.842281341055484e-05, + "loss": 2.1331, + "step": 2805 + }, + { + "epoch": 1.2104809143843003, + "grad_norm": 0.17411771416664124, + "learning_rate": 9.839039003180756e-05, + "loss": 2.3304, + "step": 2806 + }, + { + "epoch": 1.2109122277334483, + "grad_norm": 0.16950243711471558, + "learning_rate": 9.835796180982428e-05, + "loss": 1.9149, + "step": 2807 + }, + { + "epoch": 1.2113435410825966, + "grad_norm": 0.1674221009016037, + "learning_rate": 9.832552875131955e-05, + "loss": 2.0784, + "step": 2808 + }, + { + "epoch": 1.2117748544317446, + "grad_norm": 0.16383004188537598, + "learning_rate": 9.829309086300901e-05, + "loss": 2.1236, + "step": 2809 + }, + { + "epoch": 1.2122061677808929, + "grad_norm": 0.19841207563877106, + "learning_rate": 9.826064815160927e-05, + "loss": 2.2611, + "step": 2810 + }, + { + "epoch": 1.2126374811300409, + "grad_norm": 0.17123110592365265, + "learning_rate": 9.822820062383797e-05, + "loss": 2.1434, + "step": 2811 + }, + { + "epoch": 1.2130687944791891, + "grad_norm": 0.18227308988571167, + "learning_rate": 9.819574828641371e-05, + "loss": 2.2398, + "step": 2812 + }, + { + "epoch": 1.2135001078283372, + "grad_norm": 0.16920961439609528, + "learning_rate": 9.816329114605609e-05, + "loss": 2.165, + "step": 2813 + }, + { + "epoch": 1.2139314211774854, + "grad_norm": 0.1832374781370163, + "learning_rate": 9.813082920948575e-05, + "loss": 1.884, + "step": 2814 + }, + { + "epoch": 1.2143627345266337, + "grad_norm": 0.16949114203453064, + "learning_rate": 9.809836248342424e-05, + "loss": 2.308, + "step": 2815 + }, + { + "epoch": 1.2147940478757817, + "grad_norm": 0.17201551795005798, + "learning_rate": 9.80658909745942e-05, + "loss": 2.2275, + "step": 2816 + }, + { + "epoch": 1.21522536122493, + "grad_norm": 0.2006232738494873, + "learning_rate": 9.803341468971918e-05, + "loss": 2.1329, + "step": 2817 + }, + { + "epoch": 1.215656674574078, + "grad_norm": 0.16852867603302002, + "learning_rate": 9.800093363552379e-05, + "loss": 2.1475, + "step": 2818 + }, + { + "epoch": 1.2160879879232263, + "grad_norm": 0.16948576271533966, + "learning_rate": 9.79684478187335e-05, + "loss": 2.1423, + "step": 2819 + }, + { + "epoch": 1.2165193012723743, + "grad_norm": 0.17116281390190125, + "learning_rate": 9.793595724607493e-05, + "loss": 1.9302, + "step": 2820 + }, + { + "epoch": 1.2169506146215225, + "grad_norm": 0.17152751982212067, + "learning_rate": 9.790346192427563e-05, + "loss": 2.09, + "step": 2821 + }, + { + "epoch": 1.2173819279706706, + "grad_norm": 0.18134663999080658, + "learning_rate": 9.787096186006403e-05, + "loss": 2.1829, + "step": 2822 + }, + { + "epoch": 1.2178132413198188, + "grad_norm": 0.1795966625213623, + "learning_rate": 9.78384570601697e-05, + "loss": 2.343, + "step": 2823 + }, + { + "epoch": 1.218244554668967, + "grad_norm": 0.17307963967323303, + "learning_rate": 9.780594753132303e-05, + "loss": 2.1866, + "step": 2824 + }, + { + "epoch": 1.2186758680181151, + "grad_norm": 0.18871726095676422, + "learning_rate": 9.777343328025555e-05, + "loss": 2.3792, + "step": 2825 + }, + { + "epoch": 1.2186758680181151, + "eval_loss": 2.101025104522705, + "eval_runtime": 208.0421, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 2825 + }, + { + "epoch": 1.2191071813672634, + "grad_norm": 0.19461756944656372, + "learning_rate": 9.774091431369966e-05, + "loss": 2.2572, + "step": 2826 + }, + { + "epoch": 1.2195384947164114, + "grad_norm": 0.16747793555259705, + "learning_rate": 9.770839063838876e-05, + "loss": 2.2002, + "step": 2827 + }, + { + "epoch": 1.2199698080655597, + "grad_norm": 0.19756019115447998, + "learning_rate": 9.767586226105725e-05, + "loss": 2.1019, + "step": 2828 + }, + { + "epoch": 1.2204011214147077, + "grad_norm": 0.1785685271024704, + "learning_rate": 9.764332918844048e-05, + "loss": 2.2182, + "step": 2829 + }, + { + "epoch": 1.220832434763856, + "grad_norm": 0.17922717332839966, + "learning_rate": 9.761079142727479e-05, + "loss": 2.2225, + "step": 2830 + }, + { + "epoch": 1.221263748113004, + "grad_norm": 0.2177387773990631, + "learning_rate": 9.757824898429743e-05, + "loss": 2.2951, + "step": 2831 + }, + { + "epoch": 1.2216950614621522, + "grad_norm": 0.17728032171726227, + "learning_rate": 9.754570186624675e-05, + "loss": 2.3344, + "step": 2832 + }, + { + "epoch": 1.2221263748113005, + "grad_norm": 0.16096818447113037, + "learning_rate": 9.751315007986194e-05, + "loss": 1.9994, + "step": 2833 + }, + { + "epoch": 1.2225576881604485, + "grad_norm": 0.21062934398651123, + "learning_rate": 9.748059363188317e-05, + "loss": 2.2896, + "step": 2834 + }, + { + "epoch": 1.2229890015095968, + "grad_norm": 0.1778430938720703, + "learning_rate": 9.744803252905168e-05, + "loss": 2.3748, + "step": 2835 + }, + { + "epoch": 1.2234203148587448, + "grad_norm": 0.17728528380393982, + "learning_rate": 9.741546677810955e-05, + "loss": 2.2469, + "step": 2836 + }, + { + "epoch": 1.223851628207893, + "grad_norm": 0.1769222915172577, + "learning_rate": 9.738289638579991e-05, + "loss": 2.2029, + "step": 2837 + }, + { + "epoch": 1.224282941557041, + "grad_norm": 0.15365712344646454, + "learning_rate": 9.735032135886678e-05, + "loss": 2.1346, + "step": 2838 + }, + { + "epoch": 1.2247142549061893, + "grad_norm": 0.22072498500347137, + "learning_rate": 9.731774170405518e-05, + "loss": 1.7912, + "step": 2839 + }, + { + "epoch": 1.2251455682553374, + "grad_norm": 0.16984528303146362, + "learning_rate": 9.728515742811113e-05, + "loss": 2.2203, + "step": 2840 + }, + { + "epoch": 1.2255768816044856, + "grad_norm": 0.20495721697807312, + "learning_rate": 9.725256853778152e-05, + "loss": 2.1872, + "step": 2841 + }, + { + "epoch": 1.226008194953634, + "grad_norm": 0.16966770589351654, + "learning_rate": 9.721997503981423e-05, + "loss": 2.0506, + "step": 2842 + }, + { + "epoch": 1.226439508302782, + "grad_norm": 0.18237991631031036, + "learning_rate": 9.718737694095812e-05, + "loss": 2.1769, + "step": 2843 + }, + { + "epoch": 1.2268708216519302, + "grad_norm": 0.1720055341720581, + "learning_rate": 9.7154774247963e-05, + "loss": 2.2018, + "step": 2844 + }, + { + "epoch": 1.2273021350010782, + "grad_norm": 0.18814405798912048, + "learning_rate": 9.712216696757956e-05, + "loss": 2.2869, + "step": 2845 + }, + { + "epoch": 1.2277334483502265, + "grad_norm": 0.17777247726917267, + "learning_rate": 9.708955510655955e-05, + "loss": 2.2798, + "step": 2846 + }, + { + "epoch": 1.2281647616993747, + "grad_norm": 0.1642644703388214, + "learning_rate": 9.705693867165557e-05, + "loss": 2.2026, + "step": 2847 + }, + { + "epoch": 1.2285960750485228, + "grad_norm": 0.16237300634384155, + "learning_rate": 9.702431766962124e-05, + "loss": 2.2696, + "step": 2848 + }, + { + "epoch": 1.2290273883976708, + "grad_norm": 0.19161173701286316, + "learning_rate": 9.699169210721107e-05, + "loss": 2.3309, + "step": 2849 + }, + { + "epoch": 1.229458701746819, + "grad_norm": 0.17122620344161987, + "learning_rate": 9.695906199118056e-05, + "loss": 2.2177, + "step": 2850 + }, + { + "epoch": 1.229458701746819, + "eval_loss": 2.100764036178589, + "eval_runtime": 208.0607, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 2850 + }, + { + "epoch": 1.2298900150959673, + "grad_norm": 0.18159905076026917, + "learning_rate": 9.692642732828612e-05, + "loss": 2.3079, + "step": 2851 + }, + { + "epoch": 1.2303213284451153, + "grad_norm": 0.1928827464580536, + "learning_rate": 9.68937881252851e-05, + "loss": 2.2603, + "step": 2852 + }, + { + "epoch": 1.2307526417942636, + "grad_norm": 0.20083266496658325, + "learning_rate": 9.686114438893586e-05, + "loss": 2.3624, + "step": 2853 + }, + { + "epoch": 1.2311839551434116, + "grad_norm": 0.16932237148284912, + "learning_rate": 9.682849612599759e-05, + "loss": 2.1182, + "step": 2854 + }, + { + "epoch": 1.2316152684925599, + "grad_norm": 0.17478060722351074, + "learning_rate": 9.679584334323047e-05, + "loss": 2.2539, + "step": 2855 + }, + { + "epoch": 1.2320465818417081, + "grad_norm": 0.18880511820316315, + "learning_rate": 9.676318604739565e-05, + "loss": 2.1343, + "step": 2856 + }, + { + "epoch": 1.2324778951908562, + "grad_norm": 0.1596124768257141, + "learning_rate": 9.673052424525515e-05, + "loss": 2.1123, + "step": 2857 + }, + { + "epoch": 1.2329092085400044, + "grad_norm": 0.17469385266304016, + "learning_rate": 9.669785794357198e-05, + "loss": 2.152, + "step": 2858 + }, + { + "epoch": 1.2333405218891524, + "grad_norm": 0.20866814255714417, + "learning_rate": 9.666518714911001e-05, + "loss": 2.0822, + "step": 2859 + }, + { + "epoch": 1.2337718352383007, + "grad_norm": 0.19376201927661896, + "learning_rate": 9.663251186863415e-05, + "loss": 2.1318, + "step": 2860 + }, + { + "epoch": 1.2342031485874487, + "grad_norm": 0.1695946455001831, + "learning_rate": 9.659983210891011e-05, + "loss": 2.1996, + "step": 2861 + }, + { + "epoch": 1.234634461936597, + "grad_norm": 0.16476131975650787, + "learning_rate": 9.656714787670463e-05, + "loss": 2.1671, + "step": 2862 + }, + { + "epoch": 1.235065775285745, + "grad_norm": 0.17561404407024384, + "learning_rate": 9.653445917878532e-05, + "loss": 2.3061, + "step": 2863 + }, + { + "epoch": 1.2354970886348933, + "grad_norm": 0.18235552310943604, + "learning_rate": 9.650176602192076e-05, + "loss": 2.2073, + "step": 2864 + }, + { + "epoch": 1.2359284019840415, + "grad_norm": 0.16104130446910858, + "learning_rate": 9.646906841288038e-05, + "loss": 2.0979, + "step": 2865 + }, + { + "epoch": 1.2363597153331896, + "grad_norm": 0.344385027885437, + "learning_rate": 9.643636635843462e-05, + "loss": 2.0115, + "step": 2866 + }, + { + "epoch": 1.2367910286823378, + "grad_norm": 0.1773248314857483, + "learning_rate": 9.64036598653548e-05, + "loss": 2.1553, + "step": 2867 + }, + { + "epoch": 1.2372223420314858, + "grad_norm": 0.1927177459001541, + "learning_rate": 9.637094894041308e-05, + "loss": 2.3593, + "step": 2868 + }, + { + "epoch": 1.237653655380634, + "grad_norm": 0.17691570520401, + "learning_rate": 9.633823359038273e-05, + "loss": 2.2298, + "step": 2869 + }, + { + "epoch": 1.2380849687297821, + "grad_norm": 0.18220536410808563, + "learning_rate": 9.630551382203773e-05, + "loss": 2.1005, + "step": 2870 + }, + { + "epoch": 1.2385162820789304, + "grad_norm": 0.17187197506427765, + "learning_rate": 9.627278964215313e-05, + "loss": 2.2041, + "step": 2871 + }, + { + "epoch": 1.2389475954280784, + "grad_norm": 0.18310484290122986, + "learning_rate": 9.624006105750477e-05, + "loss": 2.3803, + "step": 2872 + }, + { + "epoch": 1.2393789087772267, + "grad_norm": 0.18291732668876648, + "learning_rate": 9.62073280748695e-05, + "loss": 2.2872, + "step": 2873 + }, + { + "epoch": 1.239810222126375, + "grad_norm": 0.18414902687072754, + "learning_rate": 9.617459070102504e-05, + "loss": 2.3016, + "step": 2874 + }, + { + "epoch": 1.240241535475523, + "grad_norm": 0.17588050663471222, + "learning_rate": 9.614184894275e-05, + "loss": 2.2289, + "step": 2875 + }, + { + "epoch": 1.240241535475523, + "eval_loss": 2.1006932258605957, + "eval_runtime": 208.1806, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 2875 + }, + { + "epoch": 1.2406728488246712, + "grad_norm": 0.1735721081495285, + "learning_rate": 9.610910280682398e-05, + "loss": 2.1857, + "step": 2876 + }, + { + "epoch": 1.2411041621738192, + "grad_norm": 0.22414951026439667, + "learning_rate": 9.607635230002733e-05, + "loss": 2.3306, + "step": 2877 + }, + { + "epoch": 1.2415354755229675, + "grad_norm": 0.16612106561660767, + "learning_rate": 9.604359742914145e-05, + "loss": 2.1791, + "step": 2878 + }, + { + "epoch": 1.2419667888721155, + "grad_norm": 0.17590609192848206, + "learning_rate": 9.601083820094862e-05, + "loss": 2.1513, + "step": 2879 + }, + { + "epoch": 1.2423981022212638, + "grad_norm": 0.18331202864646912, + "learning_rate": 9.597807462223195e-05, + "loss": 2.1928, + "step": 2880 + }, + { + "epoch": 1.2428294155704118, + "grad_norm": 0.18726730346679688, + "learning_rate": 9.594530669977552e-05, + "loss": 2.2287, + "step": 2881 + }, + { + "epoch": 1.24326072891956, + "grad_norm": 0.2933848798274994, + "learning_rate": 9.591253444036426e-05, + "loss": 2.2461, + "step": 2882 + }, + { + "epoch": 1.2436920422687083, + "grad_norm": 0.2122485637664795, + "learning_rate": 9.587975785078409e-05, + "loss": 2.3764, + "step": 2883 + }, + { + "epoch": 1.2441233556178564, + "grad_norm": 0.18504616618156433, + "learning_rate": 9.584697693782171e-05, + "loss": 2.2626, + "step": 2884 + }, + { + "epoch": 1.2445546689670046, + "grad_norm": 0.16713541746139526, + "learning_rate": 9.581419170826476e-05, + "loss": 2.056, + "step": 2885 + }, + { + "epoch": 1.2449859823161527, + "grad_norm": 0.20109900832176208, + "learning_rate": 9.57814021689018e-05, + "loss": 2.0518, + "step": 2886 + }, + { + "epoch": 1.245417295665301, + "grad_norm": 0.19723549485206604, + "learning_rate": 9.574860832652229e-05, + "loss": 2.1423, + "step": 2887 + }, + { + "epoch": 1.245848609014449, + "grad_norm": 0.19441846013069153, + "learning_rate": 9.57158101879165e-05, + "loss": 2.1949, + "step": 2888 + }, + { + "epoch": 1.2462799223635972, + "grad_norm": 0.18298396468162537, + "learning_rate": 9.568300775987565e-05, + "loss": 2.2847, + "step": 2889 + }, + { + "epoch": 1.2467112357127452, + "grad_norm": 0.18244996666908264, + "learning_rate": 9.565020104919188e-05, + "loss": 2.1066, + "step": 2890 + }, + { + "epoch": 1.2471425490618935, + "grad_norm": 0.20596863329410553, + "learning_rate": 9.561739006265813e-05, + "loss": 2.1475, + "step": 2891 + }, + { + "epoch": 1.2475738624110417, + "grad_norm": 0.2012377828359604, + "learning_rate": 9.558457480706834e-05, + "loss": 2.3096, + "step": 2892 + }, + { + "epoch": 1.2480051757601898, + "grad_norm": 0.17559053003787994, + "learning_rate": 9.55517552892172e-05, + "loss": 2.2434, + "step": 2893 + }, + { + "epoch": 1.248436489109338, + "grad_norm": 0.1771698296070099, + "learning_rate": 9.551893151590039e-05, + "loss": 2.1215, + "step": 2894 + }, + { + "epoch": 1.248867802458486, + "grad_norm": 0.16106769442558289, + "learning_rate": 9.548610349391441e-05, + "loss": 2.1251, + "step": 2895 + }, + { + "epoch": 1.2492991158076343, + "grad_norm": 0.2000650316476822, + "learning_rate": 9.545327123005667e-05, + "loss": 2.0942, + "step": 2896 + }, + { + "epoch": 1.2497304291567823, + "grad_norm": 0.17449326813220978, + "learning_rate": 9.542043473112544e-05, + "loss": 2.1445, + "step": 2897 + }, + { + "epoch": 1.2501617425059306, + "grad_norm": 0.2125837802886963, + "learning_rate": 9.538759400391989e-05, + "loss": 2.1557, + "step": 2898 + }, + { + "epoch": 1.2505930558550786, + "grad_norm": 0.2138071060180664, + "learning_rate": 9.535474905524007e-05, + "loss": 1.7936, + "step": 2899 + }, + { + "epoch": 1.2510243692042269, + "grad_norm": 0.17955918610095978, + "learning_rate": 9.532189989188683e-05, + "loss": 2.1389, + "step": 2900 + }, + { + "epoch": 1.2510243692042269, + "eval_loss": 2.099691867828369, + "eval_runtime": 208.0343, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 2900 + }, + { + "epoch": 1.2514556825533751, + "grad_norm": 0.16655541956424713, + "learning_rate": 9.528904652066199e-05, + "loss": 2.0131, + "step": 2901 + }, + { + "epoch": 1.2518869959025232, + "grad_norm": 0.16214993596076965, + "learning_rate": 9.52561889483682e-05, + "loss": 2.2244, + "step": 2902 + }, + { + "epoch": 1.2523183092516712, + "grad_norm": 0.17968380451202393, + "learning_rate": 9.522332718180896e-05, + "loss": 2.2194, + "step": 2903 + }, + { + "epoch": 1.2527496226008195, + "grad_norm": 0.1735077202320099, + "learning_rate": 9.519046122778868e-05, + "loss": 2.0717, + "step": 2904 + }, + { + "epoch": 1.2531809359499677, + "grad_norm": 0.1795034110546112, + "learning_rate": 9.515759109311258e-05, + "loss": 2.4281, + "step": 2905 + }, + { + "epoch": 1.2536122492991157, + "grad_norm": 0.17288808524608612, + "learning_rate": 9.512471678458684e-05, + "loss": 2.1008, + "step": 2906 + }, + { + "epoch": 1.254043562648264, + "grad_norm": 0.17616263031959534, + "learning_rate": 9.509183830901838e-05, + "loss": 2.1797, + "step": 2907 + }, + { + "epoch": 1.254474875997412, + "grad_norm": 0.16509383916854858, + "learning_rate": 9.50589556732151e-05, + "loss": 2.1597, + "step": 2908 + }, + { + "epoch": 1.2549061893465603, + "grad_norm": 0.18447110056877136, + "learning_rate": 9.502606888398567e-05, + "loss": 2.1389, + "step": 2909 + }, + { + "epoch": 1.2553375026957085, + "grad_norm": 0.1685207039117813, + "learning_rate": 9.499317794813968e-05, + "loss": 2.0595, + "step": 2910 + }, + { + "epoch": 1.2557688160448566, + "grad_norm": 0.16968344151973724, + "learning_rate": 9.496028287248756e-05, + "loss": 1.8777, + "step": 2911 + }, + { + "epoch": 1.2562001293940048, + "grad_norm": 0.19373373687267303, + "learning_rate": 9.492738366384059e-05, + "loss": 2.2779, + "step": 2912 + }, + { + "epoch": 1.2566314427431529, + "grad_norm": 0.19972391426563263, + "learning_rate": 9.489448032901089e-05, + "loss": 2.1135, + "step": 2913 + }, + { + "epoch": 1.2570627560923011, + "grad_norm": 0.17108936607837677, + "learning_rate": 9.486157287481148e-05, + "loss": 2.1786, + "step": 2914 + }, + { + "epoch": 1.2574940694414491, + "grad_norm": 0.18781358003616333, + "learning_rate": 9.482866130805623e-05, + "loss": 2.2741, + "step": 2915 + }, + { + "epoch": 1.2579253827905974, + "grad_norm": 0.17684100568294525, + "learning_rate": 9.47957456355598e-05, + "loss": 2.0681, + "step": 2916 + }, + { + "epoch": 1.2583566961397454, + "grad_norm": 0.17751702666282654, + "learning_rate": 9.476282586413774e-05, + "loss": 2.1477, + "step": 2917 + }, + { + "epoch": 1.2587880094888937, + "grad_norm": 0.16708078980445862, + "learning_rate": 9.472990200060648e-05, + "loss": 2.1511, + "step": 2918 + }, + { + "epoch": 1.259219322838042, + "grad_norm": 0.1751113384962082, + "learning_rate": 9.469697405178324e-05, + "loss": 2.0367, + "step": 2919 + }, + { + "epoch": 1.25965063618719, + "grad_norm": 0.1651589274406433, + "learning_rate": 9.466404202448613e-05, + "loss": 2.0795, + "step": 2920 + }, + { + "epoch": 1.2600819495363382, + "grad_norm": 0.16996340453624725, + "learning_rate": 9.463110592553405e-05, + "loss": 2.1443, + "step": 2921 + }, + { + "epoch": 1.2605132628854863, + "grad_norm": 0.20765995979309082, + "learning_rate": 9.459816576174685e-05, + "loss": 2.1737, + "step": 2922 + }, + { + "epoch": 1.2609445762346345, + "grad_norm": 0.21429865062236786, + "learning_rate": 9.456522153994505e-05, + "loss": 2.3622, + "step": 2923 + }, + { + "epoch": 1.2613758895837826, + "grad_norm": 0.19938156008720398, + "learning_rate": 9.453227326695021e-05, + "loss": 2.1501, + "step": 2924 + }, + { + "epoch": 1.2618072029329308, + "grad_norm": 0.17281770706176758, + "learning_rate": 9.449932094958458e-05, + "loss": 2.2994, + "step": 2925 + }, + { + "epoch": 1.2618072029329308, + "eval_loss": 2.097632646560669, + "eval_runtime": 208.7814, + "eval_samples_per_second": 0.153, + "eval_steps_per_second": 0.153, + "step": 2925 + }, + { + "epoch": 1.2622385162820788, + "grad_norm": 0.19390475749969482, + "learning_rate": 9.446636459467131e-05, + "loss": 2.1672, + "step": 2926 + }, + { + "epoch": 1.262669829631227, + "grad_norm": 0.18728719651699066, + "learning_rate": 9.443340420903438e-05, + "loss": 2.2925, + "step": 2927 + }, + { + "epoch": 1.2631011429803753, + "grad_norm": 0.17728185653686523, + "learning_rate": 9.440043979949857e-05, + "loss": 2.2727, + "step": 2928 + }, + { + "epoch": 1.2635324563295234, + "grad_norm": 0.17605188488960266, + "learning_rate": 9.436747137288955e-05, + "loss": 2.2959, + "step": 2929 + }, + { + "epoch": 1.2639637696786716, + "grad_norm": 0.18089932203292847, + "learning_rate": 9.433449893603376e-05, + "loss": 2.1482, + "step": 2930 + }, + { + "epoch": 1.2643950830278197, + "grad_norm": 0.17335166037082672, + "learning_rate": 9.430152249575855e-05, + "loss": 2.3316, + "step": 2931 + }, + { + "epoch": 1.264826396376968, + "grad_norm": 0.1741197407245636, + "learning_rate": 9.426854205889202e-05, + "loss": 2.2759, + "step": 2932 + }, + { + "epoch": 1.265257709726116, + "grad_norm": 0.1794310063123703, + "learning_rate": 9.423555763226314e-05, + "loss": 2.2444, + "step": 2933 + }, + { + "epoch": 1.2656890230752642, + "grad_norm": 0.18344342708587646, + "learning_rate": 9.420256922270169e-05, + "loss": 2.4161, + "step": 2934 + }, + { + "epoch": 1.2661203364244122, + "grad_norm": 0.19544778764247894, + "learning_rate": 9.416957683703831e-05, + "loss": 2.1683, + "step": 2935 + }, + { + "epoch": 1.2665516497735605, + "grad_norm": 0.16360361874103546, + "learning_rate": 9.413658048210441e-05, + "loss": 2.0059, + "step": 2936 + }, + { + "epoch": 1.2669829631227087, + "grad_norm": 0.1777682900428772, + "learning_rate": 9.410358016473222e-05, + "loss": 2.1029, + "step": 2937 + }, + { + "epoch": 1.2674142764718568, + "grad_norm": 0.18484915792942047, + "learning_rate": 9.407057589175484e-05, + "loss": 2.3951, + "step": 2938 + }, + { + "epoch": 1.267845589821005, + "grad_norm": 0.19149358570575714, + "learning_rate": 9.40375676700062e-05, + "loss": 2.0752, + "step": 2939 + }, + { + "epoch": 1.268276903170153, + "grad_norm": 0.1761498898267746, + "learning_rate": 9.400455550632097e-05, + "loss": 2.2717, + "step": 2940 + }, + { + "epoch": 1.2687082165193013, + "grad_norm": 0.1742338240146637, + "learning_rate": 9.397153940753468e-05, + "loss": 2.3397, + "step": 2941 + }, + { + "epoch": 1.2691395298684494, + "grad_norm": 0.18861375749111176, + "learning_rate": 9.393851938048371e-05, + "loss": 2.1523, + "step": 2942 + }, + { + "epoch": 1.2695708432175976, + "grad_norm": 0.16949686408042908, + "learning_rate": 9.390549543200518e-05, + "loss": 2.2293, + "step": 2943 + }, + { + "epoch": 1.2700021565667456, + "grad_norm": 0.17178656160831451, + "learning_rate": 9.387246756893706e-05, + "loss": 2.1625, + "step": 2944 + }, + { + "epoch": 1.270433469915894, + "grad_norm": 0.1995915025472641, + "learning_rate": 9.383943579811817e-05, + "loss": 2.0885, + "step": 2945 + }, + { + "epoch": 1.2708647832650422, + "grad_norm": 0.20199699699878693, + "learning_rate": 9.380640012638805e-05, + "loss": 2.2046, + "step": 2946 + }, + { + "epoch": 1.2712960966141902, + "grad_norm": 0.1766977161169052, + "learning_rate": 9.377336056058714e-05, + "loss": 2.1567, + "step": 2947 + }, + { + "epoch": 1.2717274099633384, + "grad_norm": 0.16584517061710358, + "learning_rate": 9.374031710755662e-05, + "loss": 2.2226, + "step": 2948 + }, + { + "epoch": 1.2721587233124865, + "grad_norm": 0.1809447705745697, + "learning_rate": 9.370726977413851e-05, + "loss": 2.128, + "step": 2949 + }, + { + "epoch": 1.2725900366616347, + "grad_norm": 0.19714750349521637, + "learning_rate": 9.36742185671756e-05, + "loss": 2.2823, + "step": 2950 + }, + { + "epoch": 1.2725900366616347, + "eval_loss": 2.097557544708252, + "eval_runtime": 203.2645, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 2950 + }, + { + "epoch": 1.2730213500107828, + "grad_norm": 0.18128915131092072, + "learning_rate": 9.364116349351153e-05, + "loss": 2.2105, + "step": 2951 + }, + { + "epoch": 1.273452663359931, + "grad_norm": 0.19274786114692688, + "learning_rate": 9.360810455999067e-05, + "loss": 2.1284, + "step": 2952 + }, + { + "epoch": 1.273883976709079, + "grad_norm": 0.18371281027793884, + "learning_rate": 9.357504177345829e-05, + "loss": 2.2204, + "step": 2953 + }, + { + "epoch": 1.2743152900582273, + "grad_norm": 0.1810397356748581, + "learning_rate": 9.354197514076038e-05, + "loss": 2.2558, + "step": 2954 + }, + { + "epoch": 1.2747466034073756, + "grad_norm": 0.1709802746772766, + "learning_rate": 9.350890466874374e-05, + "loss": 2.0882, + "step": 2955 + }, + { + "epoch": 1.2751779167565236, + "grad_norm": 0.1786419302225113, + "learning_rate": 9.347583036425597e-05, + "loss": 2.1403, + "step": 2956 + }, + { + "epoch": 1.2756092301056718, + "grad_norm": 0.1712750643491745, + "learning_rate": 9.34427522341455e-05, + "loss": 2.2502, + "step": 2957 + }, + { + "epoch": 1.2760405434548199, + "grad_norm": 0.1820487231016159, + "learning_rate": 9.340967028526148e-05, + "loss": 2.138, + "step": 2958 + }, + { + "epoch": 1.2764718568039681, + "grad_norm": 0.16608822345733643, + "learning_rate": 9.337658452445391e-05, + "loss": 2.2128, + "step": 2959 + }, + { + "epoch": 1.2769031701531162, + "grad_norm": 0.20153141021728516, + "learning_rate": 9.334349495857353e-05, + "loss": 2.1863, + "step": 2960 + }, + { + "epoch": 1.2773344835022644, + "grad_norm": 0.18792590498924255, + "learning_rate": 9.331040159447194e-05, + "loss": 2.1333, + "step": 2961 + }, + { + "epoch": 1.2777657968514124, + "grad_norm": 0.1876603364944458, + "learning_rate": 9.327730443900146e-05, + "loss": 2.2312, + "step": 2962 + }, + { + "epoch": 1.2781971102005607, + "grad_norm": 0.17630432546138763, + "learning_rate": 9.32442034990152e-05, + "loss": 2.2698, + "step": 2963 + }, + { + "epoch": 1.278628423549709, + "grad_norm": 0.17087142169475555, + "learning_rate": 9.321109878136709e-05, + "loss": 2.0537, + "step": 2964 + }, + { + "epoch": 1.279059736898857, + "grad_norm": 0.20455554127693176, + "learning_rate": 9.317799029291185e-05, + "loss": 2.2909, + "step": 2965 + }, + { + "epoch": 1.2794910502480052, + "grad_norm": 0.1775055080652237, + "learning_rate": 9.31448780405049e-05, + "loss": 2.3168, + "step": 2966 + }, + { + "epoch": 1.2799223635971533, + "grad_norm": 0.18382766842842102, + "learning_rate": 9.311176203100253e-05, + "loss": 2.3394, + "step": 2967 + }, + { + "epoch": 1.2803536769463015, + "grad_norm": 0.38945379853248596, + "learning_rate": 9.307864227126179e-05, + "loss": 2.3609, + "step": 2968 + }, + { + "epoch": 1.2807849902954496, + "grad_norm": 0.1864595115184784, + "learning_rate": 9.304551876814043e-05, + "loss": 2.0455, + "step": 2969 + }, + { + "epoch": 1.2812163036445978, + "grad_norm": 0.19058941304683685, + "learning_rate": 9.301239152849708e-05, + "loss": 1.9847, + "step": 2970 + }, + { + "epoch": 1.2816476169937459, + "grad_norm": 0.17958027124404907, + "learning_rate": 9.297926055919109e-05, + "loss": 2.5394, + "step": 2971 + }, + { + "epoch": 1.282078930342894, + "grad_norm": 0.18691030144691467, + "learning_rate": 9.294612586708257e-05, + "loss": 2.3476, + "step": 2972 + }, + { + "epoch": 1.2825102436920424, + "grad_norm": 0.17819638550281525, + "learning_rate": 9.291298745903245e-05, + "loss": 2.4438, + "step": 2973 + }, + { + "epoch": 1.2829415570411904, + "grad_norm": 0.18015502393245697, + "learning_rate": 9.287984534190237e-05, + "loss": 2.1254, + "step": 2974 + }, + { + "epoch": 1.2833728703903386, + "grad_norm": 0.21266379952430725, + "learning_rate": 9.28466995225548e-05, + "loss": 1.987, + "step": 2975 + }, + { + "epoch": 1.2833728703903386, + "eval_loss": 2.0974252223968506, + "eval_runtime": 202.1486, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 2975 + }, + { + "epoch": 1.2838041837394867, + "grad_norm": 0.18659232556819916, + "learning_rate": 9.28135500078529e-05, + "loss": 2.0334, + "step": 2976 + }, + { + "epoch": 1.284235497088635, + "grad_norm": 0.1787107288837433, + "learning_rate": 9.278039680466068e-05, + "loss": 2.1156, + "step": 2977 + }, + { + "epoch": 1.284666810437783, + "grad_norm": 0.17784705758094788, + "learning_rate": 9.274723991984285e-05, + "loss": 2.2095, + "step": 2978 + }, + { + "epoch": 1.2850981237869312, + "grad_norm": 0.18107645213603973, + "learning_rate": 9.271407936026492e-05, + "loss": 2.2255, + "step": 2979 + }, + { + "epoch": 1.2855294371360793, + "grad_norm": 0.18629951775074005, + "learning_rate": 9.268091513279314e-05, + "loss": 2.3375, + "step": 2980 + }, + { + "epoch": 1.2859607504852275, + "grad_norm": 0.1789446324110031, + "learning_rate": 9.264774724429454e-05, + "loss": 2.2594, + "step": 2981 + }, + { + "epoch": 1.2863920638343758, + "grad_norm": 0.17903368175029755, + "learning_rate": 9.261457570163685e-05, + "loss": 2.2187, + "step": 2982 + }, + { + "epoch": 1.2868233771835238, + "grad_norm": 0.18009735643863678, + "learning_rate": 9.258140051168865e-05, + "loss": 2.2597, + "step": 2983 + }, + { + "epoch": 1.287254690532672, + "grad_norm": 0.17903639376163483, + "learning_rate": 9.25482216813192e-05, + "loss": 2.3297, + "step": 2984 + }, + { + "epoch": 1.28768600388182, + "grad_norm": 0.169985830783844, + "learning_rate": 9.251503921739855e-05, + "loss": 2.207, + "step": 2985 + }, + { + "epoch": 1.2881173172309683, + "grad_norm": 0.17052622139453888, + "learning_rate": 9.248185312679749e-05, + "loss": 2.292, + "step": 2986 + }, + { + "epoch": 1.2885486305801164, + "grad_norm": 0.17440353333950043, + "learning_rate": 9.244866341638759e-05, + "loss": 2.2154, + "step": 2987 + }, + { + "epoch": 1.2889799439292646, + "grad_norm": 0.16670095920562744, + "learning_rate": 9.241547009304106e-05, + "loss": 1.9959, + "step": 2988 + }, + { + "epoch": 1.2894112572784127, + "grad_norm": 0.17001476883888245, + "learning_rate": 9.238227316363104e-05, + "loss": 2.269, + "step": 2989 + }, + { + "epoch": 1.289842570627561, + "grad_norm": 0.162721186876297, + "learning_rate": 9.234907263503124e-05, + "loss": 2.0803, + "step": 2990 + }, + { + "epoch": 1.2902738839767092, + "grad_norm": 0.17348825931549072, + "learning_rate": 9.231586851411626e-05, + "loss": 2.0746, + "step": 2991 + }, + { + "epoch": 1.2907051973258572, + "grad_norm": 0.17610464990139008, + "learning_rate": 9.228266080776129e-05, + "loss": 2.127, + "step": 2992 + }, + { + "epoch": 1.2911365106750055, + "grad_norm": 0.18986164033412933, + "learning_rate": 9.224944952284243e-05, + "loss": 2.3315, + "step": 2993 + }, + { + "epoch": 1.2915678240241535, + "grad_norm": 0.18086329102516174, + "learning_rate": 9.221623466623642e-05, + "loss": 2.1828, + "step": 2994 + }, + { + "epoch": 1.2919991373733017, + "grad_norm": 0.1708398312330246, + "learning_rate": 9.218301624482074e-05, + "loss": 2.2815, + "step": 2995 + }, + { + "epoch": 1.29243045072245, + "grad_norm": 0.18381886184215546, + "learning_rate": 9.214979426547364e-05, + "loss": 2.3363, + "step": 2996 + }, + { + "epoch": 1.292861764071598, + "grad_norm": 0.1695384830236435, + "learning_rate": 9.211656873507407e-05, + "loss": 2.1781, + "step": 2997 + }, + { + "epoch": 1.293293077420746, + "grad_norm": 0.18817633390426636, + "learning_rate": 9.208333966050178e-05, + "loss": 2.3369, + "step": 2998 + }, + { + "epoch": 1.2937243907698943, + "grad_norm": 0.20488505065441132, + "learning_rate": 9.205010704863718e-05, + "loss": 2.0694, + "step": 2999 + }, + { + "epoch": 1.2941557041190426, + "grad_norm": 0.17080359160900116, + "learning_rate": 9.201687090636147e-05, + "loss": 2.1132, + "step": 3000 + }, + { + "epoch": 1.2941557041190426, + "eval_loss": 2.0969784259796143, + "eval_runtime": 208.7919, + "eval_samples_per_second": 0.153, + "eval_steps_per_second": 0.153, + "step": 3000 + }, + { + "epoch": 1.2945870174681906, + "grad_norm": 3.370145082473755, + "learning_rate": 9.198363124055655e-05, + "loss": 2.1544, + "step": 3001 + }, + { + "epoch": 1.2950183308173389, + "grad_norm": 0.18138986825942993, + "learning_rate": 9.195038805810504e-05, + "loss": 2.0651, + "step": 3002 + }, + { + "epoch": 1.295449644166487, + "grad_norm": 0.16984714567661285, + "learning_rate": 9.191714136589032e-05, + "loss": 2.3752, + "step": 3003 + }, + { + "epoch": 1.2958809575156351, + "grad_norm": 0.19066904485225677, + "learning_rate": 9.188389117079647e-05, + "loss": 2.3016, + "step": 3004 + }, + { + "epoch": 1.2963122708647834, + "grad_norm": 0.38692083954811096, + "learning_rate": 9.185063747970833e-05, + "loss": 2.1112, + "step": 3005 + }, + { + "epoch": 1.2967435842139314, + "grad_norm": 0.16445212066173553, + "learning_rate": 9.18173802995114e-05, + "loss": 2.2301, + "step": 3006 + }, + { + "epoch": 1.2971748975630795, + "grad_norm": 0.17288804054260254, + "learning_rate": 9.178411963709198e-05, + "loss": 2.274, + "step": 3007 + }, + { + "epoch": 1.2976062109122277, + "grad_norm": 0.16802461445331573, + "learning_rate": 9.175085549933706e-05, + "loss": 2.0457, + "step": 3008 + }, + { + "epoch": 1.298037524261376, + "grad_norm": 0.17567449808120728, + "learning_rate": 9.171758789313431e-05, + "loss": 2.1486, + "step": 3009 + }, + { + "epoch": 1.298468837610524, + "grad_norm": 0.17225013673305511, + "learning_rate": 9.168431682537216e-05, + "loss": 2.0606, + "step": 3010 + }, + { + "epoch": 1.2989001509596723, + "grad_norm": 0.17275701463222504, + "learning_rate": 9.165104230293976e-05, + "loss": 2.1349, + "step": 3011 + }, + { + "epoch": 1.2993314643088203, + "grad_norm": 0.18286320567131042, + "learning_rate": 9.161776433272695e-05, + "loss": 2.2249, + "step": 3012 + }, + { + "epoch": 1.2997627776579685, + "grad_norm": 0.18570812046527863, + "learning_rate": 9.158448292162433e-05, + "loss": 2.2915, + "step": 3013 + }, + { + "epoch": 1.3001940910071168, + "grad_norm": 0.1986859291791916, + "learning_rate": 9.155119807652314e-05, + "loss": 2.1963, + "step": 3014 + }, + { + "epoch": 1.3006254043562648, + "grad_norm": 0.17509645223617554, + "learning_rate": 9.151790980431537e-05, + "loss": 2.259, + "step": 3015 + }, + { + "epoch": 1.3010567177054129, + "grad_norm": 0.1850888580083847, + "learning_rate": 9.148461811189375e-05, + "loss": 2.196, + "step": 3016 + }, + { + "epoch": 1.3014880310545611, + "grad_norm": 0.18214569985866547, + "learning_rate": 9.14513230061517e-05, + "loss": 1.9803, + "step": 3017 + }, + { + "epoch": 1.3019193444037094, + "grad_norm": 0.19960394501686096, + "learning_rate": 9.14180244939833e-05, + "loss": 2.2102, + "step": 3018 + }, + { + "epoch": 1.3023506577528574, + "grad_norm": 0.18149137496948242, + "learning_rate": 9.138472258228337e-05, + "loss": 2.2894, + "step": 3019 + }, + { + "epoch": 1.3027819711020057, + "grad_norm": 0.17491424083709717, + "learning_rate": 9.135141727794745e-05, + "loss": 2.0448, + "step": 3020 + }, + { + "epoch": 1.3032132844511537, + "grad_norm": 0.16022880375385284, + "learning_rate": 9.131810858787179e-05, + "loss": 1.9723, + "step": 3021 + }, + { + "epoch": 1.303644597800302, + "grad_norm": 0.17898283898830414, + "learning_rate": 9.128479651895327e-05, + "loss": 2.097, + "step": 3022 + }, + { + "epoch": 1.3040759111494502, + "grad_norm": 0.2150205373764038, + "learning_rate": 9.125148107808955e-05, + "loss": 2.0051, + "step": 3023 + }, + { + "epoch": 1.3045072244985982, + "grad_norm": 0.20482948422431946, + "learning_rate": 9.121816227217895e-05, + "loss": 2.1562, + "step": 3024 + }, + { + "epoch": 1.3049385378477463, + "grad_norm": 0.16900508105754852, + "learning_rate": 9.118484010812051e-05, + "loss": 2.2336, + "step": 3025 + }, + { + "epoch": 1.3049385378477463, + "eval_loss": 2.0978634357452393, + "eval_runtime": 204.1551, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 3025 + }, + { + "epoch": 1.3053698511968945, + "grad_norm": 0.18790045380592346, + "learning_rate": 9.115151459281391e-05, + "loss": 2.2516, + "step": 3026 + }, + { + "epoch": 1.3058011645460428, + "grad_norm": 0.1752679944038391, + "learning_rate": 9.111818573315961e-05, + "loss": 2.111, + "step": 3027 + }, + { + "epoch": 1.3062324778951908, + "grad_norm": 0.25007501244544983, + "learning_rate": 9.108485353605866e-05, + "loss": 2.2764, + "step": 3028 + }, + { + "epoch": 1.306663791244339, + "grad_norm": 0.19131995737552643, + "learning_rate": 9.105151800841288e-05, + "loss": 2.2796, + "step": 3029 + }, + { + "epoch": 1.307095104593487, + "grad_norm": 0.18395444750785828, + "learning_rate": 9.101817915712478e-05, + "loss": 2.27, + "step": 3030 + }, + { + "epoch": 1.3075264179426354, + "grad_norm": 0.17417460680007935, + "learning_rate": 9.09848369890975e-05, + "loss": 2.0593, + "step": 3031 + }, + { + "epoch": 1.3079577312917836, + "grad_norm": 0.18455031514167786, + "learning_rate": 9.095149151123491e-05, + "loss": 2.1211, + "step": 3032 + }, + { + "epoch": 1.3083890446409316, + "grad_norm": 0.16537529230117798, + "learning_rate": 9.091814273044159e-05, + "loss": 2.2592, + "step": 3033 + }, + { + "epoch": 1.3088203579900797, + "grad_norm": 0.19210714101791382, + "learning_rate": 9.088479065362273e-05, + "loss": 2.1661, + "step": 3034 + }, + { + "epoch": 1.309251671339228, + "grad_norm": 0.2028828263282776, + "learning_rate": 9.085143528768424e-05, + "loss": 2.3139, + "step": 3035 + }, + { + "epoch": 1.3096829846883762, + "grad_norm": 0.19314557313919067, + "learning_rate": 9.081807663953272e-05, + "loss": 2.2221, + "step": 3036 + }, + { + "epoch": 1.3101142980375242, + "grad_norm": 0.19766665995121002, + "learning_rate": 9.078471471607547e-05, + "loss": 2.2387, + "step": 3037 + }, + { + "epoch": 1.3105456113866725, + "grad_norm": 0.21651652455329895, + "learning_rate": 9.07513495242204e-05, + "loss": 1.8804, + "step": 3038 + }, + { + "epoch": 1.3109769247358205, + "grad_norm": 0.20336247980594635, + "learning_rate": 9.071798107087614e-05, + "loss": 2.1254, + "step": 3039 + }, + { + "epoch": 1.3114082380849688, + "grad_norm": 0.18429327011108398, + "learning_rate": 9.068460936295205e-05, + "loss": 1.9877, + "step": 3040 + }, + { + "epoch": 1.311839551434117, + "grad_norm": 0.1765599399805069, + "learning_rate": 9.065123440735805e-05, + "loss": 2.0811, + "step": 3041 + }, + { + "epoch": 1.312270864783265, + "grad_norm": 0.16899888217449188, + "learning_rate": 9.061785621100481e-05, + "loss": 2.2005, + "step": 3042 + }, + { + "epoch": 1.312702178132413, + "grad_norm": 0.18764810264110565, + "learning_rate": 9.058447478080366e-05, + "loss": 2.2184, + "step": 3043 + }, + { + "epoch": 1.3131334914815613, + "grad_norm": 0.18549354374408722, + "learning_rate": 9.055109012366655e-05, + "loss": 2.1128, + "step": 3044 + }, + { + "epoch": 1.3135648048307096, + "grad_norm": 0.19809360802173615, + "learning_rate": 9.051770224650617e-05, + "loss": 2.2453, + "step": 3045 + }, + { + "epoch": 1.3139961181798576, + "grad_norm": 0.16052544116973877, + "learning_rate": 9.048431115623585e-05, + "loss": 2.136, + "step": 3046 + }, + { + "epoch": 1.3144274315290059, + "grad_norm": 0.1702003926038742, + "learning_rate": 9.045091685976957e-05, + "loss": 2.0531, + "step": 3047 + }, + { + "epoch": 1.314858744878154, + "grad_norm": 0.1690002977848053, + "learning_rate": 9.041751936402199e-05, + "loss": 2.0023, + "step": 3048 + }, + { + "epoch": 1.3152900582273022, + "grad_norm": 0.1664373129606247, + "learning_rate": 9.038411867590842e-05, + "loss": 2.1576, + "step": 3049 + }, + { + "epoch": 1.3157213715764504, + "grad_norm": 0.18579117953777313, + "learning_rate": 9.035071480234485e-05, + "loss": 2.159, + "step": 3050 + }, + { + "epoch": 1.3157213715764504, + "eval_loss": 2.097856044769287, + "eval_runtime": 203.1222, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 3050 + }, + { + "epoch": 1.3161526849255984, + "grad_norm": 0.16586829721927643, + "learning_rate": 9.03173077502479e-05, + "loss": 1.9904, + "step": 3051 + }, + { + "epoch": 1.3165839982747465, + "grad_norm": 0.1913750171661377, + "learning_rate": 9.028389752653487e-05, + "loss": 2.1929, + "step": 3052 + }, + { + "epoch": 1.3170153116238947, + "grad_norm": 0.17231319844722748, + "learning_rate": 9.02504841381237e-05, + "loss": 2.132, + "step": 3053 + }, + { + "epoch": 1.317446624973043, + "grad_norm": 0.21133653819561005, + "learning_rate": 9.021706759193305e-05, + "loss": 2.2717, + "step": 3054 + }, + { + "epoch": 1.317877938322191, + "grad_norm": 0.17870862782001495, + "learning_rate": 9.018364789488211e-05, + "loss": 2.3588, + "step": 3055 + }, + { + "epoch": 1.3183092516713393, + "grad_norm": 0.18890124559402466, + "learning_rate": 9.015022505389084e-05, + "loss": 2.2494, + "step": 3056 + }, + { + "epoch": 1.3187405650204873, + "grad_norm": 0.1804681122303009, + "learning_rate": 9.01167990758798e-05, + "loss": 2.1849, + "step": 3057 + }, + { + "epoch": 1.3191718783696356, + "grad_norm": 0.17999668419361115, + "learning_rate": 9.00833699677702e-05, + "loss": 2.0232, + "step": 3058 + }, + { + "epoch": 1.3196031917187838, + "grad_norm": 0.16746574640274048, + "learning_rate": 9.004993773648389e-05, + "loss": 1.9494, + "step": 3059 + }, + { + "epoch": 1.3200345050679319, + "grad_norm": 0.209141343832016, + "learning_rate": 9.00165023889434e-05, + "loss": 2.3657, + "step": 3060 + }, + { + "epoch": 1.3204658184170799, + "grad_norm": 0.170698344707489, + "learning_rate": 8.998306393207183e-05, + "loss": 2.1525, + "step": 3061 + }, + { + "epoch": 1.3208971317662281, + "grad_norm": 0.18231551349163055, + "learning_rate": 8.994962237279306e-05, + "loss": 2.3213, + "step": 3062 + }, + { + "epoch": 1.3213284451153764, + "grad_norm": 0.19542478024959564, + "learning_rate": 8.991617771803148e-05, + "loss": 2.1471, + "step": 3063 + }, + { + "epoch": 1.3217597584645244, + "grad_norm": 0.19738048315048218, + "learning_rate": 8.988272997471219e-05, + "loss": 2.1084, + "step": 3064 + }, + { + "epoch": 1.3221910718136727, + "grad_norm": 0.24825488030910492, + "learning_rate": 8.984927914976091e-05, + "loss": 2.1697, + "step": 3065 + }, + { + "epoch": 1.3226223851628207, + "grad_norm": 0.1763385534286499, + "learning_rate": 8.981582525010398e-05, + "loss": 1.9507, + "step": 3066 + }, + { + "epoch": 1.323053698511969, + "grad_norm": 0.16423176229000092, + "learning_rate": 8.97823682826684e-05, + "loss": 2.2201, + "step": 3067 + }, + { + "epoch": 1.3234850118611172, + "grad_norm": 0.17444059252738953, + "learning_rate": 8.974890825438183e-05, + "loss": 2.2739, + "step": 3068 + }, + { + "epoch": 1.3239163252102653, + "grad_norm": 0.204995796084404, + "learning_rate": 8.97154451721725e-05, + "loss": 2.2925, + "step": 3069 + }, + { + "epoch": 1.3243476385594133, + "grad_norm": 0.19501759111881256, + "learning_rate": 8.968197904296933e-05, + "loss": 2.4235, + "step": 3070 + }, + { + "epoch": 1.3247789519085615, + "grad_norm": 0.17075209319591522, + "learning_rate": 8.964850987370182e-05, + "loss": 2.266, + "step": 3071 + }, + { + "epoch": 1.3252102652577098, + "grad_norm": 0.17689673602581024, + "learning_rate": 8.961503767130018e-05, + "loss": 2.1537, + "step": 3072 + }, + { + "epoch": 1.3256415786068578, + "grad_norm": 0.1606774777173996, + "learning_rate": 8.958156244269514e-05, + "loss": 2.1582, + "step": 3073 + }, + { + "epoch": 1.326072891956006, + "grad_norm": 0.17741374671459198, + "learning_rate": 8.954808419481815e-05, + "loss": 2.3015, + "step": 3074 + }, + { + "epoch": 1.3265042053051541, + "grad_norm": 0.19335578382015228, + "learning_rate": 8.951460293460124e-05, + "loss": 2.2417, + "step": 3075 + }, + { + "epoch": 1.3265042053051541, + "eval_loss": 2.097837209701538, + "eval_runtime": 202.0834, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 3075 + }, + { + "epoch": 1.3269355186543024, + "grad_norm": 0.17709966003894806, + "learning_rate": 8.948111866897701e-05, + "loss": 2.171, + "step": 3076 + }, + { + "epoch": 1.3273668320034506, + "grad_norm": 0.17744885385036469, + "learning_rate": 8.944763140487885e-05, + "loss": 2.234, + "step": 3077 + }, + { + "epoch": 1.3277981453525987, + "grad_norm": 0.1813969761133194, + "learning_rate": 8.941414114924059e-05, + "loss": 2.2137, + "step": 3078 + }, + { + "epoch": 1.3282294587017467, + "grad_norm": 0.17423033714294434, + "learning_rate": 8.93806479089968e-05, + "loss": 1.9412, + "step": 3079 + }, + { + "epoch": 1.328660772050895, + "grad_norm": 0.17788293957710266, + "learning_rate": 8.934715169108257e-05, + "loss": 2.3469, + "step": 3080 + }, + { + "epoch": 1.3290920854000432, + "grad_norm": 0.1929638832807541, + "learning_rate": 8.931365250243368e-05, + "loss": 2.1577, + "step": 3081 + }, + { + "epoch": 1.3295233987491912, + "grad_norm": 0.18380503356456757, + "learning_rate": 8.928015034998653e-05, + "loss": 2.2365, + "step": 3082 + }, + { + "epoch": 1.3299547120983395, + "grad_norm": 0.1732213944196701, + "learning_rate": 8.924664524067807e-05, + "loss": 2.3558, + "step": 3083 + }, + { + "epoch": 1.3303860254474875, + "grad_norm": 0.16939488053321838, + "learning_rate": 8.921313718144592e-05, + "loss": 2.079, + "step": 3084 + }, + { + "epoch": 1.3308173387966358, + "grad_norm": 0.17569899559020996, + "learning_rate": 8.917962617922827e-05, + "loss": 2.0108, + "step": 3085 + }, + { + "epoch": 1.331248652145784, + "grad_norm": 0.16894367337226868, + "learning_rate": 8.914611224096398e-05, + "loss": 2.0602, + "step": 3086 + }, + { + "epoch": 1.331679965494932, + "grad_norm": 0.17975656688213348, + "learning_rate": 8.911259537359244e-05, + "loss": 2.2368, + "step": 3087 + }, + { + "epoch": 1.33211127884408, + "grad_norm": 0.17068015038967133, + "learning_rate": 8.90790755840537e-05, + "loss": 2.2709, + "step": 3088 + }, + { + "epoch": 1.3325425921932283, + "grad_norm": 0.17485836148262024, + "learning_rate": 8.904555287928838e-05, + "loss": 2.1873, + "step": 3089 + }, + { + "epoch": 1.3329739055423766, + "grad_norm": 0.1645931601524353, + "learning_rate": 8.901202726623775e-05, + "loss": 2.1562, + "step": 3090 + }, + { + "epoch": 1.3334052188915246, + "grad_norm": 0.16713939607143402, + "learning_rate": 8.897849875184363e-05, + "loss": 2.0063, + "step": 3091 + }, + { + "epoch": 1.333836532240673, + "grad_norm": 0.16838465631008148, + "learning_rate": 8.894496734304849e-05, + "loss": 2.1059, + "step": 3092 + }, + { + "epoch": 1.334267845589821, + "grad_norm": 0.17659622430801392, + "learning_rate": 8.891143304679534e-05, + "loss": 2.4106, + "step": 3093 + }, + { + "epoch": 1.3346991589389692, + "grad_norm": 0.19652199745178223, + "learning_rate": 8.887789587002787e-05, + "loss": 2.3738, + "step": 3094 + }, + { + "epoch": 1.3351304722881174, + "grad_norm": 0.196419358253479, + "learning_rate": 8.884435581969029e-05, + "loss": 2.0439, + "step": 3095 + }, + { + "epoch": 1.3355617856372655, + "grad_norm": 0.1740923374891281, + "learning_rate": 8.881081290272742e-05, + "loss": 2.1699, + "step": 3096 + }, + { + "epoch": 1.3359930989864135, + "grad_norm": 0.18634657561779022, + "learning_rate": 8.877726712608473e-05, + "loss": 2.2341, + "step": 3097 + }, + { + "epoch": 1.3364244123355618, + "grad_norm": 0.18123063445091248, + "learning_rate": 8.874371849670819e-05, + "loss": 2.2009, + "step": 3098 + }, + { + "epoch": 1.33685572568471, + "grad_norm": 0.16630500555038452, + "learning_rate": 8.871016702154445e-05, + "loss": 2.2254, + "step": 3099 + }, + { + "epoch": 1.337287039033858, + "grad_norm": 0.2106820046901703, + "learning_rate": 8.86766127075407e-05, + "loss": 2.1707, + "step": 3100 + }, + { + "epoch": 1.337287039033858, + "eval_loss": 2.0972704887390137, + "eval_runtime": 203.8969, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 3100 + }, + { + "epoch": 1.3377183523830063, + "grad_norm": 0.18919944763183594, + "learning_rate": 8.864305556164473e-05, + "loss": 2.1498, + "step": 3101 + }, + { + "epoch": 1.3381496657321543, + "grad_norm": 0.16261659562587738, + "learning_rate": 8.860949559080488e-05, + "loss": 2.0927, + "step": 3102 + }, + { + "epoch": 1.3385809790813026, + "grad_norm": 0.1766456514596939, + "learning_rate": 8.857593280197016e-05, + "loss": 2.1982, + "step": 3103 + }, + { + "epoch": 1.3390122924304508, + "grad_norm": 0.18764159083366394, + "learning_rate": 8.85423672020901e-05, + "loss": 2.1275, + "step": 3104 + }, + { + "epoch": 1.3394436057795989, + "grad_norm": 0.18761979043483734, + "learning_rate": 8.85087987981148e-05, + "loss": 2.3064, + "step": 3105 + }, + { + "epoch": 1.339874919128747, + "grad_norm": 0.17793145775794983, + "learning_rate": 8.847522759699496e-05, + "loss": 2.148, + "step": 3106 + }, + { + "epoch": 1.3403062324778952, + "grad_norm": 0.17504742741584778, + "learning_rate": 8.84416536056819e-05, + "loss": 2.2292, + "step": 3107 + }, + { + "epoch": 1.3407375458270434, + "grad_norm": 0.16357029974460602, + "learning_rate": 8.840807683112748e-05, + "loss": 2.0971, + "step": 3108 + }, + { + "epoch": 1.3411688591761914, + "grad_norm": 0.40526726841926575, + "learning_rate": 8.83744972802841e-05, + "loss": 2.1609, + "step": 3109 + }, + { + "epoch": 1.3416001725253397, + "grad_norm": 0.15176741778850555, + "learning_rate": 8.834091496010482e-05, + "loss": 1.9961, + "step": 3110 + }, + { + "epoch": 1.3420314858744877, + "grad_norm": 0.19039228558540344, + "learning_rate": 8.830732987754319e-05, + "loss": 1.9417, + "step": 3111 + }, + { + "epoch": 1.342462799223636, + "grad_norm": 0.1844143122434616, + "learning_rate": 8.827374203955338e-05, + "loss": 2.1261, + "step": 3112 + }, + { + "epoch": 1.3428941125727842, + "grad_norm": 0.176142156124115, + "learning_rate": 8.824015145309014e-05, + "loss": 2.254, + "step": 3113 + }, + { + "epoch": 1.3433254259219323, + "grad_norm": 0.17060662806034088, + "learning_rate": 8.820655812510874e-05, + "loss": 2.3113, + "step": 3114 + }, + { + "epoch": 1.3437567392710803, + "grad_norm": 0.16808998584747314, + "learning_rate": 8.817296206256504e-05, + "loss": 2.1092, + "step": 3115 + }, + { + "epoch": 1.3441880526202286, + "grad_norm": 0.1780100166797638, + "learning_rate": 8.813936327241549e-05, + "loss": 2.092, + "step": 3116 + }, + { + "epoch": 1.3446193659693768, + "grad_norm": 0.18709628283977509, + "learning_rate": 8.81057617616171e-05, + "loss": 2.2783, + "step": 3117 + }, + { + "epoch": 1.3450506793185248, + "grad_norm": 0.1694032847881317, + "learning_rate": 8.80721575371274e-05, + "loss": 2.0234, + "step": 3118 + }, + { + "epoch": 1.345481992667673, + "grad_norm": 0.18214310705661774, + "learning_rate": 8.803855060590455e-05, + "loss": 2.1824, + "step": 3119 + }, + { + "epoch": 1.3459133060168211, + "grad_norm": 0.19179998338222504, + "learning_rate": 8.80049409749072e-05, + "loss": 2.4183, + "step": 3120 + }, + { + "epoch": 1.3463446193659694, + "grad_norm": 0.18185855448246002, + "learning_rate": 8.79713286510946e-05, + "loss": 2.3265, + "step": 3121 + }, + { + "epoch": 1.3467759327151176, + "grad_norm": 0.18518030643463135, + "learning_rate": 8.793771364142653e-05, + "loss": 2.2387, + "step": 3122 + }, + { + "epoch": 1.3472072460642657, + "grad_norm": 0.17802715301513672, + "learning_rate": 8.790409595286339e-05, + "loss": 2.3093, + "step": 3123 + }, + { + "epoch": 1.347638559413414, + "grad_norm": 0.1845805048942566, + "learning_rate": 8.787047559236606e-05, + "loss": 2.0402, + "step": 3124 + }, + { + "epoch": 1.348069872762562, + "grad_norm": 0.20575165748596191, + "learning_rate": 8.7836852566896e-05, + "loss": 2.1048, + "step": 3125 + }, + { + "epoch": 1.348069872762562, + "eval_loss": 2.0977463722229004, + "eval_runtime": 201.6906, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 3125 + }, + { + "epoch": 1.3485011861117102, + "grad_norm": 0.17934945225715637, + "learning_rate": 8.780322688341525e-05, + "loss": 2.1051, + "step": 3126 + }, + { + "epoch": 1.3489324994608582, + "grad_norm": 0.18327248096466064, + "learning_rate": 8.776959854888636e-05, + "loss": 2.3397, + "step": 3127 + }, + { + "epoch": 1.3493638128100065, + "grad_norm": 0.18778805434703827, + "learning_rate": 8.773596757027244e-05, + "loss": 2.0872, + "step": 3128 + }, + { + "epoch": 1.3497951261591545, + "grad_norm": 0.16798192262649536, + "learning_rate": 8.770233395453714e-05, + "loss": 2.0782, + "step": 3129 + }, + { + "epoch": 1.3502264395083028, + "grad_norm": 0.18652935326099396, + "learning_rate": 8.766869770864471e-05, + "loss": 2.1258, + "step": 3130 + }, + { + "epoch": 1.350657752857451, + "grad_norm": 0.1795220524072647, + "learning_rate": 8.763505883955986e-05, + "loss": 2.4144, + "step": 3131 + }, + { + "epoch": 1.351089066206599, + "grad_norm": 0.1760927140712738, + "learning_rate": 8.760141735424793e-05, + "loss": 2.3392, + "step": 3132 + }, + { + "epoch": 1.3515203795557473, + "grad_norm": 0.17478467524051666, + "learning_rate": 8.75677732596747e-05, + "loss": 2.1875, + "step": 3133 + }, + { + "epoch": 1.3519516929048954, + "grad_norm": 0.1812528520822525, + "learning_rate": 8.753412656280659e-05, + "loss": 2.2426, + "step": 3134 + }, + { + "epoch": 1.3523830062540436, + "grad_norm": 0.18718136847019196, + "learning_rate": 8.75004772706105e-05, + "loss": 2.2056, + "step": 3135 + }, + { + "epoch": 1.3528143196031917, + "grad_norm": 0.3149339556694031, + "learning_rate": 8.746682539005389e-05, + "loss": 2.1533, + "step": 3136 + }, + { + "epoch": 1.35324563295234, + "grad_norm": 0.1598491221666336, + "learning_rate": 8.743317092810475e-05, + "loss": 2.0824, + "step": 3137 + }, + { + "epoch": 1.353676946301488, + "grad_norm": 0.16594962775707245, + "learning_rate": 8.739951389173159e-05, + "loss": 2.0977, + "step": 3138 + }, + { + "epoch": 1.3541082596506362, + "grad_norm": 0.1650681346654892, + "learning_rate": 8.736585428790348e-05, + "loss": 2.1672, + "step": 3139 + }, + { + "epoch": 1.3545395729997844, + "grad_norm": 0.17856769263744354, + "learning_rate": 8.733219212359004e-05, + "loss": 2.1519, + "step": 3140 + }, + { + "epoch": 1.3549708863489325, + "grad_norm": 0.17163975536823273, + "learning_rate": 8.729852740576132e-05, + "loss": 2.1415, + "step": 3141 + }, + { + "epoch": 1.3554021996980807, + "grad_norm": 0.1770739108324051, + "learning_rate": 8.726486014138803e-05, + "loss": 2.0446, + "step": 3142 + }, + { + "epoch": 1.3558335130472288, + "grad_norm": 0.17098559439182281, + "learning_rate": 8.723119033744133e-05, + "loss": 2.2327, + "step": 3143 + }, + { + "epoch": 1.356264826396377, + "grad_norm": 0.1768409013748169, + "learning_rate": 8.71975180008929e-05, + "loss": 2.1411, + "step": 3144 + }, + { + "epoch": 1.356696139745525, + "grad_norm": 0.18088935315608978, + "learning_rate": 8.716384313871495e-05, + "loss": 2.1351, + "step": 3145 + }, + { + "epoch": 1.3571274530946733, + "grad_norm": 0.1696568876504898, + "learning_rate": 8.71301657578803e-05, + "loss": 2.0054, + "step": 3146 + }, + { + "epoch": 1.3575587664438213, + "grad_norm": 0.19234417378902435, + "learning_rate": 8.709648586536214e-05, + "loss": 2.1927, + "step": 3147 + }, + { + "epoch": 1.3579900797929696, + "grad_norm": 0.1776120811700821, + "learning_rate": 8.706280346813434e-05, + "loss": 2.0688, + "step": 3148 + }, + { + "epoch": 1.3584213931421179, + "grad_norm": 0.7187220454216003, + "learning_rate": 8.702911857317112e-05, + "loss": 2.2263, + "step": 3149 + }, + { + "epoch": 1.3588527064912659, + "grad_norm": 0.1841685175895691, + "learning_rate": 8.699543118744739e-05, + "loss": 2.2218, + "step": 3150 + }, + { + "epoch": 1.3588527064912659, + "eval_loss": 2.0971086025238037, + "eval_runtime": 202.3868, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 3150 + }, + { + "epoch": 1.3592840198404141, + "grad_norm": 0.16733163595199585, + "learning_rate": 8.696174131793843e-05, + "loss": 2.1474, + "step": 3151 + }, + { + "epoch": 1.3597153331895622, + "grad_norm": 0.17958466708660126, + "learning_rate": 8.692804897162011e-05, + "loss": 2.3287, + "step": 3152 + }, + { + "epoch": 1.3601466465387104, + "grad_norm": 0.18487928807735443, + "learning_rate": 8.689435415546883e-05, + "loss": 2.2847, + "step": 3153 + }, + { + "epoch": 1.3605779598878585, + "grad_norm": 0.1870775669813156, + "learning_rate": 8.686065687646145e-05, + "loss": 2.1921, + "step": 3154 + }, + { + "epoch": 1.3610092732370067, + "grad_norm": 0.17926892638206482, + "learning_rate": 8.682695714157533e-05, + "loss": 2.0102, + "step": 3155 + }, + { + "epoch": 1.3614405865861547, + "grad_norm": 0.1682741492986679, + "learning_rate": 8.679325495778842e-05, + "loss": 2.0879, + "step": 3156 + }, + { + "epoch": 1.361871899935303, + "grad_norm": 0.18406817317008972, + "learning_rate": 8.67595503320791e-05, + "loss": 2.1946, + "step": 3157 + }, + { + "epoch": 1.3623032132844513, + "grad_norm": 0.18852907419204712, + "learning_rate": 8.672584327142627e-05, + "loss": 2.285, + "step": 3158 + }, + { + "epoch": 1.3627345266335993, + "grad_norm": 0.1969786286354065, + "learning_rate": 8.669213378280938e-05, + "loss": 2.4023, + "step": 3159 + }, + { + "epoch": 1.3631658399827475, + "grad_norm": 0.17748579382896423, + "learning_rate": 8.66584218732083e-05, + "loss": 2.1213, + "step": 3160 + }, + { + "epoch": 1.3635971533318956, + "grad_norm": 0.16388718783855438, + "learning_rate": 8.662470754960349e-05, + "loss": 1.9148, + "step": 3161 + }, + { + "epoch": 1.3640284666810438, + "grad_norm": 0.27124521136283875, + "learning_rate": 8.659099081897587e-05, + "loss": 2.1991, + "step": 3162 + }, + { + "epoch": 1.3644597800301919, + "grad_norm": 0.18605460226535797, + "learning_rate": 8.655727168830681e-05, + "loss": 2.2165, + "step": 3163 + }, + { + "epoch": 1.3648910933793401, + "grad_norm": 0.17285169661045074, + "learning_rate": 8.652355016457831e-05, + "loss": 2.1461, + "step": 3164 + }, + { + "epoch": 1.3653224067284881, + "grad_norm": 0.18387822806835175, + "learning_rate": 8.648982625477268e-05, + "loss": 2.1026, + "step": 3165 + }, + { + "epoch": 1.3657537200776364, + "grad_norm": 0.17600493133068085, + "learning_rate": 8.64560999658729e-05, + "loss": 2.1583, + "step": 3166 + }, + { + "epoch": 1.3661850334267847, + "grad_norm": 0.1766958236694336, + "learning_rate": 8.642237130486234e-05, + "loss": 2.1425, + "step": 3167 + }, + { + "epoch": 1.3666163467759327, + "grad_norm": 1.2923835515975952, + "learning_rate": 8.638864027872487e-05, + "loss": 2.142, + "step": 3168 + }, + { + "epoch": 1.367047660125081, + "grad_norm": 0.18593095242977142, + "learning_rate": 8.63549068944449e-05, + "loss": 2.2189, + "step": 3169 + }, + { + "epoch": 1.367478973474229, + "grad_norm": 0.19588585197925568, + "learning_rate": 8.632117115900728e-05, + "loss": 2.2355, + "step": 3170 + }, + { + "epoch": 1.3679102868233772, + "grad_norm": 0.1888759285211563, + "learning_rate": 8.628743307939737e-05, + "loss": 2.1629, + "step": 3171 + }, + { + "epoch": 1.3683416001725253, + "grad_norm": 0.17903682589530945, + "learning_rate": 8.6253692662601e-05, + "loss": 2.0798, + "step": 3172 + }, + { + "epoch": 1.3687729135216735, + "grad_norm": 0.18080520629882812, + "learning_rate": 8.621994991560449e-05, + "loss": 2.2332, + "step": 3173 + }, + { + "epoch": 1.3692042268708216, + "grad_norm": 0.20339682698249817, + "learning_rate": 8.618620484539467e-05, + "loss": 2.1999, + "step": 3174 + }, + { + "epoch": 1.3696355402199698, + "grad_norm": 0.1827765554189682, + "learning_rate": 8.615245745895877e-05, + "loss": 2.0979, + "step": 3175 + }, + { + "epoch": 1.3696355402199698, + "eval_loss": 2.0981106758117676, + "eval_runtime": 202.6773, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 3175 + }, + { + "epoch": 1.370066853569118, + "grad_norm": 0.19208596646785736, + "learning_rate": 8.611870776328463e-05, + "loss": 2.1504, + "step": 3176 + }, + { + "epoch": 1.370498166918266, + "grad_norm": 0.17461636662483215, + "learning_rate": 8.608495576536042e-05, + "loss": 1.9311, + "step": 3177 + }, + { + "epoch": 1.3709294802674143, + "grad_norm": 0.17411647737026215, + "learning_rate": 8.605120147217493e-05, + "loss": 2.305, + "step": 3178 + }, + { + "epoch": 1.3713607936165624, + "grad_norm": 0.1835533082485199, + "learning_rate": 8.601744489071731e-05, + "loss": 2.1118, + "step": 3179 + }, + { + "epoch": 1.3717921069657106, + "grad_norm": 0.17653480172157288, + "learning_rate": 8.598368602797723e-05, + "loss": 2.1825, + "step": 3180 + }, + { + "epoch": 1.3722234203148587, + "grad_norm": 0.17974716424942017, + "learning_rate": 8.594992489094486e-05, + "loss": 2.2029, + "step": 3181 + }, + { + "epoch": 1.372654733664007, + "grad_norm": 0.1939789205789566, + "learning_rate": 8.591616148661078e-05, + "loss": 2.2997, + "step": 3182 + }, + { + "epoch": 1.373086047013155, + "grad_norm": 0.19809889793395996, + "learning_rate": 8.58823958219661e-05, + "loss": 2.1236, + "step": 3183 + }, + { + "epoch": 1.3735173603623032, + "grad_norm": 0.19726596772670746, + "learning_rate": 8.584862790400232e-05, + "loss": 2.1344, + "step": 3184 + }, + { + "epoch": 1.3739486737114515, + "grad_norm": 0.18464700877666473, + "learning_rate": 8.581485773971153e-05, + "loss": 2.198, + "step": 3185 + }, + { + "epoch": 1.3743799870605995, + "grad_norm": 0.18575282394886017, + "learning_rate": 8.578108533608618e-05, + "loss": 2.3213, + "step": 3186 + }, + { + "epoch": 1.3748113004097477, + "grad_norm": 0.1865396797657013, + "learning_rate": 8.57473107001192e-05, + "loss": 2.3166, + "step": 3187 + }, + { + "epoch": 1.3752426137588958, + "grad_norm": 0.1792069673538208, + "learning_rate": 8.571353383880401e-05, + "loss": 2.152, + "step": 3188 + }, + { + "epoch": 1.375673927108044, + "grad_norm": 0.16752682626247406, + "learning_rate": 8.567975475913448e-05, + "loss": 2.3021, + "step": 3189 + }, + { + "epoch": 1.376105240457192, + "grad_norm": 0.1645505130290985, + "learning_rate": 8.564597346810492e-05, + "loss": 1.9309, + "step": 3190 + }, + { + "epoch": 1.3765365538063403, + "grad_norm": 0.18013405799865723, + "learning_rate": 8.561218997271014e-05, + "loss": 2.1577, + "step": 3191 + }, + { + "epoch": 1.3769678671554884, + "grad_norm": 0.1741134524345398, + "learning_rate": 8.557840427994536e-05, + "loss": 2.1166, + "step": 3192 + }, + { + "epoch": 1.3773991805046366, + "grad_norm": 0.184743270277977, + "learning_rate": 8.554461639680632e-05, + "loss": 2.2432, + "step": 3193 + }, + { + "epoch": 1.3778304938537849, + "grad_norm": 0.19026319682598114, + "learning_rate": 8.551082633028913e-05, + "loss": 2.2845, + "step": 3194 + }, + { + "epoch": 1.378261807202933, + "grad_norm": 0.1513228416442871, + "learning_rate": 8.547703408739042e-05, + "loss": 2.0565, + "step": 3195 + }, + { + "epoch": 1.3786931205520812, + "grad_norm": 0.18880301713943481, + "learning_rate": 8.544323967510718e-05, + "loss": 2.3363, + "step": 3196 + }, + { + "epoch": 1.3791244339012292, + "grad_norm": 0.18461208045482635, + "learning_rate": 8.540944310043702e-05, + "loss": 2.3256, + "step": 3197 + }, + { + "epoch": 1.3795557472503774, + "grad_norm": 0.18815840780735016, + "learning_rate": 8.537564437037778e-05, + "loss": 2.179, + "step": 3198 + }, + { + "epoch": 1.3799870605995255, + "grad_norm": 0.1892249882221222, + "learning_rate": 8.534184349192792e-05, + "loss": 2.2513, + "step": 3199 + }, + { + "epoch": 1.3804183739486737, + "grad_norm": 0.17379261553287506, + "learning_rate": 8.530804047208627e-05, + "loss": 1.7714, + "step": 3200 + }, + { + "epoch": 1.3804183739486737, + "eval_loss": 2.0976791381835938, + "eval_runtime": 196.5304, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 3200 + }, + { + "epoch": 1.3808496872978218, + "grad_norm": 0.17144569754600525, + "learning_rate": 8.527423531785213e-05, + "loss": 1.9854, + "step": 3201 + }, + { + "epoch": 1.38128100064697, + "grad_norm": 0.184304341673851, + "learning_rate": 8.524042803622519e-05, + "loss": 2.1147, + "step": 3202 + }, + { + "epoch": 1.3817123139961183, + "grad_norm": 0.18346154689788818, + "learning_rate": 8.520661863420565e-05, + "loss": 1.8588, + "step": 3203 + }, + { + "epoch": 1.3821436273452663, + "grad_norm": 0.17854824662208557, + "learning_rate": 8.51728071187941e-05, + "loss": 2.1437, + "step": 3204 + }, + { + "epoch": 1.3825749406944146, + "grad_norm": 0.23320263624191284, + "learning_rate": 8.513899349699158e-05, + "loss": 2.0817, + "step": 3205 + }, + { + "epoch": 1.3830062540435626, + "grad_norm": 0.19302494823932648, + "learning_rate": 8.51051777757996e-05, + "loss": 2.2875, + "step": 3206 + }, + { + "epoch": 1.3834375673927108, + "grad_norm": 0.1735406219959259, + "learning_rate": 8.507135996222002e-05, + "loss": 2.0925, + "step": 3207 + }, + { + "epoch": 1.383868880741859, + "grad_norm": 0.1883225440979004, + "learning_rate": 8.503754006325524e-05, + "loss": 2.1741, + "step": 3208 + }, + { + "epoch": 1.3843001940910071, + "grad_norm": 0.20691944658756256, + "learning_rate": 8.500371808590802e-05, + "loss": 2.2111, + "step": 3209 + }, + { + "epoch": 1.3847315074401552, + "grad_norm": 0.22449901700019836, + "learning_rate": 8.496989403718159e-05, + "loss": 1.9673, + "step": 3210 + }, + { + "epoch": 1.3851628207893034, + "grad_norm": 0.19177977740764618, + "learning_rate": 8.493606792407954e-05, + "loss": 2.0311, + "step": 3211 + }, + { + "epoch": 1.3855941341384517, + "grad_norm": 0.1945643126964569, + "learning_rate": 8.4902239753606e-05, + "loss": 2.1568, + "step": 3212 + }, + { + "epoch": 1.3860254474875997, + "grad_norm": 1.4558762311935425, + "learning_rate": 8.486840953276541e-05, + "loss": 2.2195, + "step": 3213 + }, + { + "epoch": 1.386456760836748, + "grad_norm": 0.21869879961013794, + "learning_rate": 8.483457726856268e-05, + "loss": 2.2545, + "step": 3214 + }, + { + "epoch": 1.386888074185896, + "grad_norm": 0.1923229843378067, + "learning_rate": 8.480074296800322e-05, + "loss": 2.0498, + "step": 3215 + }, + { + "epoch": 1.3873193875350442, + "grad_norm": 0.18915557861328125, + "learning_rate": 8.476690663809273e-05, + "loss": 2.1517, + "step": 3216 + }, + { + "epoch": 1.3877507008841925, + "grad_norm": 0.16886180639266968, + "learning_rate": 8.473306828583742e-05, + "loss": 2.1885, + "step": 3217 + }, + { + "epoch": 1.3881820142333405, + "grad_norm": 0.18701143562793732, + "learning_rate": 8.469922791824387e-05, + "loss": 2.1815, + "step": 3218 + }, + { + "epoch": 1.3886133275824886, + "grad_norm": 0.17433500289916992, + "learning_rate": 8.466538554231913e-05, + "loss": 2.1176, + "step": 3219 + }, + { + "epoch": 1.3890446409316368, + "grad_norm": 0.18689289689064026, + "learning_rate": 8.463154116507061e-05, + "loss": 1.9017, + "step": 3220 + }, + { + "epoch": 1.389475954280785, + "grad_norm": 0.18619681894779205, + "learning_rate": 8.459769479350614e-05, + "loss": 2.2011, + "step": 3221 + }, + { + "epoch": 1.389907267629933, + "grad_norm": 0.19147801399230957, + "learning_rate": 8.456384643463404e-05, + "loss": 2.2078, + "step": 3222 + }, + { + "epoch": 1.3903385809790814, + "grad_norm": 0.18116207420825958, + "learning_rate": 8.452999609546291e-05, + "loss": 2.1176, + "step": 3223 + }, + { + "epoch": 1.3907698943282294, + "grad_norm": 0.17653116583824158, + "learning_rate": 8.449614378300189e-05, + "loss": 2.2895, + "step": 3224 + }, + { + "epoch": 1.3912012076773776, + "grad_norm": 0.18579091131687164, + "learning_rate": 8.446228950426045e-05, + "loss": 2.1845, + "step": 3225 + }, + { + "epoch": 1.3912012076773776, + "eval_loss": 2.097705364227295, + "eval_runtime": 194.6266, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3225 + }, + { + "epoch": 1.391632521026526, + "grad_norm": 0.18824826180934906, + "learning_rate": 8.44284332662485e-05, + "loss": 2.1208, + "step": 3226 + }, + { + "epoch": 1.392063834375674, + "grad_norm": 0.19434987008571625, + "learning_rate": 8.439457507597633e-05, + "loss": 2.2117, + "step": 3227 + }, + { + "epoch": 1.392495147724822, + "grad_norm": 0.17461137473583221, + "learning_rate": 8.436071494045466e-05, + "loss": 2.0738, + "step": 3228 + }, + { + "epoch": 1.3929264610739702, + "grad_norm": 0.18477419018745422, + "learning_rate": 8.432685286669458e-05, + "loss": 2.208, + "step": 3229 + }, + { + "epoch": 1.3933577744231185, + "grad_norm": 0.17233210802078247, + "learning_rate": 8.42929888617076e-05, + "loss": 2.2569, + "step": 3230 + }, + { + "epoch": 1.3937890877722665, + "grad_norm": 0.20839418470859528, + "learning_rate": 8.425912293250567e-05, + "loss": 2.1946, + "step": 3231 + }, + { + "epoch": 1.3942204011214148, + "grad_norm": 0.18050678074359894, + "learning_rate": 8.422525508610109e-05, + "loss": 2.0627, + "step": 3232 + }, + { + "epoch": 1.3946517144705628, + "grad_norm": 0.19521097838878632, + "learning_rate": 8.419138532950655e-05, + "loss": 2.3172, + "step": 3233 + }, + { + "epoch": 1.395083027819711, + "grad_norm": 0.16553284227848053, + "learning_rate": 8.415751366973517e-05, + "loss": 2.004, + "step": 3234 + }, + { + "epoch": 1.3955143411688593, + "grad_norm": 0.18260253965854645, + "learning_rate": 8.412364011380042e-05, + "loss": 2.2002, + "step": 3235 + }, + { + "epoch": 1.3959456545180073, + "grad_norm": 0.17164035141468048, + "learning_rate": 8.408976466871623e-05, + "loss": 2.2397, + "step": 3236 + }, + { + "epoch": 1.3963769678671554, + "grad_norm": 0.2340926080942154, + "learning_rate": 8.405588734149684e-05, + "loss": 2.125, + "step": 3237 + }, + { + "epoch": 1.3968082812163036, + "grad_norm": 0.18080025911331177, + "learning_rate": 8.402200813915698e-05, + "loss": 2.1483, + "step": 3238 + }, + { + "epoch": 1.3972395945654519, + "grad_norm": 0.18892081081867218, + "learning_rate": 8.398812706871163e-05, + "loss": 2.2249, + "step": 3239 + }, + { + "epoch": 1.3976709079146, + "grad_norm": 0.2113853543996811, + "learning_rate": 8.395424413717628e-05, + "loss": 2.2859, + "step": 3240 + }, + { + "epoch": 1.3981022212637482, + "grad_norm": 0.18064041435718536, + "learning_rate": 8.39203593515668e-05, + "loss": 2.06, + "step": 3241 + }, + { + "epoch": 1.3985335346128962, + "grad_norm": 0.1873074322938919, + "learning_rate": 8.388647271889933e-05, + "loss": 2.2682, + "step": 3242 + }, + { + "epoch": 1.3989648479620445, + "grad_norm": 0.18234270811080933, + "learning_rate": 8.385258424619053e-05, + "loss": 2.1766, + "step": 3243 + }, + { + "epoch": 1.3993961613111927, + "grad_norm": 0.19495293498039246, + "learning_rate": 8.38186939404573e-05, + "loss": 2.4004, + "step": 3244 + }, + { + "epoch": 1.3998274746603407, + "grad_norm": 0.16893860697746277, + "learning_rate": 8.37848018087171e-05, + "loss": 2.0107, + "step": 3245 + }, + { + "epoch": 1.4002587880094888, + "grad_norm": 0.18764536082744598, + "learning_rate": 8.37509078579876e-05, + "loss": 2.2513, + "step": 3246 + }, + { + "epoch": 1.400690101358637, + "grad_norm": 0.1757303923368454, + "learning_rate": 8.371701209528691e-05, + "loss": 2.1406, + "step": 3247 + }, + { + "epoch": 1.4011214147077853, + "grad_norm": 0.1827249675989151, + "learning_rate": 8.368311452763358e-05, + "loss": 2.3251, + "step": 3248 + }, + { + "epoch": 1.4015527280569333, + "grad_norm": 0.17690004408359528, + "learning_rate": 8.364921516204641e-05, + "loss": 2.175, + "step": 3249 + }, + { + "epoch": 1.4019840414060816, + "grad_norm": 0.2701353132724762, + "learning_rate": 8.361531400554465e-05, + "loss": 2.0582, + "step": 3250 + }, + { + "epoch": 1.4019840414060816, + "eval_loss": 2.096698522567749, + "eval_runtime": 194.8302, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3250 + }, + { + "epoch": 1.4024153547552296, + "grad_norm": 0.19098925590515137, + "learning_rate": 8.358141106514789e-05, + "loss": 2.2413, + "step": 3251 + }, + { + "epoch": 1.4028466681043779, + "grad_norm": 0.19250550866127014, + "learning_rate": 8.354750634787618e-05, + "loss": 1.9567, + "step": 3252 + }, + { + "epoch": 1.4032779814535261, + "grad_norm": 0.19768285751342773, + "learning_rate": 8.351359986074975e-05, + "loss": 2.133, + "step": 3253 + }, + { + "epoch": 1.4037092948026741, + "grad_norm": 0.18588483333587646, + "learning_rate": 8.34796916107894e-05, + "loss": 2.3247, + "step": 3254 + }, + { + "epoch": 1.4041406081518222, + "grad_norm": 0.19821996986865997, + "learning_rate": 8.344578160501615e-05, + "loss": 2.224, + "step": 3255 + }, + { + "epoch": 1.4045719215009704, + "grad_norm": 0.18002888560295105, + "learning_rate": 8.341186985045148e-05, + "loss": 2.1396, + "step": 3256 + }, + { + "epoch": 1.4050032348501187, + "grad_norm": 0.18589261174201965, + "learning_rate": 8.337795635411715e-05, + "loss": 2.18, + "step": 3257 + }, + { + "epoch": 1.4054345481992667, + "grad_norm": 0.1814439445734024, + "learning_rate": 8.334404112303535e-05, + "loss": 2.1433, + "step": 3258 + }, + { + "epoch": 1.405865861548415, + "grad_norm": 0.179869145154953, + "learning_rate": 8.331012416422855e-05, + "loss": 2.1902, + "step": 3259 + }, + { + "epoch": 1.406297174897563, + "grad_norm": 0.19356581568717957, + "learning_rate": 8.327620548471969e-05, + "loss": 2.0897, + "step": 3260 + }, + { + "epoch": 1.4067284882467113, + "grad_norm": 0.19356249272823334, + "learning_rate": 8.324228509153199e-05, + "loss": 2.3182, + "step": 3261 + }, + { + "epoch": 1.4071598015958595, + "grad_norm": 0.1817573457956314, + "learning_rate": 8.320836299168901e-05, + "loss": 2.288, + "step": 3262 + }, + { + "epoch": 1.4075911149450075, + "grad_norm": 0.17940185964107513, + "learning_rate": 8.317443919221471e-05, + "loss": 2.054, + "step": 3263 + }, + { + "epoch": 1.4080224282941556, + "grad_norm": 0.1818537712097168, + "learning_rate": 8.314051370013338e-05, + "loss": 2.1608, + "step": 3264 + }, + { + "epoch": 1.4084537416433038, + "grad_norm": 0.19189421832561493, + "learning_rate": 8.310658652246967e-05, + "loss": 2.1291, + "step": 3265 + }, + { + "epoch": 1.408885054992452, + "grad_norm": 0.19296544790267944, + "learning_rate": 8.307265766624857e-05, + "loss": 2.2991, + "step": 3266 + }, + { + "epoch": 1.4093163683416001, + "grad_norm": 0.20070980489253998, + "learning_rate": 8.30387271384954e-05, + "loss": 2.2874, + "step": 3267 + }, + { + "epoch": 1.4097476816907484, + "grad_norm": 0.18332870304584503, + "learning_rate": 8.300479494623589e-05, + "loss": 2.2083, + "step": 3268 + }, + { + "epoch": 1.4101789950398964, + "grad_norm": 0.19442547857761383, + "learning_rate": 8.297086109649604e-05, + "loss": 2.1842, + "step": 3269 + }, + { + "epoch": 1.4106103083890447, + "grad_norm": 0.17771577835083008, + "learning_rate": 8.293692559630223e-05, + "loss": 2.188, + "step": 3270 + }, + { + "epoch": 1.411041621738193, + "grad_norm": 0.1854981929063797, + "learning_rate": 8.290298845268117e-05, + "loss": 2.2063, + "step": 3271 + }, + { + "epoch": 1.411472935087341, + "grad_norm": 0.1690223664045334, + "learning_rate": 8.286904967265994e-05, + "loss": 2.028, + "step": 3272 + }, + { + "epoch": 1.411904248436489, + "grad_norm": 0.18008871376514435, + "learning_rate": 8.283510926326592e-05, + "loss": 2.2517, + "step": 3273 + }, + { + "epoch": 1.4123355617856372, + "grad_norm": 0.17644241452217102, + "learning_rate": 8.280116723152681e-05, + "loss": 2.1543, + "step": 3274 + }, + { + "epoch": 1.4127668751347855, + "grad_norm": 0.18835556507110596, + "learning_rate": 8.276722358447078e-05, + "loss": 2.2019, + "step": 3275 + }, + { + "epoch": 1.4127668751347855, + "eval_loss": 2.0969247817993164, + "eval_runtime": 194.9691, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3275 + }, + { + "epoch": 1.4131981884839335, + "grad_norm": 0.21935436129570007, + "learning_rate": 8.273327832912611e-05, + "loss": 2.4498, + "step": 3276 + }, + { + "epoch": 1.4136295018330818, + "grad_norm": 0.18742847442626953, + "learning_rate": 8.26993314725216e-05, + "loss": 2.2589, + "step": 3277 + }, + { + "epoch": 1.4140608151822298, + "grad_norm": 0.19992081820964813, + "learning_rate": 8.266538302168632e-05, + "loss": 2.1686, + "step": 3278 + }, + { + "epoch": 1.414492128531378, + "grad_norm": 0.18198950588703156, + "learning_rate": 8.263143298364966e-05, + "loss": 2.1204, + "step": 3279 + }, + { + "epoch": 1.4149234418805263, + "grad_norm": 0.19453582167625427, + "learning_rate": 8.259748136544134e-05, + "loss": 2.3806, + "step": 3280 + }, + { + "epoch": 1.4153547552296744, + "grad_norm": 0.18559814989566803, + "learning_rate": 8.256352817409142e-05, + "loss": 2.2214, + "step": 3281 + }, + { + "epoch": 1.4157860685788224, + "grad_norm": 0.1632603406906128, + "learning_rate": 8.252957341663028e-05, + "loss": 2.2278, + "step": 3282 + }, + { + "epoch": 1.4162173819279706, + "grad_norm": 0.16929636895656586, + "learning_rate": 8.249561710008861e-05, + "loss": 2.2441, + "step": 3283 + }, + { + "epoch": 1.416648695277119, + "grad_norm": 0.17531128227710724, + "learning_rate": 8.246165923149748e-05, + "loss": 2.1605, + "step": 3284 + }, + { + "epoch": 1.417080008626267, + "grad_norm": 0.18277880549430847, + "learning_rate": 8.242769981788817e-05, + "loss": 2.1051, + "step": 3285 + }, + { + "epoch": 1.4175113219754152, + "grad_norm": 0.1829131543636322, + "learning_rate": 8.239373886629242e-05, + "loss": 2.048, + "step": 3286 + }, + { + "epoch": 1.4179426353245632, + "grad_norm": 0.18608812987804413, + "learning_rate": 8.235977638374216e-05, + "loss": 2.3122, + "step": 3287 + }, + { + "epoch": 1.4183739486737115, + "grad_norm": 0.17488327622413635, + "learning_rate": 8.232581237726971e-05, + "loss": 2.095, + "step": 3288 + }, + { + "epoch": 1.4188052620228597, + "grad_norm": 0.17854848504066467, + "learning_rate": 8.229184685390771e-05, + "loss": 2.2336, + "step": 3289 + }, + { + "epoch": 1.4192365753720078, + "grad_norm": 0.18223245441913605, + "learning_rate": 8.225787982068904e-05, + "loss": 2.0173, + "step": 3290 + }, + { + "epoch": 1.4196678887211558, + "grad_norm": 0.1716955453157425, + "learning_rate": 8.222391128464704e-05, + "loss": 2.0861, + "step": 3291 + }, + { + "epoch": 1.420099202070304, + "grad_norm": 0.1973477154970169, + "learning_rate": 8.218994125281517e-05, + "loss": 2.3174, + "step": 3292 + }, + { + "epoch": 1.4205305154194523, + "grad_norm": 0.18362484872341156, + "learning_rate": 8.215596973222734e-05, + "loss": 2.1593, + "step": 3293 + }, + { + "epoch": 1.4209618287686003, + "grad_norm": 0.1910424381494522, + "learning_rate": 8.212199672991771e-05, + "loss": 2.2483, + "step": 3294 + }, + { + "epoch": 1.4213931421177486, + "grad_norm": 0.18225796520709991, + "learning_rate": 8.208802225292081e-05, + "loss": 2.1716, + "step": 3295 + }, + { + "epoch": 1.4218244554668966, + "grad_norm": 0.1668190360069275, + "learning_rate": 8.205404630827136e-05, + "loss": 2.1617, + "step": 3296 + }, + { + "epoch": 1.4222557688160449, + "grad_norm": 0.1746533215045929, + "learning_rate": 8.202006890300447e-05, + "loss": 2.2301, + "step": 3297 + }, + { + "epoch": 1.4226870821651931, + "grad_norm": 0.18129593133926392, + "learning_rate": 8.198609004415557e-05, + "loss": 2.1187, + "step": 3298 + }, + { + "epoch": 1.4231183955143412, + "grad_norm": 0.18112768232822418, + "learning_rate": 8.195210973876032e-05, + "loss": 2.08, + "step": 3299 + }, + { + "epoch": 1.4235497088634892, + "grad_norm": 0.18262645602226257, + "learning_rate": 8.191812799385473e-05, + "loss": 2.2791, + "step": 3300 + }, + { + "epoch": 1.4235497088634892, + "eval_loss": 2.096240282058716, + "eval_runtime": 194.9073, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3300 + }, + { + "epoch": 1.4239810222126374, + "grad_norm": 0.1792260706424713, + "learning_rate": 8.188414481647506e-05, + "loss": 2.2229, + "step": 3301 + }, + { + "epoch": 1.4244123355617857, + "grad_norm": 0.1991865336894989, + "learning_rate": 8.185016021365794e-05, + "loss": 2.2453, + "step": 3302 + }, + { + "epoch": 1.4248436489109337, + "grad_norm": 0.21256665885448456, + "learning_rate": 8.181617419244023e-05, + "loss": 1.7668, + "step": 3303 + }, + { + "epoch": 1.425274962260082, + "grad_norm": 0.17974048852920532, + "learning_rate": 8.178218675985912e-05, + "loss": 2.0113, + "step": 3304 + }, + { + "epoch": 1.42570627560923, + "grad_norm": 0.17181038856506348, + "learning_rate": 8.174819792295207e-05, + "loss": 2.0498, + "step": 3305 + }, + { + "epoch": 1.4261375889583783, + "grad_norm": 0.1864602416753769, + "learning_rate": 8.171420768875683e-05, + "loss": 2.2682, + "step": 3306 + }, + { + "epoch": 1.4265689023075265, + "grad_norm": 0.1754862517118454, + "learning_rate": 8.168021606431148e-05, + "loss": 2.2322, + "step": 3307 + }, + { + "epoch": 1.4270002156566746, + "grad_norm": 0.17984440922737122, + "learning_rate": 8.164622305665431e-05, + "loss": 2.0406, + "step": 3308 + }, + { + "epoch": 1.4274315290058226, + "grad_norm": 0.1859109103679657, + "learning_rate": 8.161222867282401e-05, + "loss": 1.9864, + "step": 3309 + }, + { + "epoch": 1.4278628423549709, + "grad_norm": 0.1888839453458786, + "learning_rate": 8.157823291985944e-05, + "loss": 2.2825, + "step": 3310 + }, + { + "epoch": 1.428294155704119, + "grad_norm": 0.18982118368148804, + "learning_rate": 8.15442358047998e-05, + "loss": 2.1656, + "step": 3311 + }, + { + "epoch": 1.4287254690532671, + "grad_norm": 0.18888141214847565, + "learning_rate": 8.151023733468458e-05, + "loss": 2.3547, + "step": 3312 + }, + { + "epoch": 1.4291567824024154, + "grad_norm": 0.1889471858739853, + "learning_rate": 8.147623751655351e-05, + "loss": 2.0428, + "step": 3313 + }, + { + "epoch": 1.4295880957515634, + "grad_norm": 0.1735839992761612, + "learning_rate": 8.144223635744666e-05, + "loss": 2.0846, + "step": 3314 + }, + { + "epoch": 1.4300194091007117, + "grad_norm": 0.17066586017608643, + "learning_rate": 8.14082338644043e-05, + "loss": 2.1956, + "step": 3315 + }, + { + "epoch": 1.43045072244986, + "grad_norm": 0.18497024476528168, + "learning_rate": 8.137423004446706e-05, + "loss": 2.1482, + "step": 3316 + }, + { + "epoch": 1.430882035799008, + "grad_norm": 0.19920971989631653, + "learning_rate": 8.134022490467577e-05, + "loss": 2.0942, + "step": 3317 + }, + { + "epoch": 1.431313349148156, + "grad_norm": 0.19529315829277039, + "learning_rate": 8.130621845207159e-05, + "loss": 1.8287, + "step": 3318 + }, + { + "epoch": 1.4317446624973043, + "grad_norm": 0.1871774047613144, + "learning_rate": 8.12722106936959e-05, + "loss": 2.2785, + "step": 3319 + }, + { + "epoch": 1.4321759758464525, + "grad_norm": 0.17630258202552795, + "learning_rate": 8.123820163659038e-05, + "loss": 2.0975, + "step": 3320 + }, + { + "epoch": 1.4326072891956005, + "grad_norm": 0.1827884465456009, + "learning_rate": 8.120419128779703e-05, + "loss": 2.1393, + "step": 3321 + }, + { + "epoch": 1.4330386025447488, + "grad_norm": 0.2376793920993805, + "learning_rate": 8.1170179654358e-05, + "loss": 2.2405, + "step": 3322 + }, + { + "epoch": 1.4334699158938968, + "grad_norm": 0.1873629242181778, + "learning_rate": 8.113616674331581e-05, + "loss": 2.2394, + "step": 3323 + }, + { + "epoch": 1.433901229243045, + "grad_norm": 0.17424309253692627, + "learning_rate": 8.110215256171318e-05, + "loss": 2.2561, + "step": 3324 + }, + { + "epoch": 1.4343325425921933, + "grad_norm": 0.20326147973537445, + "learning_rate": 8.106813711659313e-05, + "loss": 2.2324, + "step": 3325 + }, + { + "epoch": 1.4343325425921933, + "eval_loss": 2.095902919769287, + "eval_runtime": 194.7412, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3325 + }, + { + "epoch": 1.4347638559413414, + "grad_norm": 0.18723958730697632, + "learning_rate": 8.103412041499895e-05, + "loss": 2.294, + "step": 3326 + }, + { + "epoch": 1.4351951692904894, + "grad_norm": 0.18472136557102203, + "learning_rate": 8.100010246397413e-05, + "loss": 2.2449, + "step": 3327 + }, + { + "epoch": 1.4356264826396377, + "grad_norm": 0.18690557777881622, + "learning_rate": 8.09660832705625e-05, + "loss": 2.2492, + "step": 3328 + }, + { + "epoch": 1.436057795988786, + "grad_norm": 0.18417558073997498, + "learning_rate": 8.093206284180804e-05, + "loss": 2.0997, + "step": 3329 + }, + { + "epoch": 1.436489109337934, + "grad_norm": 0.19392870366573334, + "learning_rate": 8.089804118475515e-05, + "loss": 2.1467, + "step": 3330 + }, + { + "epoch": 1.4369204226870822, + "grad_norm": 0.18073543906211853, + "learning_rate": 8.08640183064483e-05, + "loss": 2.1225, + "step": 3331 + }, + { + "epoch": 1.4373517360362302, + "grad_norm": 0.18332725763320923, + "learning_rate": 8.082999421393233e-05, + "loss": 2.1037, + "step": 3332 + }, + { + "epoch": 1.4377830493853785, + "grad_norm": 0.4294225871562958, + "learning_rate": 8.079596891425231e-05, + "loss": 2.1481, + "step": 3333 + }, + { + "epoch": 1.4382143627345267, + "grad_norm": 0.17668937146663666, + "learning_rate": 8.076194241445356e-05, + "loss": 2.2179, + "step": 3334 + }, + { + "epoch": 1.4386456760836748, + "grad_norm": 0.17035427689552307, + "learning_rate": 8.07279147215816e-05, + "loss": 2.2046, + "step": 3335 + }, + { + "epoch": 1.439076989432823, + "grad_norm": 0.1593809276819229, + "learning_rate": 8.069388584268226e-05, + "loss": 2.1098, + "step": 3336 + }, + { + "epoch": 1.439508302781971, + "grad_norm": 2.597872734069824, + "learning_rate": 8.06598557848016e-05, + "loss": 2.3244, + "step": 3337 + }, + { + "epoch": 1.4399396161311193, + "grad_norm": 0.19738531112670898, + "learning_rate": 8.06258245549859e-05, + "loss": 2.2166, + "step": 3338 + }, + { + "epoch": 1.4403709294802673, + "grad_norm": 0.19736456871032715, + "learning_rate": 8.059179216028168e-05, + "loss": 2.2696, + "step": 3339 + }, + { + "epoch": 1.4408022428294156, + "grad_norm": 0.17271456122398376, + "learning_rate": 8.055775860773577e-05, + "loss": 1.9545, + "step": 3340 + }, + { + "epoch": 1.4412335561785636, + "grad_norm": 0.20499290525913239, + "learning_rate": 8.052372390439517e-05, + "loss": 2.2657, + "step": 3341 + }, + { + "epoch": 1.441664869527712, + "grad_norm": 0.20654812455177307, + "learning_rate": 8.048968805730712e-05, + "loss": 2.1996, + "step": 3342 + }, + { + "epoch": 1.4420961828768601, + "grad_norm": 0.1679326891899109, + "learning_rate": 8.045565107351909e-05, + "loss": 2.0968, + "step": 3343 + }, + { + "epoch": 1.4425274962260082, + "grad_norm": 0.1864512413740158, + "learning_rate": 8.04216129600789e-05, + "loss": 2.2755, + "step": 3344 + }, + { + "epoch": 1.4429588095751564, + "grad_norm": 0.18227176368236542, + "learning_rate": 8.038757372403441e-05, + "loss": 2.2069, + "step": 3345 + }, + { + "epoch": 1.4433901229243045, + "grad_norm": 0.1615314930677414, + "learning_rate": 8.035353337243389e-05, + "loss": 1.9317, + "step": 3346 + }, + { + "epoch": 1.4438214362734527, + "grad_norm": 0.18779605627059937, + "learning_rate": 8.031949191232572e-05, + "loss": 2.3717, + "step": 3347 + }, + { + "epoch": 1.4442527496226008, + "grad_norm": 0.17955152690410614, + "learning_rate": 8.02854493507586e-05, + "loss": 2.1175, + "step": 3348 + }, + { + "epoch": 1.444684062971749, + "grad_norm": 0.1738167554140091, + "learning_rate": 8.025140569478137e-05, + "loss": 2.1624, + "step": 3349 + }, + { + "epoch": 1.445115376320897, + "grad_norm": 0.18269379436969757, + "learning_rate": 8.021736095144318e-05, + "loss": 2.2026, + "step": 3350 + }, + { + "epoch": 1.445115376320897, + "eval_loss": 2.096182346343994, + "eval_runtime": 194.6214, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3350 + }, + { + "epoch": 1.4455466896700453, + "grad_norm": 0.18059170246124268, + "learning_rate": 8.018331512779334e-05, + "loss": 2.3293, + "step": 3351 + }, + { + "epoch": 1.4459780030191935, + "grad_norm": 0.21296890079975128, + "learning_rate": 8.01492682308814e-05, + "loss": 2.0689, + "step": 3352 + }, + { + "epoch": 1.4464093163683416, + "grad_norm": 0.1927291601896286, + "learning_rate": 8.01152202677572e-05, + "loss": 2.1997, + "step": 3353 + }, + { + "epoch": 1.4468406297174898, + "grad_norm": 0.1816396266222, + "learning_rate": 8.008117124547068e-05, + "loss": 2.0234, + "step": 3354 + }, + { + "epoch": 1.4472719430666379, + "grad_norm": 0.19203664362430573, + "learning_rate": 8.004712117107209e-05, + "loss": 2.2696, + "step": 3355 + }, + { + "epoch": 1.4477032564157861, + "grad_norm": 0.17730873823165894, + "learning_rate": 8.001307005161187e-05, + "loss": 2.2438, + "step": 3356 + }, + { + "epoch": 1.4481345697649342, + "grad_norm": 0.18266335129737854, + "learning_rate": 7.997901789414068e-05, + "loss": 2.119, + "step": 3357 + }, + { + "epoch": 1.4485658831140824, + "grad_norm": 0.17785349488258362, + "learning_rate": 7.994496470570938e-05, + "loss": 2.3753, + "step": 3358 + }, + { + "epoch": 1.4489971964632304, + "grad_norm": 0.17622007429599762, + "learning_rate": 7.991091049336904e-05, + "loss": 2.0939, + "step": 3359 + }, + { + "epoch": 1.4494285098123787, + "grad_norm": 0.1801026314496994, + "learning_rate": 7.987685526417101e-05, + "loss": 2.2183, + "step": 3360 + }, + { + "epoch": 1.449859823161527, + "grad_norm": 0.1778012067079544, + "learning_rate": 7.984279902516674e-05, + "loss": 2.2588, + "step": 3361 + }, + { + "epoch": 1.450291136510675, + "grad_norm": 0.17804737389087677, + "learning_rate": 7.980874178340799e-05, + "loss": 2.0134, + "step": 3362 + }, + { + "epoch": 1.4507224498598232, + "grad_norm": 3.638392925262451, + "learning_rate": 7.977468354594665e-05, + "loss": 2.1824, + "step": 3363 + }, + { + "epoch": 1.4511537632089713, + "grad_norm": 0.18926359713077545, + "learning_rate": 7.974062431983489e-05, + "loss": 2.3343, + "step": 3364 + }, + { + "epoch": 1.4515850765581195, + "grad_norm": 0.17392082512378693, + "learning_rate": 7.9706564112125e-05, + "loss": 1.9891, + "step": 3365 + }, + { + "epoch": 1.4520163899072676, + "grad_norm": 0.2056262493133545, + "learning_rate": 7.967250292986954e-05, + "loss": 2.204, + "step": 3366 + }, + { + "epoch": 1.4524477032564158, + "grad_norm": 0.1681697815656662, + "learning_rate": 7.963844078012128e-05, + "loss": 2.1408, + "step": 3367 + }, + { + "epoch": 1.4528790166055638, + "grad_norm": 0.18251508474349976, + "learning_rate": 7.960437766993309e-05, + "loss": 2.1993, + "step": 3368 + }, + { + "epoch": 1.453310329954712, + "grad_norm": 0.17431260645389557, + "learning_rate": 7.957031360635819e-05, + "loss": 2.0966, + "step": 3369 + }, + { + "epoch": 1.4537416433038604, + "grad_norm": 0.17686443030834198, + "learning_rate": 7.953624859644984e-05, + "loss": 2.05, + "step": 3370 + }, + { + "epoch": 1.4541729566530084, + "grad_norm": 0.16880741715431213, + "learning_rate": 7.950218264726164e-05, + "loss": 2.0137, + "step": 3371 + }, + { + "epoch": 1.4546042700021566, + "grad_norm": 0.1719847470521927, + "learning_rate": 7.946811576584729e-05, + "loss": 2.154, + "step": 3372 + }, + { + "epoch": 1.4550355833513047, + "grad_norm": 0.16456718742847443, + "learning_rate": 7.943404795926068e-05, + "loss": 2.3106, + "step": 3373 + }, + { + "epoch": 1.455466896700453, + "grad_norm": 0.17277392745018005, + "learning_rate": 7.939997923455598e-05, + "loss": 2.1343, + "step": 3374 + }, + { + "epoch": 1.455898210049601, + "grad_norm": 0.17675180733203888, + "learning_rate": 7.936590959878741e-05, + "loss": 2.0205, + "step": 3375 + }, + { + "epoch": 1.455898210049601, + "eval_loss": 2.0960140228271484, + "eval_runtime": 205.7204, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 3375 + }, + { + "epoch": 1.4563295233987492, + "grad_norm": 0.18122228980064392, + "learning_rate": 7.933183905900957e-05, + "loss": 2.1369, + "step": 3376 + }, + { + "epoch": 1.4567608367478972, + "grad_norm": 0.18892325460910797, + "learning_rate": 7.929776762227706e-05, + "loss": 2.1501, + "step": 3377 + }, + { + "epoch": 1.4571921500970455, + "grad_norm": 0.18425828218460083, + "learning_rate": 7.926369529564477e-05, + "loss": 2.0671, + "step": 3378 + }, + { + "epoch": 1.4576234634461938, + "grad_norm": 0.1792745143175125, + "learning_rate": 7.922962208616773e-05, + "loss": 2.175, + "step": 3379 + }, + { + "epoch": 1.4580547767953418, + "grad_norm": 0.17559273540973663, + "learning_rate": 7.91955480009012e-05, + "loss": 2.1323, + "step": 3380 + }, + { + "epoch": 1.45848609014449, + "grad_norm": 0.1783163994550705, + "learning_rate": 7.916147304690056e-05, + "loss": 2.1548, + "step": 3381 + }, + { + "epoch": 1.458917403493638, + "grad_norm": 0.15873871743679047, + "learning_rate": 7.912739723122142e-05, + "loss": 2.0485, + "step": 3382 + }, + { + "epoch": 1.4593487168427863, + "grad_norm": 0.14900639653205872, + "learning_rate": 7.909332056091955e-05, + "loss": 1.9628, + "step": 3383 + }, + { + "epoch": 1.4597800301919344, + "grad_norm": 0.17377440631389618, + "learning_rate": 7.905924304305088e-05, + "loss": 2.154, + "step": 3384 + }, + { + "epoch": 1.4602113435410826, + "grad_norm": 0.2151319533586502, + "learning_rate": 7.902516468467158e-05, + "loss": 2.2311, + "step": 3385 + }, + { + "epoch": 1.4606426568902307, + "grad_norm": 0.19471900165081024, + "learning_rate": 7.899108549283788e-05, + "loss": 2.1155, + "step": 3386 + }, + { + "epoch": 1.461073970239379, + "grad_norm": 0.17895857989788055, + "learning_rate": 7.89570054746063e-05, + "loss": 1.9349, + "step": 3387 + }, + { + "epoch": 1.4615052835885272, + "grad_norm": 0.17501680552959442, + "learning_rate": 7.892292463703346e-05, + "loss": 2.1263, + "step": 3388 + }, + { + "epoch": 1.4619365969376752, + "grad_norm": 0.186353400349617, + "learning_rate": 7.888884298717617e-05, + "loss": 2.199, + "step": 3389 + }, + { + "epoch": 1.4623679102868234, + "grad_norm": 0.17816871404647827, + "learning_rate": 7.885476053209143e-05, + "loss": 2.2223, + "step": 3390 + }, + { + "epoch": 1.4627992236359715, + "grad_norm": 0.16904309391975403, + "learning_rate": 7.882067727883633e-05, + "loss": 2.1566, + "step": 3391 + }, + { + "epoch": 1.4632305369851197, + "grad_norm": 0.16086207330226898, + "learning_rate": 7.878659323446826e-05, + "loss": 2.0205, + "step": 3392 + }, + { + "epoch": 1.4636618503342678, + "grad_norm": 0.19864903390407562, + "learning_rate": 7.875250840604461e-05, + "loss": 2.3275, + "step": 3393 + }, + { + "epoch": 1.464093163683416, + "grad_norm": 0.5341060757637024, + "learning_rate": 7.871842280062309e-05, + "loss": 2.1908, + "step": 3394 + }, + { + "epoch": 1.464524477032564, + "grad_norm": 0.1994878649711609, + "learning_rate": 7.868433642526145e-05, + "loss": 2.2238, + "step": 3395 + }, + { + "epoch": 1.4649557903817123, + "grad_norm": 0.181796133518219, + "learning_rate": 7.865024928701767e-05, + "loss": 2.1112, + "step": 3396 + }, + { + "epoch": 1.4653871037308606, + "grad_norm": 0.1800110638141632, + "learning_rate": 7.861616139294986e-05, + "loss": 1.9921, + "step": 3397 + }, + { + "epoch": 1.4658184170800086, + "grad_norm": 0.18481217324733734, + "learning_rate": 7.858207275011626e-05, + "loss": 2.0483, + "step": 3398 + }, + { + "epoch": 1.4662497304291569, + "grad_norm": 0.19453781843185425, + "learning_rate": 7.854798336557535e-05, + "loss": 1.7871, + "step": 3399 + }, + { + "epoch": 1.4666810437783049, + "grad_norm": 0.1700371354818344, + "learning_rate": 7.851389324638567e-05, + "loss": 1.9453, + "step": 3400 + }, + { + "epoch": 1.4666810437783049, + "eval_loss": 2.0953893661499023, + "eval_runtime": 199.7551, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 3400 + }, + { + "epoch": 1.4671123571274531, + "grad_norm": 0.1720868945121765, + "learning_rate": 7.847980239960598e-05, + "loss": 2.1746, + "step": 3401 + }, + { + "epoch": 1.4675436704766012, + "grad_norm": 0.1633797287940979, + "learning_rate": 7.844571083229514e-05, + "loss": 1.9359, + "step": 3402 + }, + { + "epoch": 1.4679749838257494, + "grad_norm": 0.1909305900335312, + "learning_rate": 7.841161855151221e-05, + "loss": 1.981, + "step": 3403 + }, + { + "epoch": 1.4684062971748975, + "grad_norm": 14.385684967041016, + "learning_rate": 7.837752556431632e-05, + "loss": 2.1872, + "step": 3404 + }, + { + "epoch": 1.4688376105240457, + "grad_norm": 0.17449794709682465, + "learning_rate": 7.834343187776683e-05, + "loss": 2.1808, + "step": 3405 + }, + { + "epoch": 1.469268923873194, + "grad_norm": 0.18831457197666168, + "learning_rate": 7.830933749892323e-05, + "loss": 2.2185, + "step": 3406 + }, + { + "epoch": 1.469700237222342, + "grad_norm": 0.18120522797107697, + "learning_rate": 7.827524243484509e-05, + "loss": 2.0987, + "step": 3407 + }, + { + "epoch": 1.4701315505714903, + "grad_norm": 0.19174909591674805, + "learning_rate": 7.82411466925922e-05, + "loss": 2.4923, + "step": 3408 + }, + { + "epoch": 1.4705628639206383, + "grad_norm": 0.1888199895620346, + "learning_rate": 7.820705027922442e-05, + "loss": 2.2071, + "step": 3409 + }, + { + "epoch": 1.4709941772697865, + "grad_norm": 0.1881730556488037, + "learning_rate": 7.817295320180183e-05, + "loss": 1.9944, + "step": 3410 + }, + { + "epoch": 1.4714254906189346, + "grad_norm": 0.19050532579421997, + "learning_rate": 7.813885546738457e-05, + "loss": 2.2363, + "step": 3411 + }, + { + "epoch": 1.4718568039680828, + "grad_norm": 0.18670903146266937, + "learning_rate": 7.810475708303294e-05, + "loss": 2.0754, + "step": 3412 + }, + { + "epoch": 1.4722881173172309, + "grad_norm": 0.18117989599704742, + "learning_rate": 7.807065805580745e-05, + "loss": 2.1655, + "step": 3413 + }, + { + "epoch": 1.4727194306663791, + "grad_norm": 0.16975900530815125, + "learning_rate": 7.803655839276858e-05, + "loss": 2.1339, + "step": 3414 + }, + { + "epoch": 1.4731507440155274, + "grad_norm": 0.18406729400157928, + "learning_rate": 7.800245810097714e-05, + "loss": 2.1403, + "step": 3415 + }, + { + "epoch": 1.4735820573646754, + "grad_norm": 0.16810061037540436, + "learning_rate": 7.796835718749388e-05, + "loss": 1.9413, + "step": 3416 + }, + { + "epoch": 1.4740133707138237, + "grad_norm": 0.20537112653255463, + "learning_rate": 7.793425565937982e-05, + "loss": 2.0279, + "step": 3417 + }, + { + "epoch": 1.4744446840629717, + "grad_norm": 0.30291008949279785, + "learning_rate": 7.790015352369603e-05, + "loss": 2.292, + "step": 3418 + }, + { + "epoch": 1.47487599741212, + "grad_norm": 0.18326658010482788, + "learning_rate": 7.786605078750374e-05, + "loss": 2.1898, + "step": 3419 + }, + { + "epoch": 1.4753073107612682, + "grad_norm": 0.18363992869853973, + "learning_rate": 7.783194745786429e-05, + "loss": 2.2315, + "step": 3420 + }, + { + "epoch": 1.4757386241104162, + "grad_norm": 0.17302939295768738, + "learning_rate": 7.779784354183912e-05, + "loss": 2.2566, + "step": 3421 + }, + { + "epoch": 1.4761699374595643, + "grad_norm": 0.19719082117080688, + "learning_rate": 7.77637390464899e-05, + "loss": 2.0581, + "step": 3422 + }, + { + "epoch": 1.4766012508087125, + "grad_norm": 0.20065642893314362, + "learning_rate": 7.772963397887825e-05, + "loss": 2.2128, + "step": 3423 + }, + { + "epoch": 1.4770325641578608, + "grad_norm": 0.1718757152557373, + "learning_rate": 7.769552834606606e-05, + "loss": 2.0527, + "step": 3424 + }, + { + "epoch": 1.4774638775070088, + "grad_norm": 0.19068890810012817, + "learning_rate": 7.766142215511523e-05, + "loss": 2.0046, + "step": 3425 + }, + { + "epoch": 1.4774638775070088, + "eval_loss": 2.095679998397827, + "eval_runtime": 198.9871, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 3425 + }, + { + "epoch": 1.477895190856157, + "grad_norm": 0.17807812988758087, + "learning_rate": 7.762731541308785e-05, + "loss": 2.0032, + "step": 3426 + }, + { + "epoch": 1.478326504205305, + "grad_norm": 0.19017374515533447, + "learning_rate": 7.759320812704607e-05, + "loss": 2.3483, + "step": 3427 + }, + { + "epoch": 1.4787578175544533, + "grad_norm": 0.19937926530838013, + "learning_rate": 7.755910030405217e-05, + "loss": 1.9971, + "step": 3428 + }, + { + "epoch": 1.4791891309036016, + "grad_norm": 0.1809537261724472, + "learning_rate": 7.752499195116861e-05, + "loss": 2.0348, + "step": 3429 + }, + { + "epoch": 1.4796204442527496, + "grad_norm": 0.19054853916168213, + "learning_rate": 7.749088307545783e-05, + "loss": 2.1322, + "step": 3430 + }, + { + "epoch": 1.4800517576018977, + "grad_norm": 0.19482950866222382, + "learning_rate": 7.745677368398248e-05, + "loss": 2.1562, + "step": 3431 + }, + { + "epoch": 1.480483070951046, + "grad_norm": 0.18106971681118011, + "learning_rate": 7.742266378380524e-05, + "loss": 2.2104, + "step": 3432 + }, + { + "epoch": 1.4809143843001942, + "grad_norm": 0.17390477657318115, + "learning_rate": 7.7388553381989e-05, + "loss": 2.2025, + "step": 3433 + }, + { + "epoch": 1.4813456976493422, + "grad_norm": 0.1953706592321396, + "learning_rate": 7.735444248559666e-05, + "loss": 2.2455, + "step": 3434 + }, + { + "epoch": 1.4817770109984905, + "grad_norm": 0.18527862429618835, + "learning_rate": 7.732033110169123e-05, + "loss": 2.1696, + "step": 3435 + }, + { + "epoch": 1.4822083243476385, + "grad_norm": 0.19061794877052307, + "learning_rate": 7.728621923733592e-05, + "loss": 2.547, + "step": 3436 + }, + { + "epoch": 1.4826396376967867, + "grad_norm": 0.17946751415729523, + "learning_rate": 7.725210689959387e-05, + "loss": 2.134, + "step": 3437 + }, + { + "epoch": 1.483070951045935, + "grad_norm": 0.19174616038799286, + "learning_rate": 7.721799409552849e-05, + "loss": 2.0701, + "step": 3438 + }, + { + "epoch": 1.483502264395083, + "grad_norm": 0.18157075345516205, + "learning_rate": 7.718388083220315e-05, + "loss": 2.4425, + "step": 3439 + }, + { + "epoch": 1.483933577744231, + "grad_norm": 0.1765347272157669, + "learning_rate": 7.71497671166814e-05, + "loss": 2.1589, + "step": 3440 + }, + { + "epoch": 1.4843648910933793, + "grad_norm": 0.18968167901039124, + "learning_rate": 7.711565295602689e-05, + "loss": 2.2585, + "step": 3441 + }, + { + "epoch": 1.4847962044425276, + "grad_norm": 0.2010895311832428, + "learning_rate": 7.708153835730329e-05, + "loss": 2.373, + "step": 3442 + }, + { + "epoch": 1.4852275177916756, + "grad_norm": 0.17231179773807526, + "learning_rate": 7.70474233275744e-05, + "loss": 2.1558, + "step": 3443 + }, + { + "epoch": 1.4856588311408239, + "grad_norm": 0.17638081312179565, + "learning_rate": 7.70133078739041e-05, + "loss": 2.1843, + "step": 3444 + }, + { + "epoch": 1.486090144489972, + "grad_norm": 0.18057771027088165, + "learning_rate": 7.697919200335643e-05, + "loss": 2.0534, + "step": 3445 + }, + { + "epoch": 1.4865214578391202, + "grad_norm": 0.1979825794696808, + "learning_rate": 7.694507572299537e-05, + "loss": 2.3043, + "step": 3446 + }, + { + "epoch": 1.4869527711882684, + "grad_norm": 0.18936648964881897, + "learning_rate": 7.691095903988515e-05, + "loss": 1.9956, + "step": 3447 + }, + { + "epoch": 1.4873840845374164, + "grad_norm": 0.1901467740535736, + "learning_rate": 7.687684196108992e-05, + "loss": 2.2853, + "step": 3448 + }, + { + "epoch": 1.4878153978865645, + "grad_norm": 0.1820063441991806, + "learning_rate": 7.684272449367407e-05, + "loss": 2.0876, + "step": 3449 + }, + { + "epoch": 1.4882467112357127, + "grad_norm": 0.17177510261535645, + "learning_rate": 7.680860664470195e-05, + "loss": 2.0898, + "step": 3450 + }, + { + "epoch": 1.4882467112357127, + "eval_loss": 2.095259428024292, + "eval_runtime": 199.7432, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 3450 + }, + { + "epoch": 1.488678024584861, + "grad_norm": 0.18127751350402832, + "learning_rate": 7.677448842123805e-05, + "loss": 2.1625, + "step": 3451 + }, + { + "epoch": 1.489109337934009, + "grad_norm": 0.18150100111961365, + "learning_rate": 7.674036983034692e-05, + "loss": 2.0997, + "step": 3452 + }, + { + "epoch": 1.4895406512831573, + "grad_norm": 0.18290740251541138, + "learning_rate": 7.670625087909316e-05, + "loss": 2.194, + "step": 3453 + }, + { + "epoch": 1.4899719646323053, + "grad_norm": 0.17931239306926727, + "learning_rate": 7.667213157454153e-05, + "loss": 2.0456, + "step": 3454 + }, + { + "epoch": 1.4904032779814536, + "grad_norm": 0.18845637142658234, + "learning_rate": 7.663801192375675e-05, + "loss": 2.4016, + "step": 3455 + }, + { + "epoch": 1.4908345913306018, + "grad_norm": 0.1789526492357254, + "learning_rate": 7.66038919338037e-05, + "loss": 2.1449, + "step": 3456 + }, + { + "epoch": 1.4912659046797498, + "grad_norm": 0.21393506228923798, + "learning_rate": 7.656977161174728e-05, + "loss": 2.1899, + "step": 3457 + }, + { + "epoch": 1.4916972180288979, + "grad_norm": 0.1698625236749649, + "learning_rate": 7.653565096465247e-05, + "loss": 2.0487, + "step": 3458 + }, + { + "epoch": 1.4921285313780461, + "grad_norm": 0.19428661465644836, + "learning_rate": 7.650152999958436e-05, + "loss": 2.0786, + "step": 3459 + }, + { + "epoch": 1.4925598447271944, + "grad_norm": 0.16949570178985596, + "learning_rate": 7.646740872360802e-05, + "loss": 2.1475, + "step": 3460 + }, + { + "epoch": 1.4929911580763424, + "grad_norm": 0.18819738924503326, + "learning_rate": 7.643328714378866e-05, + "loss": 2.1348, + "step": 3461 + }, + { + "epoch": 1.4934224714254907, + "grad_norm": 0.17552988231182098, + "learning_rate": 7.639916526719153e-05, + "loss": 2.1711, + "step": 3462 + }, + { + "epoch": 1.4938537847746387, + "grad_norm": 0.17839846014976501, + "learning_rate": 7.636504310088192e-05, + "loss": 2.0626, + "step": 3463 + }, + { + "epoch": 1.494285098123787, + "grad_norm": 0.161084845662117, + "learning_rate": 7.633092065192523e-05, + "loss": 1.9977, + "step": 3464 + }, + { + "epoch": 1.4947164114729352, + "grad_norm": 0.22303293645381927, + "learning_rate": 7.629679792738686e-05, + "loss": 2.1712, + "step": 3465 + }, + { + "epoch": 1.4951477248220832, + "grad_norm": 0.17171968519687653, + "learning_rate": 7.626267493433227e-05, + "loss": 2.1384, + "step": 3466 + }, + { + "epoch": 1.4955790381712313, + "grad_norm": 0.1698496788740158, + "learning_rate": 7.622855167982703e-05, + "loss": 2.2687, + "step": 3467 + }, + { + "epoch": 1.4960103515203795, + "grad_norm": 0.17629973590373993, + "learning_rate": 7.619442817093677e-05, + "loss": 2.0812, + "step": 3468 + }, + { + "epoch": 1.4964416648695278, + "grad_norm": 0.17922022938728333, + "learning_rate": 7.616030441472706e-05, + "loss": 2.306, + "step": 3469 + }, + { + "epoch": 1.4968729782186758, + "grad_norm": 0.18257419764995575, + "learning_rate": 7.612618041826366e-05, + "loss": 2.1525, + "step": 3470 + }, + { + "epoch": 1.497304291567824, + "grad_norm": 0.17645780742168427, + "learning_rate": 7.609205618861225e-05, + "loss": 2.0277, + "step": 3471 + }, + { + "epoch": 1.497735604916972, + "grad_norm": 0.2862652540206909, + "learning_rate": 7.605793173283869e-05, + "loss": 1.8123, + "step": 3472 + }, + { + "epoch": 1.4981669182661204, + "grad_norm": 0.28746744990348816, + "learning_rate": 7.602380705800879e-05, + "loss": 2.2131, + "step": 3473 + }, + { + "epoch": 1.4985982316152686, + "grad_norm": 0.17240886390209198, + "learning_rate": 7.598968217118841e-05, + "loss": 2.3652, + "step": 3474 + }, + { + "epoch": 1.4990295449644166, + "grad_norm": 0.1765579879283905, + "learning_rate": 7.595555707944358e-05, + "loss": 2.2843, + "step": 3475 + }, + { + "epoch": 1.4990295449644166, + "eval_loss": 2.0947535037994385, + "eval_runtime": 199.7196, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 3475 + }, + { + "epoch": 1.4994608583135647, + "grad_norm": 0.17466917634010315, + "learning_rate": 7.592143178984015e-05, + "loss": 2.0196, + "step": 3476 + }, + { + "epoch": 1.499892171662713, + "grad_norm": 0.1990082859992981, + "learning_rate": 7.588730630944424e-05, + "loss": 2.3536, + "step": 3477 + }, + { + "epoch": 1.5003234850118612, + "grad_norm": 0.19265393912792206, + "learning_rate": 7.58531806453218e-05, + "loss": 2.0515, + "step": 3478 + }, + { + "epoch": 1.5007547983610092, + "grad_norm": 0.1856214702129364, + "learning_rate": 7.581905480453901e-05, + "loss": 2.1362, + "step": 3479 + }, + { + "epoch": 1.5011861117101575, + "grad_norm": 0.16783815622329712, + "learning_rate": 7.578492879416196e-05, + "loss": 2.2121, + "step": 3480 + }, + { + "epoch": 1.5016174250593055, + "grad_norm": 0.1762775331735611, + "learning_rate": 7.575080262125684e-05, + "loss": 1.9548, + "step": 3481 + }, + { + "epoch": 1.5020487384084538, + "grad_norm": 0.18140533566474915, + "learning_rate": 7.571667629288979e-05, + "loss": 2.0891, + "step": 3482 + }, + { + "epoch": 1.502480051757602, + "grad_norm": 0.1749914437532425, + "learning_rate": 7.568254981612709e-05, + "loss": 2.1629, + "step": 3483 + }, + { + "epoch": 1.50291136510675, + "grad_norm": 0.20092450082302094, + "learning_rate": 7.564842319803501e-05, + "loss": 2.183, + "step": 3484 + }, + { + "epoch": 1.503342678455898, + "grad_norm": 0.18456554412841797, + "learning_rate": 7.561429644567977e-05, + "loss": 2.2325, + "step": 3485 + }, + { + "epoch": 1.5037739918050463, + "grad_norm": 0.189732164144516, + "learning_rate": 7.558016956612779e-05, + "loss": 2.2595, + "step": 3486 + }, + { + "epoch": 1.5042053051541946, + "grad_norm": 0.179192915558815, + "learning_rate": 7.55460425664453e-05, + "loss": 2.1265, + "step": 3487 + }, + { + "epoch": 1.5046366185033426, + "grad_norm": 0.17854931950569153, + "learning_rate": 7.551191545369876e-05, + "loss": 2.2346, + "step": 3488 + }, + { + "epoch": 1.5050679318524909, + "grad_norm": 0.21214659512043, + "learning_rate": 7.547778823495452e-05, + "loss": 2.2265, + "step": 3489 + }, + { + "epoch": 1.505499245201639, + "grad_norm": 0.17340655624866486, + "learning_rate": 7.544366091727897e-05, + "loss": 2.3306, + "step": 3490 + }, + { + "epoch": 1.5059305585507872, + "grad_norm": 0.18704965710639954, + "learning_rate": 7.540953350773863e-05, + "loss": 2.2339, + "step": 3491 + }, + { + "epoch": 1.5063618718999354, + "grad_norm": 0.17873235046863556, + "learning_rate": 7.537540601339985e-05, + "loss": 2.1256, + "step": 3492 + }, + { + "epoch": 1.5067931852490835, + "grad_norm": 0.16924327611923218, + "learning_rate": 7.53412784413292e-05, + "loss": 2.0724, + "step": 3493 + }, + { + "epoch": 1.5072244985982315, + "grad_norm": 0.18192622065544128, + "learning_rate": 7.530715079859307e-05, + "loss": 2.1323, + "step": 3494 + }, + { + "epoch": 1.5076558119473797, + "grad_norm": 0.17550119757652283, + "learning_rate": 7.527302309225803e-05, + "loss": 1.9776, + "step": 3495 + }, + { + "epoch": 1.508087125296528, + "grad_norm": 0.186650350689888, + "learning_rate": 7.523889532939057e-05, + "loss": 2.4068, + "step": 3496 + }, + { + "epoch": 1.508518438645676, + "grad_norm": 0.1668436974287033, + "learning_rate": 7.520476751705721e-05, + "loss": 2.0524, + "step": 3497 + }, + { + "epoch": 1.5089497519948243, + "grad_norm": 0.186021625995636, + "learning_rate": 7.517063966232452e-05, + "loss": 2.1887, + "step": 3498 + }, + { + "epoch": 1.5093810653439723, + "grad_norm": 0.20452392101287842, + "learning_rate": 7.513651177225899e-05, + "loss": 2.0384, + "step": 3499 + }, + { + "epoch": 1.5098123786931206, + "grad_norm": 0.18916206061840057, + "learning_rate": 7.510238385392724e-05, + "loss": 2.2156, + "step": 3500 + }, + { + "epoch": 1.5098123786931206, + "eval_loss": 2.0948498249053955, + "eval_runtime": 199.8624, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 3500 + }, + { + "epoch": 1.5102436920422688, + "grad_norm": 0.19154448807239532, + "learning_rate": 7.506825591439576e-05, + "loss": 2.2353, + "step": 3501 + }, + { + "epoch": 1.5106750053914169, + "grad_norm": 0.1821613907814026, + "learning_rate": 7.503412796073116e-05, + "loss": 2.1699, + "step": 3502 + }, + { + "epoch": 1.511106318740565, + "grad_norm": 0.1843271255493164, + "learning_rate": 7.5e-05, + "loss": 2.1596, + "step": 3503 + }, + { + "epoch": 1.5115376320897131, + "grad_norm": 0.1836938112974167, + "learning_rate": 7.496587203926884e-05, + "loss": 1.9115, + "step": 3504 + }, + { + "epoch": 1.5119689454388614, + "grad_norm": 0.1645289808511734, + "learning_rate": 7.493174408560423e-05, + "loss": 2.0508, + "step": 3505 + }, + { + "epoch": 1.5124002587880097, + "grad_norm": 0.18941354751586914, + "learning_rate": 7.489761614607276e-05, + "loss": 2.2317, + "step": 3506 + }, + { + "epoch": 1.5128315721371577, + "grad_norm": 0.17841602861881256, + "learning_rate": 7.486348822774101e-05, + "loss": 2.2352, + "step": 3507 + }, + { + "epoch": 1.5132628854863057, + "grad_norm": 0.20502035319805145, + "learning_rate": 7.48293603376755e-05, + "loss": 2.2623, + "step": 3508 + }, + { + "epoch": 1.513694198835454, + "grad_norm": 0.18774735927581787, + "learning_rate": 7.479523248294277e-05, + "loss": 2.1597, + "step": 3509 + }, + { + "epoch": 1.5141255121846022, + "grad_norm": 0.17617200314998627, + "learning_rate": 7.476110467060943e-05, + "loss": 2.013, + "step": 3510 + }, + { + "epoch": 1.5145568255337503, + "grad_norm": 0.17860938608646393, + "learning_rate": 7.472697690774197e-05, + "loss": 2.1755, + "step": 3511 + }, + { + "epoch": 1.5149881388828983, + "grad_norm": 0.21400243043899536, + "learning_rate": 7.469284920140692e-05, + "loss": 2.1347, + "step": 3512 + }, + { + "epoch": 1.5154194522320465, + "grad_norm": 0.18597297370433807, + "learning_rate": 7.465872155867081e-05, + "loss": 2.2397, + "step": 3513 + }, + { + "epoch": 1.5158507655811948, + "grad_norm": 0.17704488337039948, + "learning_rate": 7.462459398660014e-05, + "loss": 2.127, + "step": 3514 + }, + { + "epoch": 1.516282078930343, + "grad_norm": 0.19206072390079498, + "learning_rate": 7.459046649226138e-05, + "loss": 2.3177, + "step": 3515 + }, + { + "epoch": 1.516713392279491, + "grad_norm": 0.17340447008609772, + "learning_rate": 7.4556339082721e-05, + "loss": 1.8396, + "step": 3516 + }, + { + "epoch": 1.5171447056286391, + "grad_norm": 0.19418282806873322, + "learning_rate": 7.452221176504548e-05, + "loss": 1.8297, + "step": 3517 + }, + { + "epoch": 1.5175760189777874, + "grad_norm": 0.2816286087036133, + "learning_rate": 7.448808454630124e-05, + "loss": 2.2843, + "step": 3518 + }, + { + "epoch": 1.5180073323269356, + "grad_norm": 0.1821245402097702, + "learning_rate": 7.445395743355468e-05, + "loss": 2.1009, + "step": 3519 + }, + { + "epoch": 1.5184386456760837, + "grad_norm": 0.19121207296848297, + "learning_rate": 7.441983043387221e-05, + "loss": 2.0815, + "step": 3520 + }, + { + "epoch": 1.5188699590252317, + "grad_norm": 0.18853315711021423, + "learning_rate": 7.438570355432022e-05, + "loss": 2.3358, + "step": 3521 + }, + { + "epoch": 1.51930127237438, + "grad_norm": 0.19356144964694977, + "learning_rate": 7.435157680196502e-05, + "loss": 2.3547, + "step": 3522 + }, + { + "epoch": 1.5197325857235282, + "grad_norm": 0.18097326159477234, + "learning_rate": 7.43174501838729e-05, + "loss": 2.0562, + "step": 3523 + }, + { + "epoch": 1.5201638990726765, + "grad_norm": 0.18228799104690552, + "learning_rate": 7.42833237071102e-05, + "loss": 2.1742, + "step": 3524 + }, + { + "epoch": 1.5205952124218245, + "grad_norm": 0.17227162420749664, + "learning_rate": 7.424919737874316e-05, + "loss": 2.0236, + "step": 3525 + }, + { + "epoch": 1.5205952124218245, + "eval_loss": 2.0941295623779297, + "eval_runtime": 201.3088, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 3525 + }, + { + "epoch": 1.5210265257709725, + "grad_norm": 0.20134098827838898, + "learning_rate": 7.421507120583801e-05, + "loss": 2.143, + "step": 3526 + }, + { + "epoch": 1.5214578391201208, + "grad_norm": 0.18537122011184692, + "learning_rate": 7.418094519546099e-05, + "loss": 2.3952, + "step": 3527 + }, + { + "epoch": 1.521889152469269, + "grad_norm": 0.21926040947437286, + "learning_rate": 7.414681935467819e-05, + "loss": 2.2343, + "step": 3528 + }, + { + "epoch": 1.522320465818417, + "grad_norm": 0.17132803797721863, + "learning_rate": 7.411269369055577e-05, + "loss": 2.0471, + "step": 3529 + }, + { + "epoch": 1.522751779167565, + "grad_norm": 0.19180268049240112, + "learning_rate": 7.407856821015984e-05, + "loss": 2.1841, + "step": 3530 + }, + { + "epoch": 1.5231830925167134, + "grad_norm": 0.17654922604560852, + "learning_rate": 7.404444292055642e-05, + "loss": 2.1316, + "step": 3531 + }, + { + "epoch": 1.5236144058658616, + "grad_norm": 0.20351848006248474, + "learning_rate": 7.401031782881157e-05, + "loss": 2.1818, + "step": 3532 + }, + { + "epoch": 1.5240457192150099, + "grad_norm": 0.172661691904068, + "learning_rate": 7.397619294199123e-05, + "loss": 2.1867, + "step": 3533 + }, + { + "epoch": 1.524477032564158, + "grad_norm": 0.19422678649425507, + "learning_rate": 7.394206826716131e-05, + "loss": 2.024, + "step": 3534 + }, + { + "epoch": 1.524908345913306, + "grad_norm": 0.18358619511127472, + "learning_rate": 7.390794381138775e-05, + "loss": 2.2407, + "step": 3535 + }, + { + "epoch": 1.5253396592624542, + "grad_norm": 0.18279418349266052, + "learning_rate": 7.387381958173636e-05, + "loss": 2.1805, + "step": 3536 + }, + { + "epoch": 1.5257709726116024, + "grad_norm": 0.17657078802585602, + "learning_rate": 7.383969558527293e-05, + "loss": 2.171, + "step": 3537 + }, + { + "epoch": 1.5262022859607505, + "grad_norm": 0.18388250470161438, + "learning_rate": 7.380557182906323e-05, + "loss": 2.073, + "step": 3538 + }, + { + "epoch": 1.5266335993098985, + "grad_norm": 0.18958689272403717, + "learning_rate": 7.377144832017296e-05, + "loss": 2.1475, + "step": 3539 + }, + { + "epoch": 1.5270649126590468, + "grad_norm": 0.16585297882556915, + "learning_rate": 7.373732506566773e-05, + "loss": 2.0459, + "step": 3540 + }, + { + "epoch": 1.527496226008195, + "grad_norm": 0.182152658700943, + "learning_rate": 7.370320207261315e-05, + "loss": 2.0672, + "step": 3541 + }, + { + "epoch": 1.5279275393573433, + "grad_norm": 0.17660200595855713, + "learning_rate": 7.366907934807477e-05, + "loss": 2.2686, + "step": 3542 + }, + { + "epoch": 1.5283588527064913, + "grad_norm": 0.1848592460155487, + "learning_rate": 7.363495689911807e-05, + "loss": 2.2825, + "step": 3543 + }, + { + "epoch": 1.5287901660556393, + "grad_norm": 0.18739467859268188, + "learning_rate": 7.360083473280844e-05, + "loss": 2.2374, + "step": 3544 + }, + { + "epoch": 1.5292214794047876, + "grad_norm": 0.17787355184555054, + "learning_rate": 7.356671285621132e-05, + "loss": 2.1722, + "step": 3545 + }, + { + "epoch": 1.5296527927539358, + "grad_norm": 0.182230144739151, + "learning_rate": 7.353259127639198e-05, + "loss": 2.2587, + "step": 3546 + }, + { + "epoch": 1.5300841061030839, + "grad_norm": 0.1774773746728897, + "learning_rate": 7.349847000041566e-05, + "loss": 2.1517, + "step": 3547 + }, + { + "epoch": 1.530515419452232, + "grad_norm": 0.18556547164916992, + "learning_rate": 7.346434903534751e-05, + "loss": 2.2128, + "step": 3548 + }, + { + "epoch": 1.5309467328013802, + "grad_norm": 0.17297889292240143, + "learning_rate": 7.343022838825272e-05, + "loss": 2.0662, + "step": 3549 + }, + { + "epoch": 1.5313780461505284, + "grad_norm": 0.1797797530889511, + "learning_rate": 7.33961080661963e-05, + "loss": 2.256, + "step": 3550 + }, + { + "epoch": 1.5313780461505284, + "eval_loss": 2.0944180488586426, + "eval_runtime": 201.7022, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 3550 + }, + { + "epoch": 1.5318093594996767, + "grad_norm": 0.18436093628406525, + "learning_rate": 7.336198807624322e-05, + "loss": 2.1237, + "step": 3551 + }, + { + "epoch": 1.5322406728488247, + "grad_norm": 0.19445446133613586, + "learning_rate": 7.332786842545845e-05, + "loss": 2.3032, + "step": 3552 + }, + { + "epoch": 1.5326719861979727, + "grad_norm": 0.16594868898391724, + "learning_rate": 7.329374912090683e-05, + "loss": 2.1842, + "step": 3553 + }, + { + "epoch": 1.533103299547121, + "grad_norm": 0.28609853982925415, + "learning_rate": 7.325963016965308e-05, + "loss": 2.1166, + "step": 3554 + }, + { + "epoch": 1.5335346128962692, + "grad_norm": 0.19114753603935242, + "learning_rate": 7.322551157876194e-05, + "loss": 1.9692, + "step": 3555 + }, + { + "epoch": 1.5339659262454173, + "grad_norm": 0.18666251003742218, + "learning_rate": 7.319139335529804e-05, + "loss": 2.1514, + "step": 3556 + }, + { + "epoch": 1.5343972395945653, + "grad_norm": 0.1783144474029541, + "learning_rate": 7.315727550632593e-05, + "loss": 2.049, + "step": 3557 + }, + { + "epoch": 1.5348285529437136, + "grad_norm": 0.18049302697181702, + "learning_rate": 7.312315803891005e-05, + "loss": 1.9799, + "step": 3558 + }, + { + "epoch": 1.5352598662928618, + "grad_norm": 0.1771216243505478, + "learning_rate": 7.308904096011485e-05, + "loss": 2.2921, + "step": 3559 + }, + { + "epoch": 1.53569117964201, + "grad_norm": 0.17605052888393402, + "learning_rate": 7.305492427700463e-05, + "loss": 2.3114, + "step": 3560 + }, + { + "epoch": 1.536122492991158, + "grad_norm": 0.19665959477424622, + "learning_rate": 7.30208079966436e-05, + "loss": 2.288, + "step": 3561 + }, + { + "epoch": 1.5365538063403061, + "grad_norm": 0.18665872514247894, + "learning_rate": 7.298669212609588e-05, + "loss": 2.1297, + "step": 3562 + }, + { + "epoch": 1.5369851196894544, + "grad_norm": 0.17894567549228668, + "learning_rate": 7.295257667242561e-05, + "loss": 2.1004, + "step": 3563 + }, + { + "epoch": 1.5374164330386026, + "grad_norm": 0.19599032402038574, + "learning_rate": 7.291846164269671e-05, + "loss": 2.1506, + "step": 3564 + }, + { + "epoch": 1.5378477463877507, + "grad_norm": 0.17925117909908295, + "learning_rate": 7.288434704397308e-05, + "loss": 2.3149, + "step": 3565 + }, + { + "epoch": 1.5382790597368987, + "grad_norm": 0.17601613700389862, + "learning_rate": 7.285023288331858e-05, + "loss": 2.0923, + "step": 3566 + }, + { + "epoch": 1.538710373086047, + "grad_norm": 0.27391019463539124, + "learning_rate": 7.281611916779685e-05, + "loss": 2.2239, + "step": 3567 + }, + { + "epoch": 1.5391416864351952, + "grad_norm": 0.18218959867954254, + "learning_rate": 7.278200590447153e-05, + "loss": 2.1551, + "step": 3568 + }, + { + "epoch": 1.5395729997843435, + "grad_norm": 0.18566249310970306, + "learning_rate": 7.274789310040612e-05, + "loss": 2.1783, + "step": 3569 + }, + { + "epoch": 1.5400043131334915, + "grad_norm": 0.19422903656959534, + "learning_rate": 7.271378076266408e-05, + "loss": 2.1221, + "step": 3570 + }, + { + "epoch": 1.5404356264826395, + "grad_norm": 0.17731690406799316, + "learning_rate": 7.267966889830876e-05, + "loss": 2.2625, + "step": 3571 + }, + { + "epoch": 1.5408669398317878, + "grad_norm": 0.17667533457279205, + "learning_rate": 7.264555751440332e-05, + "loss": 1.8725, + "step": 3572 + }, + { + "epoch": 1.541298253180936, + "grad_norm": 0.17535041272640228, + "learning_rate": 7.261144661801099e-05, + "loss": 2.143, + "step": 3573 + }, + { + "epoch": 1.541729566530084, + "grad_norm": 0.19393585622310638, + "learning_rate": 7.257733621619474e-05, + "loss": 2.2788, + "step": 3574 + }, + { + "epoch": 1.5421608798792321, + "grad_norm": 0.18830327689647675, + "learning_rate": 7.254322631601752e-05, + "loss": 2.1969, + "step": 3575 + }, + { + "epoch": 1.5421608798792321, + "eval_loss": 2.0939698219299316, + "eval_runtime": 206.7306, + "eval_samples_per_second": 0.155, + "eval_steps_per_second": 0.155, + "step": 3575 + }, + { + "epoch": 1.5425921932283804, + "grad_norm": 0.16435548663139343, + "learning_rate": 7.250911692454216e-05, + "loss": 2.0484, + "step": 3576 + }, + { + "epoch": 1.5430235065775286, + "grad_norm": 0.20261076092720032, + "learning_rate": 7.247500804883138e-05, + "loss": 2.3425, + "step": 3577 + }, + { + "epoch": 1.5434548199266769, + "grad_norm": 0.22808308899402618, + "learning_rate": 7.244089969594782e-05, + "loss": 2.2497, + "step": 3578 + }, + { + "epoch": 1.543886133275825, + "grad_norm": 0.2001747488975525, + "learning_rate": 7.240679187295394e-05, + "loss": 2.3458, + "step": 3579 + }, + { + "epoch": 1.544317446624973, + "grad_norm": 0.19855140149593353, + "learning_rate": 7.237268458691216e-05, + "loss": 2.085, + "step": 3580 + }, + { + "epoch": 1.5447487599741212, + "grad_norm": 0.18147879838943481, + "learning_rate": 7.233857784488476e-05, + "loss": 2.1479, + "step": 3581 + }, + { + "epoch": 1.5451800733232695, + "grad_norm": 0.16147656738758087, + "learning_rate": 7.230447165393394e-05, + "loss": 2.3397, + "step": 3582 + }, + { + "epoch": 1.5456113866724175, + "grad_norm": 0.19551271200180054, + "learning_rate": 7.227036602112172e-05, + "loss": 2.2916, + "step": 3583 + }, + { + "epoch": 1.5460427000215655, + "grad_norm": 0.19639210402965546, + "learning_rate": 7.22362609535101e-05, + "loss": 2.1183, + "step": 3584 + }, + { + "epoch": 1.5464740133707138, + "grad_norm": 0.1812535971403122, + "learning_rate": 7.220215645816086e-05, + "loss": 2.4531, + "step": 3585 + }, + { + "epoch": 1.546905326719862, + "grad_norm": 0.19537201523780823, + "learning_rate": 7.216805254213573e-05, + "loss": 2.1848, + "step": 3586 + }, + { + "epoch": 1.5473366400690103, + "grad_norm": 0.17963844537734985, + "learning_rate": 7.213394921249626e-05, + "loss": 2.1498, + "step": 3587 + }, + { + "epoch": 1.5477679534181583, + "grad_norm": 0.17501026391983032, + "learning_rate": 7.209984647630397e-05, + "loss": 2.2232, + "step": 3588 + }, + { + "epoch": 1.5481992667673063, + "grad_norm": 0.20665574073791504, + "learning_rate": 7.206574434062018e-05, + "loss": 2.1346, + "step": 3589 + }, + { + "epoch": 1.5486305801164546, + "grad_norm": 0.18055406212806702, + "learning_rate": 7.20316428125061e-05, + "loss": 1.9602, + "step": 3590 + }, + { + "epoch": 1.5490618934656029, + "grad_norm": 0.18960954248905182, + "learning_rate": 7.199754189902286e-05, + "loss": 2.3107, + "step": 3591 + }, + { + "epoch": 1.549493206814751, + "grad_norm": 0.17563574016094208, + "learning_rate": 7.19634416072314e-05, + "loss": 1.9254, + "step": 3592 + }, + { + "epoch": 1.549924520163899, + "grad_norm": 0.17406399548053741, + "learning_rate": 7.192934194419255e-05, + "loss": 2.3134, + "step": 3593 + }, + { + "epoch": 1.5503558335130472, + "grad_norm": 0.19203811883926392, + "learning_rate": 7.189524291696703e-05, + "loss": 2.1172, + "step": 3594 + }, + { + "epoch": 1.5507871468621954, + "grad_norm": 0.19079047441482544, + "learning_rate": 7.186114453261542e-05, + "loss": 2.0095, + "step": 3595 + }, + { + "epoch": 1.5512184602113437, + "grad_norm": 0.1765333116054535, + "learning_rate": 7.182704679819817e-05, + "loss": 2.0992, + "step": 3596 + }, + { + "epoch": 1.5516497735604917, + "grad_norm": 0.18226292729377747, + "learning_rate": 7.179294972077555e-05, + "loss": 2.2139, + "step": 3597 + }, + { + "epoch": 1.5520810869096398, + "grad_norm": 0.1825229823589325, + "learning_rate": 7.175885330740779e-05, + "loss": 2.2704, + "step": 3598 + }, + { + "epoch": 1.552512400258788, + "grad_norm": 0.1848277598619461, + "learning_rate": 7.172475756515491e-05, + "loss": 2.1899, + "step": 3599 + }, + { + "epoch": 1.5529437136079363, + "grad_norm": 0.17505969107151031, + "learning_rate": 7.169066250107678e-05, + "loss": 1.9208, + "step": 3600 + }, + { + "epoch": 1.5529437136079363, + "eval_loss": 2.093771457672119, + "eval_runtime": 201.1603, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 3600 + }, + { + "epoch": 1.5533750269570843, + "grad_norm": 0.19374166429042816, + "learning_rate": 7.165656812223316e-05, + "loss": 2.1802, + "step": 3601 + }, + { + "epoch": 1.5538063403062323, + "grad_norm": 0.18536177277565002, + "learning_rate": 7.162247443568368e-05, + "loss": 2.3327, + "step": 3602 + }, + { + "epoch": 1.5542376536553806, + "grad_norm": 0.19441302120685577, + "learning_rate": 7.15883814484878e-05, + "loss": 2.087, + "step": 3603 + }, + { + "epoch": 1.5546689670045288, + "grad_norm": 0.18043111264705658, + "learning_rate": 7.155428916770484e-05, + "loss": 2.2256, + "step": 3604 + }, + { + "epoch": 1.555100280353677, + "grad_norm": 0.20430094003677368, + "learning_rate": 7.152019760039402e-05, + "loss": 2.2689, + "step": 3605 + }, + { + "epoch": 1.5555315937028251, + "grad_norm": 0.20238332450389862, + "learning_rate": 7.148610675361433e-05, + "loss": 2.3419, + "step": 3606 + }, + { + "epoch": 1.5559629070519732, + "grad_norm": 0.19608813524246216, + "learning_rate": 7.145201663442465e-05, + "loss": 2.2375, + "step": 3607 + }, + { + "epoch": 1.5563942204011214, + "grad_norm": 0.1908615380525589, + "learning_rate": 7.141792724988373e-05, + "loss": 2.2167, + "step": 3608 + }, + { + "epoch": 1.5568255337502697, + "grad_norm": 0.15812508761882782, + "learning_rate": 7.138383860705015e-05, + "loss": 2.2137, + "step": 3609 + }, + { + "epoch": 1.5572568470994177, + "grad_norm": 0.16722245514392853, + "learning_rate": 7.134975071298235e-05, + "loss": 2.1217, + "step": 3610 + }, + { + "epoch": 1.5576881604485657, + "grad_norm": 0.2262086719274521, + "learning_rate": 7.131566357473854e-05, + "loss": 2.1413, + "step": 3611 + }, + { + "epoch": 1.558119473797714, + "grad_norm": 0.1825946569442749, + "learning_rate": 7.128157719937691e-05, + "loss": 2.1258, + "step": 3612 + }, + { + "epoch": 1.5585507871468622, + "grad_norm": 0.18516826629638672, + "learning_rate": 7.124749159395538e-05, + "loss": 2.3571, + "step": 3613 + }, + { + "epoch": 1.5589821004960105, + "grad_norm": 0.8397002816200256, + "learning_rate": 7.121340676553175e-05, + "loss": 2.2926, + "step": 3614 + }, + { + "epoch": 1.5594134138451585, + "grad_norm": 0.18018440902233124, + "learning_rate": 7.117932272116365e-05, + "loss": 2.2957, + "step": 3615 + }, + { + "epoch": 1.5598447271943066, + "grad_norm": 0.1772436648607254, + "learning_rate": 7.114523946790857e-05, + "loss": 2.0046, + "step": 3616 + }, + { + "epoch": 1.5602760405434548, + "grad_norm": 0.18658606708049774, + "learning_rate": 7.111115701282384e-05, + "loss": 2.1106, + "step": 3617 + }, + { + "epoch": 1.560707353892603, + "grad_norm": 0.19837208092212677, + "learning_rate": 7.107707536296652e-05, + "loss": 2.2816, + "step": 3618 + }, + { + "epoch": 1.561138667241751, + "grad_norm": 0.18111276626586914, + "learning_rate": 7.10429945253937e-05, + "loss": 2.2865, + "step": 3619 + }, + { + "epoch": 1.5615699805908991, + "grad_norm": 0.17805011570453644, + "learning_rate": 7.10089145071621e-05, + "loss": 2.0251, + "step": 3620 + }, + { + "epoch": 1.5620012939400474, + "grad_norm": 0.18128082156181335, + "learning_rate": 7.097483531532842e-05, + "loss": 2.1825, + "step": 3621 + }, + { + "epoch": 1.5624326072891956, + "grad_norm": 0.17830413579940796, + "learning_rate": 7.094075695694909e-05, + "loss": 2.236, + "step": 3622 + }, + { + "epoch": 1.562863920638344, + "grad_norm": 0.17792202532291412, + "learning_rate": 7.090667943908044e-05, + "loss": 2.2288, + "step": 3623 + }, + { + "epoch": 1.563295233987492, + "grad_norm": 0.20085816085338593, + "learning_rate": 7.087260276877858e-05, + "loss": 2.1816, + "step": 3624 + }, + { + "epoch": 1.56372654733664, + "grad_norm": 0.18358451128005981, + "learning_rate": 7.083852695309944e-05, + "loss": 2.1069, + "step": 3625 + }, + { + "epoch": 1.56372654733664, + "eval_loss": 2.093684673309326, + "eval_runtime": 201.6838, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 3625 + }, + { + "epoch": 1.5641578606857882, + "grad_norm": 0.17750532925128937, + "learning_rate": 7.08044519990988e-05, + "loss": 2.1724, + "step": 3626 + }, + { + "epoch": 1.5645891740349365, + "grad_norm": 0.201032355427742, + "learning_rate": 7.077037791383226e-05, + "loss": 2.1289, + "step": 3627 + }, + { + "epoch": 1.5650204873840845, + "grad_norm": 0.18252480030059814, + "learning_rate": 7.073630470435522e-05, + "loss": 2.2806, + "step": 3628 + }, + { + "epoch": 1.5654518007332325, + "grad_norm": 0.17951056361198425, + "learning_rate": 7.070223237772291e-05, + "loss": 2.1345, + "step": 3629 + }, + { + "epoch": 1.5658831140823808, + "grad_norm": 0.18737827241420746, + "learning_rate": 7.066816094099042e-05, + "loss": 2.1661, + "step": 3630 + }, + { + "epoch": 1.566314427431529, + "grad_norm": 0.1997365951538086, + "learning_rate": 7.063409040121257e-05, + "loss": 2.1505, + "step": 3631 + }, + { + "epoch": 1.5667457407806773, + "grad_norm": 0.19382046163082123, + "learning_rate": 7.060002076544405e-05, + "loss": 2.1109, + "step": 3632 + }, + { + "epoch": 1.5671770541298253, + "grad_norm": 0.17259393632411957, + "learning_rate": 7.056595204073931e-05, + "loss": 2.1547, + "step": 3633 + }, + { + "epoch": 1.5676083674789734, + "grad_norm": 0.17684011161327362, + "learning_rate": 7.053188423415273e-05, + "loss": 2.2844, + "step": 3634 + }, + { + "epoch": 1.5680396808281216, + "grad_norm": 0.17988254129886627, + "learning_rate": 7.049781735273836e-05, + "loss": 2.0416, + "step": 3635 + }, + { + "epoch": 1.5684709941772699, + "grad_norm": 0.164667010307312, + "learning_rate": 7.046375140355013e-05, + "loss": 2.1502, + "step": 3636 + }, + { + "epoch": 1.568902307526418, + "grad_norm": 0.1714576631784439, + "learning_rate": 7.042968639364181e-05, + "loss": 2.112, + "step": 3637 + }, + { + "epoch": 1.569333620875566, + "grad_norm": 0.1917424499988556, + "learning_rate": 7.039562233006691e-05, + "loss": 2.2119, + "step": 3638 + }, + { + "epoch": 1.5697649342247142, + "grad_norm": 0.16879668831825256, + "learning_rate": 7.036155921987874e-05, + "loss": 1.9152, + "step": 3639 + }, + { + "epoch": 1.5701962475738624, + "grad_norm": 0.1793883591890335, + "learning_rate": 7.032749707013045e-05, + "loss": 2.1579, + "step": 3640 + }, + { + "epoch": 1.5706275609230107, + "grad_norm": 0.1909743845462799, + "learning_rate": 7.029343588787499e-05, + "loss": 2.1412, + "step": 3641 + }, + { + "epoch": 1.5710588742721587, + "grad_norm": 0.1718597561120987, + "learning_rate": 7.025937568016511e-05, + "loss": 2.0683, + "step": 3642 + }, + { + "epoch": 1.5714901876213068, + "grad_norm": 0.2004898637533188, + "learning_rate": 7.022531645405332e-05, + "loss": 2.1873, + "step": 3643 + }, + { + "epoch": 1.571921500970455, + "grad_norm": 0.18730789422988892, + "learning_rate": 7.0191258216592e-05, + "loss": 2.1092, + "step": 3644 + }, + { + "epoch": 1.5723528143196033, + "grad_norm": 0.17451775074005127, + "learning_rate": 7.015720097483326e-05, + "loss": 2.2293, + "step": 3645 + }, + { + "epoch": 1.5727841276687513, + "grad_norm": 0.17554758489131927, + "learning_rate": 7.0123144735829e-05, + "loss": 2.0978, + "step": 3646 + }, + { + "epoch": 1.5732154410178993, + "grad_norm": 0.18201641738414764, + "learning_rate": 7.008908950663095e-05, + "loss": 2.0228, + "step": 3647 + }, + { + "epoch": 1.5736467543670476, + "grad_norm": 0.18830415606498718, + "learning_rate": 7.005503529429064e-05, + "loss": 2.0015, + "step": 3648 + }, + { + "epoch": 1.5740780677161959, + "grad_norm": 0.1909087747335434, + "learning_rate": 7.002098210585932e-05, + "loss": 2.0628, + "step": 3649 + }, + { + "epoch": 1.574509381065344, + "grad_norm": 0.18361161649227142, + "learning_rate": 6.998692994838812e-05, + "loss": 1.994, + "step": 3650 + }, + { + "epoch": 1.574509381065344, + "eval_loss": 2.093738317489624, + "eval_runtime": 200.716, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 3650 + }, + { + "epoch": 1.5749406944144921, + "grad_norm": 0.19482986629009247, + "learning_rate": 6.995287882892791e-05, + "loss": 2.1545, + "step": 3651 + }, + { + "epoch": 1.5753720077636402, + "grad_norm": 0.1884821504354477, + "learning_rate": 6.991882875452932e-05, + "loss": 2.1563, + "step": 3652 + }, + { + "epoch": 1.5758033211127884, + "grad_norm": 0.1934521347284317, + "learning_rate": 6.988477973224281e-05, + "loss": 2.2272, + "step": 3653 + }, + { + "epoch": 1.5762346344619367, + "grad_norm": 0.19240306317806244, + "learning_rate": 6.985073176911858e-05, + "loss": 2.0432, + "step": 3654 + }, + { + "epoch": 1.5766659478110847, + "grad_norm": 0.20495979487895966, + "learning_rate": 6.981668487220666e-05, + "loss": 2.3219, + "step": 3655 + }, + { + "epoch": 1.5770972611602327, + "grad_norm": 0.19982969760894775, + "learning_rate": 6.978263904855683e-05, + "loss": 2.2596, + "step": 3656 + }, + { + "epoch": 1.577528574509381, + "grad_norm": 0.1917852759361267, + "learning_rate": 6.97485943052186e-05, + "loss": 2.1474, + "step": 3657 + }, + { + "epoch": 1.5779598878585293, + "grad_norm": 0.1615539789199829, + "learning_rate": 6.97145506492414e-05, + "loss": 1.9549, + "step": 3658 + }, + { + "epoch": 1.5783912012076775, + "grad_norm": 0.19644218683242798, + "learning_rate": 6.968050808767427e-05, + "loss": 2.1014, + "step": 3659 + }, + { + "epoch": 1.5788225145568255, + "grad_norm": 0.1665312498807907, + "learning_rate": 6.964646662756611e-05, + "loss": 1.6771, + "step": 3660 + }, + { + "epoch": 1.5792538279059736, + "grad_norm": 0.18535394966602325, + "learning_rate": 6.961242627596557e-05, + "loss": 2.1066, + "step": 3661 + }, + { + "epoch": 1.5796851412551218, + "grad_norm": 0.2052871584892273, + "learning_rate": 6.95783870399211e-05, + "loss": 1.9451, + "step": 3662 + }, + { + "epoch": 1.58011645460427, + "grad_norm": 0.18887032568454742, + "learning_rate": 6.95443489264809e-05, + "loss": 1.8664, + "step": 3663 + }, + { + "epoch": 1.5805477679534181, + "grad_norm": 0.17410443723201752, + "learning_rate": 6.951031194269287e-05, + "loss": 1.9715, + "step": 3664 + }, + { + "epoch": 1.5809790813025664, + "grad_norm": 0.17642557621002197, + "learning_rate": 6.947627609560483e-05, + "loss": 2.1207, + "step": 3665 + }, + { + "epoch": 1.5814103946517144, + "grad_norm": 0.18183539807796478, + "learning_rate": 6.944224139226421e-05, + "loss": 2.0773, + "step": 3666 + }, + { + "epoch": 1.5818417080008627, + "grad_norm": 0.18124890327453613, + "learning_rate": 6.94082078397183e-05, + "loss": 2.489, + "step": 3667 + }, + { + "epoch": 1.582273021350011, + "grad_norm": 0.21090415120124817, + "learning_rate": 6.937417544501408e-05, + "loss": 2.1516, + "step": 3668 + }, + { + "epoch": 1.582704334699159, + "grad_norm": 0.21677644550800323, + "learning_rate": 6.934014421519839e-05, + "loss": 2.1173, + "step": 3669 + }, + { + "epoch": 1.583135648048307, + "grad_norm": 0.1930365264415741, + "learning_rate": 6.930611415731774e-05, + "loss": 2.0612, + "step": 3670 + }, + { + "epoch": 1.5835669613974552, + "grad_norm": 0.1924310177564621, + "learning_rate": 6.927208527841841e-05, + "loss": 2.0412, + "step": 3671 + }, + { + "epoch": 1.5839982747466035, + "grad_norm": 0.16797201335430145, + "learning_rate": 6.923805758554644e-05, + "loss": 2.1477, + "step": 3672 + }, + { + "epoch": 1.5844295880957515, + "grad_norm": 0.18496067821979523, + "learning_rate": 6.920403108574767e-05, + "loss": 2.0006, + "step": 3673 + }, + { + "epoch": 1.5848609014448998, + "grad_norm": 0.19043207168579102, + "learning_rate": 6.917000578606766e-05, + "loss": 2.2337, + "step": 3674 + }, + { + "epoch": 1.5852922147940478, + "grad_norm": 0.1813650131225586, + "learning_rate": 6.91359816935517e-05, + "loss": 2.2809, + "step": 3675 + }, + { + "epoch": 1.5852922147940478, + "eval_loss": 2.093628406524658, + "eval_runtime": 203.0369, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 3675 + }, + { + "epoch": 1.585723528143196, + "grad_norm": 0.19538749754428864, + "learning_rate": 6.910195881524485e-05, + "loss": 2.1929, + "step": 3676 + }, + { + "epoch": 1.5861548414923443, + "grad_norm": 0.20857664942741394, + "learning_rate": 6.906793715819195e-05, + "loss": 2.0994, + "step": 3677 + }, + { + "epoch": 1.5865861548414923, + "grad_norm": 0.16829629242420197, + "learning_rate": 6.903391672943752e-05, + "loss": 2.1483, + "step": 3678 + }, + { + "epoch": 1.5870174681906404, + "grad_norm": 0.1961994618177414, + "learning_rate": 6.899989753602587e-05, + "loss": 2.1908, + "step": 3679 + }, + { + "epoch": 1.5874487815397886, + "grad_norm": 0.1927735060453415, + "learning_rate": 6.896587958500105e-05, + "loss": 2.154, + "step": 3680 + }, + { + "epoch": 1.5878800948889369, + "grad_norm": 0.17948473989963531, + "learning_rate": 6.893186288340687e-05, + "loss": 1.9965, + "step": 3681 + }, + { + "epoch": 1.588311408238085, + "grad_norm": 0.1843881756067276, + "learning_rate": 6.88978474382868e-05, + "loss": 2.0506, + "step": 3682 + }, + { + "epoch": 1.5887427215872332, + "grad_norm": 0.16958780586719513, + "learning_rate": 6.886383325668417e-05, + "loss": 2.0875, + "step": 3683 + }, + { + "epoch": 1.5891740349363812, + "grad_norm": 0.17877309024333954, + "learning_rate": 6.8829820345642e-05, + "loss": 2.1458, + "step": 3684 + }, + { + "epoch": 1.5896053482855295, + "grad_norm": 0.17716002464294434, + "learning_rate": 6.879580871220298e-05, + "loss": 2.167, + "step": 3685 + }, + { + "epoch": 1.5900366616346777, + "grad_norm": 0.18612174689769745, + "learning_rate": 6.876179836340959e-05, + "loss": 2.0139, + "step": 3686 + }, + { + "epoch": 1.5904679749838257, + "grad_norm": 0.20165590941905975, + "learning_rate": 6.87277893063041e-05, + "loss": 2.1366, + "step": 3687 + }, + { + "epoch": 1.5908992883329738, + "grad_norm": 0.19413705170154572, + "learning_rate": 6.869378154792841e-05, + "loss": 2.0338, + "step": 3688 + }, + { + "epoch": 1.591330601682122, + "grad_norm": 0.18128186464309692, + "learning_rate": 6.865977509532421e-05, + "loss": 2.1962, + "step": 3689 + }, + { + "epoch": 1.5917619150312703, + "grad_norm": 0.20013929903507233, + "learning_rate": 6.862576995553294e-05, + "loss": 2.0225, + "step": 3690 + }, + { + "epoch": 1.5921932283804183, + "grad_norm": 0.19913722574710846, + "learning_rate": 6.85917661355957e-05, + "loss": 2.1976, + "step": 3691 + }, + { + "epoch": 1.5926245417295666, + "grad_norm": 0.16997520625591278, + "learning_rate": 6.855776364255334e-05, + "loss": 2.1994, + "step": 3692 + }, + { + "epoch": 1.5930558550787146, + "grad_norm": 0.17715983092784882, + "learning_rate": 6.852376248344648e-05, + "loss": 2.0439, + "step": 3693 + }, + { + "epoch": 1.5934871684278629, + "grad_norm": 0.19184236228466034, + "learning_rate": 6.848976266531541e-05, + "loss": 2.2112, + "step": 3694 + }, + { + "epoch": 1.5939184817770111, + "grad_norm": 0.20436334609985352, + "learning_rate": 6.845576419520021e-05, + "loss": 1.9827, + "step": 3695 + }, + { + "epoch": 1.5943497951261592, + "grad_norm": 0.18236425518989563, + "learning_rate": 6.842176708014054e-05, + "loss": 2.1338, + "step": 3696 + }, + { + "epoch": 1.5947811084753072, + "grad_norm": 0.18234539031982422, + "learning_rate": 6.838777132717599e-05, + "loss": 2.2777, + "step": 3697 + }, + { + "epoch": 1.5952124218244554, + "grad_norm": 0.1843755543231964, + "learning_rate": 6.835377694334568e-05, + "loss": 2.1487, + "step": 3698 + }, + { + "epoch": 1.5956437351736037, + "grad_norm": 0.17670325934886932, + "learning_rate": 6.831978393568853e-05, + "loss": 2.1982, + "step": 3699 + }, + { + "epoch": 1.5960750485227517, + "grad_norm": 0.19596204161643982, + "learning_rate": 6.828579231124316e-05, + "loss": 2.1467, + "step": 3700 + }, + { + "epoch": 1.5960750485227517, + "eval_loss": 2.093451976776123, + "eval_runtime": 201.8392, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 3700 + }, + { + "epoch": 1.5965063618719, + "grad_norm": 0.18570303916931152, + "learning_rate": 6.825180207704793e-05, + "loss": 1.9782, + "step": 3701 + }, + { + "epoch": 1.596937675221048, + "grad_norm": 0.2120261937379837, + "learning_rate": 6.82178132401409e-05, + "loss": 2.0996, + "step": 3702 + }, + { + "epoch": 1.5973689885701963, + "grad_norm": 0.20858530700206757, + "learning_rate": 6.818382580755975e-05, + "loss": 2.1682, + "step": 3703 + }, + { + "epoch": 1.5978003019193445, + "grad_norm": 0.1919013112783432, + "learning_rate": 6.814983978634206e-05, + "loss": 2.0944, + "step": 3704 + }, + { + "epoch": 1.5982316152684926, + "grad_norm": 0.1854456216096878, + "learning_rate": 6.811585518352493e-05, + "loss": 2.2461, + "step": 3705 + }, + { + "epoch": 1.5986629286176406, + "grad_norm": 0.1700373888015747, + "learning_rate": 6.808187200614528e-05, + "loss": 1.8029, + "step": 3706 + }, + { + "epoch": 1.5990942419667888, + "grad_norm": 0.2353130578994751, + "learning_rate": 6.804789026123967e-05, + "loss": 2.0473, + "step": 3707 + }, + { + "epoch": 1.599525555315937, + "grad_norm": 0.18630927801132202, + "learning_rate": 6.801390995584441e-05, + "loss": 2.3925, + "step": 3708 + }, + { + "epoch": 1.5999568686650851, + "grad_norm": 0.2129543125629425, + "learning_rate": 6.797993109699552e-05, + "loss": 2.2899, + "step": 3709 + }, + { + "epoch": 1.6003881820142334, + "grad_norm": 0.18666571378707886, + "learning_rate": 6.794595369172862e-05, + "loss": 1.9599, + "step": 3710 + }, + { + "epoch": 1.6008194953633814, + "grad_norm": 0.16544368863105774, + "learning_rate": 6.791197774707919e-05, + "loss": 2.0883, + "step": 3711 + }, + { + "epoch": 1.6012508087125297, + "grad_norm": 0.18772681057453156, + "learning_rate": 6.787800327008227e-05, + "loss": 2.3356, + "step": 3712 + }, + { + "epoch": 1.601682122061678, + "grad_norm": 0.19081822037696838, + "learning_rate": 6.784403026777266e-05, + "loss": 2.0163, + "step": 3713 + }, + { + "epoch": 1.602113435410826, + "grad_norm": 0.17879170179367065, + "learning_rate": 6.781005874718481e-05, + "loss": 2.2735, + "step": 3714 + }, + { + "epoch": 1.602544748759974, + "grad_norm": 0.19571247696876526, + "learning_rate": 6.777608871535296e-05, + "loss": 2.1857, + "step": 3715 + }, + { + "epoch": 1.6029760621091222, + "grad_norm": 0.19109483063220978, + "learning_rate": 6.774212017931095e-05, + "loss": 2.1034, + "step": 3716 + }, + { + "epoch": 1.6034073754582705, + "grad_norm": 0.1833193004131317, + "learning_rate": 6.770815314609228e-05, + "loss": 2.0422, + "step": 3717 + }, + { + "epoch": 1.6038386888074188, + "grad_norm": 0.1861751526594162, + "learning_rate": 6.767418762273028e-05, + "loss": 2.1788, + "step": 3718 + }, + { + "epoch": 1.6042700021565668, + "grad_norm": 0.2010440081357956, + "learning_rate": 6.764022361625784e-05, + "loss": 2.1818, + "step": 3719 + }, + { + "epoch": 1.6047013155057148, + "grad_norm": 0.20387910306453705, + "learning_rate": 6.76062611337076e-05, + "loss": 2.0423, + "step": 3720 + }, + { + "epoch": 1.605132628854863, + "grad_norm": 0.19214536249637604, + "learning_rate": 6.75723001821118e-05, + "loss": 2.2937, + "step": 3721 + }, + { + "epoch": 1.6055639422040113, + "grad_norm": 0.1803417056798935, + "learning_rate": 6.753834076850252e-05, + "loss": 2.038, + "step": 3722 + }, + { + "epoch": 1.6059952555531594, + "grad_norm": 0.19501323997974396, + "learning_rate": 6.750438289991139e-05, + "loss": 2.2286, + "step": 3723 + }, + { + "epoch": 1.6064265689023074, + "grad_norm": 0.177200585603714, + "learning_rate": 6.747042658336972e-05, + "loss": 2.261, + "step": 3724 + }, + { + "epoch": 1.6068578822514556, + "grad_norm": 0.19286774098873138, + "learning_rate": 6.743647182590857e-05, + "loss": 2.1549, + "step": 3725 + }, + { + "epoch": 1.6068578822514556, + "eval_loss": 2.0928478240966797, + "eval_runtime": 201.4313, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 3725 + }, + { + "epoch": 1.607289195600604, + "grad_norm": 0.2139384150505066, + "learning_rate": 6.740251863455866e-05, + "loss": 2.2656, + "step": 3726 + }, + { + "epoch": 1.6077205089497522, + "grad_norm": 0.184177428483963, + "learning_rate": 6.736856701635033e-05, + "loss": 2.3855, + "step": 3727 + }, + { + "epoch": 1.6081518222989002, + "grad_norm": 0.18360666930675507, + "learning_rate": 6.733461697831366e-05, + "loss": 2.1589, + "step": 3728 + }, + { + "epoch": 1.6085831356480482, + "grad_norm": 0.9161972403526306, + "learning_rate": 6.73006685274784e-05, + "loss": 2.1925, + "step": 3729 + }, + { + "epoch": 1.6090144489971965, + "grad_norm": 0.19454066455364227, + "learning_rate": 6.72667216708739e-05, + "loss": 2.0233, + "step": 3730 + }, + { + "epoch": 1.6094457623463447, + "grad_norm": 0.18415388464927673, + "learning_rate": 6.723277641552924e-05, + "loss": 2.0935, + "step": 3731 + }, + { + "epoch": 1.6098770756954928, + "grad_norm": 0.19692564010620117, + "learning_rate": 6.719883276847316e-05, + "loss": 2.2279, + "step": 3732 + }, + { + "epoch": 1.6103083890446408, + "grad_norm": 0.21135157346725464, + "learning_rate": 6.716489073673408e-05, + "loss": 2.1733, + "step": 3733 + }, + { + "epoch": 1.610739702393789, + "grad_norm": 0.2029101848602295, + "learning_rate": 6.713095032734007e-05, + "loss": 2.1401, + "step": 3734 + }, + { + "epoch": 1.6111710157429373, + "grad_norm": 0.17743490636348724, + "learning_rate": 6.70970115473188e-05, + "loss": 1.9196, + "step": 3735 + }, + { + "epoch": 1.6116023290920856, + "grad_norm": 0.18146555125713348, + "learning_rate": 6.706307440369778e-05, + "loss": 2.3626, + "step": 3736 + }, + { + "epoch": 1.6120336424412336, + "grad_norm": 0.1897459179162979, + "learning_rate": 6.702913890350396e-05, + "loss": 2.4544, + "step": 3737 + }, + { + "epoch": 1.6124649557903816, + "grad_norm": 0.17637895047664642, + "learning_rate": 6.699520505376411e-05, + "loss": 2.1949, + "step": 3738 + }, + { + "epoch": 1.6128962691395299, + "grad_norm": 0.19048604369163513, + "learning_rate": 6.696127286150458e-05, + "loss": 2.1958, + "step": 3739 + }, + { + "epoch": 1.6133275824886781, + "grad_norm": 0.1806829422712326, + "learning_rate": 6.692734233375143e-05, + "loss": 1.9665, + "step": 3740 + }, + { + "epoch": 1.6137588958378262, + "grad_norm": 0.17890702188014984, + "learning_rate": 6.689341347753034e-05, + "loss": 2.1628, + "step": 3741 + }, + { + "epoch": 1.6141902091869742, + "grad_norm": 0.21568112075328827, + "learning_rate": 6.685948629986659e-05, + "loss": 2.2335, + "step": 3742 + }, + { + "epoch": 1.6146215225361225, + "grad_norm": 0.19446949660778046, + "learning_rate": 6.682556080778529e-05, + "loss": 2.008, + "step": 3743 + }, + { + "epoch": 1.6150528358852707, + "grad_norm": 0.18394096195697784, + "learning_rate": 6.679163700831099e-05, + "loss": 2.1141, + "step": 3744 + }, + { + "epoch": 1.615484149234419, + "grad_norm": 0.19576245546340942, + "learning_rate": 6.675771490846802e-05, + "loss": 2.2772, + "step": 3745 + }, + { + "epoch": 1.615915462583567, + "grad_norm": 0.18927359580993652, + "learning_rate": 6.672379451528029e-05, + "loss": 2.2442, + "step": 3746 + }, + { + "epoch": 1.616346775932715, + "grad_norm": 0.19549596309661865, + "learning_rate": 6.668987583577143e-05, + "loss": 2.1024, + "step": 3747 + }, + { + "epoch": 1.6167780892818633, + "grad_norm": 0.18620263040065765, + "learning_rate": 6.665595887696468e-05, + "loss": 2.1882, + "step": 3748 + }, + { + "epoch": 1.6172094026310115, + "grad_norm": 0.19507049024105072, + "learning_rate": 6.662204364588283e-05, + "loss": 2.1049, + "step": 3749 + }, + { + "epoch": 1.6176407159801596, + "grad_norm": 0.20146001875400543, + "learning_rate": 6.658813014954852e-05, + "loss": 2.2832, + "step": 3750 + }, + { + "epoch": 1.6176407159801596, + "eval_loss": 2.09356689453125, + "eval_runtime": 201.387, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 3750 + }, + { + "epoch": 1.6180720293293076, + "grad_norm": 0.22598698735237122, + "learning_rate": 6.655421839498385e-05, + "loss": 1.9081, + "step": 3751 + }, + { + "epoch": 1.6185033426784559, + "grad_norm": 0.1829482764005661, + "learning_rate": 6.65203083892106e-05, + "loss": 2.0644, + "step": 3752 + }, + { + "epoch": 1.6189346560276041, + "grad_norm": 0.20377013087272644, + "learning_rate": 6.648640013925023e-05, + "loss": 2.233, + "step": 3753 + }, + { + "epoch": 1.6193659693767524, + "grad_norm": 0.19465422630310059, + "learning_rate": 6.645249365212382e-05, + "loss": 2.3635, + "step": 3754 + }, + { + "epoch": 1.6197972827259004, + "grad_norm": 0.18360711634159088, + "learning_rate": 6.64185889348521e-05, + "loss": 2.2299, + "step": 3755 + }, + { + "epoch": 1.6202285960750484, + "grad_norm": 0.19367410242557526, + "learning_rate": 6.638468599445533e-05, + "loss": 2.1938, + "step": 3756 + }, + { + "epoch": 1.6206599094241967, + "grad_norm": 0.1897435337305069, + "learning_rate": 6.635078483795359e-05, + "loss": 1.9657, + "step": 3757 + }, + { + "epoch": 1.621091222773345, + "grad_norm": 0.18628539144992828, + "learning_rate": 6.631688547236642e-05, + "loss": 2.2478, + "step": 3758 + }, + { + "epoch": 1.621522536122493, + "grad_norm": 0.17115898430347443, + "learning_rate": 6.628298790471307e-05, + "loss": 2.3662, + "step": 3759 + }, + { + "epoch": 1.621953849471641, + "grad_norm": 0.17931334674358368, + "learning_rate": 6.624909214201239e-05, + "loss": 2.146, + "step": 3760 + }, + { + "epoch": 1.6223851628207893, + "grad_norm": 0.22137954831123352, + "learning_rate": 6.621519819128289e-05, + "loss": 2.087, + "step": 3761 + }, + { + "epoch": 1.6228164761699375, + "grad_norm": 0.17036446928977966, + "learning_rate": 6.61813060595427e-05, + "loss": 2.0117, + "step": 3762 + }, + { + "epoch": 1.6232477895190858, + "grad_norm": 0.17956706881523132, + "learning_rate": 6.614741575380946e-05, + "loss": 2.0999, + "step": 3763 + }, + { + "epoch": 1.6236791028682338, + "grad_norm": 0.19675466418266296, + "learning_rate": 6.611352728110067e-05, + "loss": 2.1033, + "step": 3764 + }, + { + "epoch": 1.6241104162173818, + "grad_norm": 0.20353537797927856, + "learning_rate": 6.60796406484332e-05, + "loss": 2.1655, + "step": 3765 + }, + { + "epoch": 1.62454172956653, + "grad_norm": 0.21810005605220795, + "learning_rate": 6.60457558628237e-05, + "loss": 2.2077, + "step": 3766 + }, + { + "epoch": 1.6249730429156783, + "grad_norm": 0.18552573025226593, + "learning_rate": 6.601187293128836e-05, + "loss": 2.2549, + "step": 3767 + }, + { + "epoch": 1.6254043562648264, + "grad_norm": 0.20179718732833862, + "learning_rate": 6.597799186084302e-05, + "loss": 2.1223, + "step": 3768 + }, + { + "epoch": 1.6258356696139744, + "grad_norm": 0.18860100209712982, + "learning_rate": 6.594411265850315e-05, + "loss": 2.3134, + "step": 3769 + }, + { + "epoch": 1.6262669829631227, + "grad_norm": 0.18685784935951233, + "learning_rate": 6.591023533128378e-05, + "loss": 2.1947, + "step": 3770 + }, + { + "epoch": 1.626698296312271, + "grad_norm": 0.20574668049812317, + "learning_rate": 6.587635988619957e-05, + "loss": 2.1871, + "step": 3771 + }, + { + "epoch": 1.6271296096614192, + "grad_norm": 0.1886902153491974, + "learning_rate": 6.584248633026483e-05, + "loss": 2.0815, + "step": 3772 + }, + { + "epoch": 1.6275609230105672, + "grad_norm": 0.19360224902629852, + "learning_rate": 6.580861467049345e-05, + "loss": 2.2357, + "step": 3773 + }, + { + "epoch": 1.6279922363597152, + "grad_norm": 0.17270958423614502, + "learning_rate": 6.57747449138989e-05, + "loss": 2.201, + "step": 3774 + }, + { + "epoch": 1.6284235497088635, + "grad_norm": 0.18650749325752258, + "learning_rate": 6.574087706749432e-05, + "loss": 2.0524, + "step": 3775 + }, + { + "epoch": 1.6284235497088635, + "eval_loss": 2.093459367752075, + "eval_runtime": 202.3645, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 3775 + }, + { + "epoch": 1.6288548630580117, + "grad_norm": 0.19357305765151978, + "learning_rate": 6.57070111382924e-05, + "loss": 2.1072, + "step": 3776 + }, + { + "epoch": 1.6292861764071598, + "grad_norm": 0.17448721826076508, + "learning_rate": 6.567314713330544e-05, + "loss": 1.9925, + "step": 3777 + }, + { + "epoch": 1.6297174897563078, + "grad_norm": 0.176400825381279, + "learning_rate": 6.563928505954534e-05, + "loss": 2.1545, + "step": 3778 + }, + { + "epoch": 1.630148803105456, + "grad_norm": 0.1928510069847107, + "learning_rate": 6.560542492402367e-05, + "loss": 2.1201, + "step": 3779 + }, + { + "epoch": 1.6305801164546043, + "grad_norm": 0.19245097041130066, + "learning_rate": 6.557156673375151e-05, + "loss": 2.2759, + "step": 3780 + }, + { + "epoch": 1.6310114298037526, + "grad_norm": 0.200565367937088, + "learning_rate": 6.553771049573952e-05, + "loss": 2.1643, + "step": 3781 + }, + { + "epoch": 1.6314427431529006, + "grad_norm": 0.1743149757385254, + "learning_rate": 6.55038562169981e-05, + "loss": 2.2006, + "step": 3782 + }, + { + "epoch": 1.6318740565020486, + "grad_norm": 0.18234965205192566, + "learning_rate": 6.547000390453708e-05, + "loss": 2.3149, + "step": 3783 + }, + { + "epoch": 1.632305369851197, + "grad_norm": 0.19367872178554535, + "learning_rate": 6.543615356536596e-05, + "loss": 2.2829, + "step": 3784 + }, + { + "epoch": 1.6327366832003452, + "grad_norm": 0.19692032039165497, + "learning_rate": 6.540230520649383e-05, + "loss": 2.2099, + "step": 3785 + }, + { + "epoch": 1.6331679965494932, + "grad_norm": 0.193384051322937, + "learning_rate": 6.536845883492939e-05, + "loss": 2.4338, + "step": 3786 + }, + { + "epoch": 1.6335993098986412, + "grad_norm": 0.1894853711128235, + "learning_rate": 6.533461445768087e-05, + "loss": 2.0901, + "step": 3787 + }, + { + "epoch": 1.6340306232477895, + "grad_norm": 0.1653389036655426, + "learning_rate": 6.530077208175609e-05, + "loss": 1.8634, + "step": 3788 + }, + { + "epoch": 1.6344619365969377, + "grad_norm": 0.1730147749185562, + "learning_rate": 6.526693171416258e-05, + "loss": 2.1809, + "step": 3789 + }, + { + "epoch": 1.634893249946086, + "grad_norm": 0.18234191834926605, + "learning_rate": 6.523309336190727e-05, + "loss": 2.1617, + "step": 3790 + }, + { + "epoch": 1.635324563295234, + "grad_norm": 0.19211749732494354, + "learning_rate": 6.519925703199678e-05, + "loss": 2.1767, + "step": 3791 + }, + { + "epoch": 1.635755876644382, + "grad_norm": 0.2109699696302414, + "learning_rate": 6.516542273143729e-05, + "loss": 2.3465, + "step": 3792 + }, + { + "epoch": 1.6361871899935303, + "grad_norm": 0.18605192005634308, + "learning_rate": 6.513159046723459e-05, + "loss": 2.1536, + "step": 3793 + }, + { + "epoch": 1.6366185033426786, + "grad_norm": 0.1837363839149475, + "learning_rate": 6.509776024639403e-05, + "loss": 2.1865, + "step": 3794 + }, + { + "epoch": 1.6370498166918266, + "grad_norm": 0.18559573590755463, + "learning_rate": 6.506393207592043e-05, + "loss": 2.0815, + "step": 3795 + }, + { + "epoch": 1.6374811300409746, + "grad_norm": 0.1857297122478485, + "learning_rate": 6.503010596281841e-05, + "loss": 2.2236, + "step": 3796 + }, + { + "epoch": 1.6379124433901229, + "grad_norm": 0.20153896510601044, + "learning_rate": 6.499628191409196e-05, + "loss": 2.2147, + "step": 3797 + }, + { + "epoch": 1.6383437567392711, + "grad_norm": 0.1797814816236496, + "learning_rate": 6.496245993674476e-05, + "loss": 2.1808, + "step": 3798 + }, + { + "epoch": 1.6387750700884194, + "grad_norm": 0.18175220489501953, + "learning_rate": 6.492864003777996e-05, + "loss": 2.2005, + "step": 3799 + }, + { + "epoch": 1.6392063834375674, + "grad_norm": 0.20047171413898468, + "learning_rate": 6.489482222420039e-05, + "loss": 2.1684, + "step": 3800 + }, + { + "epoch": 1.6392063834375674, + "eval_loss": 2.092963218688965, + "eval_runtime": 196.5623, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 3800 + }, + { + "epoch": 1.6396376967867154, + "grad_norm": 0.22544299066066742, + "learning_rate": 6.486100650300842e-05, + "loss": 2.0748, + "step": 3801 + }, + { + "epoch": 1.6400690101358637, + "grad_norm": 0.19890649616718292, + "learning_rate": 6.482719288120587e-05, + "loss": 2.1556, + "step": 3802 + }, + { + "epoch": 1.640500323485012, + "grad_norm": 0.18018411099910736, + "learning_rate": 6.479338136579435e-05, + "loss": 2.1624, + "step": 3803 + }, + { + "epoch": 1.64093163683416, + "grad_norm": 0.19493593275547028, + "learning_rate": 6.47595719637748e-05, + "loss": 2.2055, + "step": 3804 + }, + { + "epoch": 1.641362950183308, + "grad_norm": 0.1730249971151352, + "learning_rate": 6.472576468214787e-05, + "loss": 1.8869, + "step": 3805 + }, + { + "epoch": 1.6417942635324563, + "grad_norm": 0.18684785068035126, + "learning_rate": 6.46919595279137e-05, + "loss": 2.2683, + "step": 3806 + }, + { + "epoch": 1.6422255768816045, + "grad_norm": 0.18305426836013794, + "learning_rate": 6.465815650807206e-05, + "loss": 2.3451, + "step": 3807 + }, + { + "epoch": 1.6426568902307528, + "grad_norm": 0.23286522924900055, + "learning_rate": 6.462435562962222e-05, + "loss": 2.1248, + "step": 3808 + }, + { + "epoch": 1.6430882035799008, + "grad_norm": 0.1933186948299408, + "learning_rate": 6.4590556899563e-05, + "loss": 2.0885, + "step": 3809 + }, + { + "epoch": 1.6435195169290489, + "grad_norm": 0.16845665872097015, + "learning_rate": 6.45567603248928e-05, + "loss": 1.8661, + "step": 3810 + }, + { + "epoch": 1.643950830278197, + "grad_norm": 0.19671374559402466, + "learning_rate": 6.45229659126096e-05, + "loss": 1.7578, + "step": 3811 + }, + { + "epoch": 1.6443821436273454, + "grad_norm": 0.19340135157108307, + "learning_rate": 6.448917366971085e-05, + "loss": 2.192, + "step": 3812 + }, + { + "epoch": 1.6448134569764934, + "grad_norm": 0.18395887315273285, + "learning_rate": 6.445538360319366e-05, + "loss": 2.2982, + "step": 3813 + }, + { + "epoch": 1.6452447703256414, + "grad_norm": 0.2056943029165268, + "learning_rate": 6.442159572005462e-05, + "loss": 2.2493, + "step": 3814 + }, + { + "epoch": 1.6456760836747897, + "grad_norm": 0.20504404604434967, + "learning_rate": 6.438781002728985e-05, + "loss": 2.1617, + "step": 3815 + }, + { + "epoch": 1.646107397023938, + "grad_norm": 0.18038319051265717, + "learning_rate": 6.435402653189509e-05, + "loss": 2.1422, + "step": 3816 + }, + { + "epoch": 1.6465387103730862, + "grad_norm": 0.18079359829425812, + "learning_rate": 6.432024524086552e-05, + "loss": 2.1337, + "step": 3817 + }, + { + "epoch": 1.6469700237222342, + "grad_norm": 0.18782946467399597, + "learning_rate": 6.428646616119599e-05, + "loss": 1.9715, + "step": 3818 + }, + { + "epoch": 1.6474013370713823, + "grad_norm": 0.1954319030046463, + "learning_rate": 6.425268929988081e-05, + "loss": 2.3628, + "step": 3819 + }, + { + "epoch": 1.6478326504205305, + "grad_norm": 0.23721271753311157, + "learning_rate": 6.421891466391381e-05, + "loss": 2.0577, + "step": 3820 + }, + { + "epoch": 1.6482639637696788, + "grad_norm": 0.1829412579536438, + "learning_rate": 6.418514226028847e-05, + "loss": 2.3678, + "step": 3821 + }, + { + "epoch": 1.6486952771188268, + "grad_norm": 0.2008577436208725, + "learning_rate": 6.415137209599767e-05, + "loss": 2.1813, + "step": 3822 + }, + { + "epoch": 1.6491265904679748, + "grad_norm": 0.1968059092760086, + "learning_rate": 6.411760417803392e-05, + "loss": 2.2353, + "step": 3823 + }, + { + "epoch": 1.649557903817123, + "grad_norm": 0.18984900414943695, + "learning_rate": 6.408383851338922e-05, + "loss": 2.1367, + "step": 3824 + }, + { + "epoch": 1.6499892171662713, + "grad_norm": 0.2056960016489029, + "learning_rate": 6.405007510905515e-05, + "loss": 2.2562, + "step": 3825 + }, + { + "epoch": 1.6499892171662713, + "eval_loss": 2.092493772506714, + "eval_runtime": 194.8674, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3825 + }, + { + "epoch": 1.6504205305154196, + "grad_norm": 0.1952376812696457, + "learning_rate": 6.401631397202277e-05, + "loss": 2.2311, + "step": 3826 + }, + { + "epoch": 1.6508518438645676, + "grad_norm": 0.17466172575950623, + "learning_rate": 6.398255510928268e-05, + "loss": 2.0367, + "step": 3827 + }, + { + "epoch": 1.6512831572137157, + "grad_norm": 0.3534952998161316, + "learning_rate": 6.394879852782507e-05, + "loss": 2.0924, + "step": 3828 + }, + { + "epoch": 1.651714470562864, + "grad_norm": 0.25892743468284607, + "learning_rate": 6.391504423463957e-05, + "loss": 2.1704, + "step": 3829 + }, + { + "epoch": 1.6521457839120122, + "grad_norm": 0.1772749274969101, + "learning_rate": 6.388129223671538e-05, + "loss": 2.1152, + "step": 3830 + }, + { + "epoch": 1.6525770972611602, + "grad_norm": 0.25078848004341125, + "learning_rate": 6.384754254104121e-05, + "loss": 2.1433, + "step": 3831 + }, + { + "epoch": 1.6530084106103082, + "grad_norm": 0.17856615781784058, + "learning_rate": 6.381379515460533e-05, + "loss": 2.166, + "step": 3832 + }, + { + "epoch": 1.6534397239594565, + "grad_norm": 0.17271089553833008, + "learning_rate": 6.378005008439552e-05, + "loss": 2.1329, + "step": 3833 + }, + { + "epoch": 1.6538710373086047, + "grad_norm": 0.18587853014469147, + "learning_rate": 6.374630733739898e-05, + "loss": 2.1766, + "step": 3834 + }, + { + "epoch": 1.654302350657753, + "grad_norm": 0.19487249851226807, + "learning_rate": 6.371256692060263e-05, + "loss": 2.2771, + "step": 3835 + }, + { + "epoch": 1.654733664006901, + "grad_norm": 0.20743614435195923, + "learning_rate": 6.367882884099272e-05, + "loss": 2.1246, + "step": 3836 + }, + { + "epoch": 1.655164977356049, + "grad_norm": 0.18915899097919464, + "learning_rate": 6.364509310555509e-05, + "loss": 2.35, + "step": 3837 + }, + { + "epoch": 1.6555962907051973, + "grad_norm": 0.176448255777359, + "learning_rate": 6.361135972127512e-05, + "loss": 2.1189, + "step": 3838 + }, + { + "epoch": 1.6560276040543456, + "grad_norm": 0.18612118065357208, + "learning_rate": 6.357762869513766e-05, + "loss": 2.1382, + "step": 3839 + }, + { + "epoch": 1.6564589174034936, + "grad_norm": 0.17284585535526276, + "learning_rate": 6.35439000341271e-05, + "loss": 2.1476, + "step": 3840 + }, + { + "epoch": 1.6568902307526416, + "grad_norm": 0.17270182073116302, + "learning_rate": 6.351017374522729e-05, + "loss": 1.9631, + "step": 3841 + }, + { + "epoch": 1.65732154410179, + "grad_norm": 0.19759193062782288, + "learning_rate": 6.34764498354217e-05, + "loss": 2.0957, + "step": 3842 + }, + { + "epoch": 1.6577528574509381, + "grad_norm": 0.2020193189382553, + "learning_rate": 6.344272831169317e-05, + "loss": 2.1451, + "step": 3843 + }, + { + "epoch": 1.6581841708000864, + "grad_norm": 0.18171140551567078, + "learning_rate": 6.340900918102413e-05, + "loss": 2.0582, + "step": 3844 + }, + { + "epoch": 1.6586154841492344, + "grad_norm": 0.18665428459644318, + "learning_rate": 6.33752924503965e-05, + "loss": 2.0344, + "step": 3845 + }, + { + "epoch": 1.6590467974983825, + "grad_norm": 0.18399561941623688, + "learning_rate": 6.334157812679168e-05, + "loss": 2.1475, + "step": 3846 + }, + { + "epoch": 1.6594781108475307, + "grad_norm": 0.1734449863433838, + "learning_rate": 6.330786621719064e-05, + "loss": 2.1052, + "step": 3847 + }, + { + "epoch": 1.659909424196679, + "grad_norm": 0.22349125146865845, + "learning_rate": 6.327415672857371e-05, + "loss": 2.2059, + "step": 3848 + }, + { + "epoch": 1.660340737545827, + "grad_norm": 0.17899243533611298, + "learning_rate": 6.32404496679209e-05, + "loss": 2.1341, + "step": 3849 + }, + { + "epoch": 1.660772050894975, + "grad_norm": 0.17724715173244476, + "learning_rate": 6.320674504221158e-05, + "loss": 2.0841, + "step": 3850 + }, + { + "epoch": 1.660772050894975, + "eval_loss": 2.09236741065979, + "eval_runtime": 194.9891, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3850 + }, + { + "epoch": 1.6612033642441233, + "grad_norm": 0.19121047854423523, + "learning_rate": 6.317304285842465e-05, + "loss": 2.2765, + "step": 3851 + }, + { + "epoch": 1.6616346775932715, + "grad_norm": 0.17873340845108032, + "learning_rate": 6.313934312353854e-05, + "loss": 2.0656, + "step": 3852 + }, + { + "epoch": 1.6620659909424198, + "grad_norm": 0.18030403554439545, + "learning_rate": 6.310564584453118e-05, + "loss": 2.1507, + "step": 3853 + }, + { + "epoch": 1.6624973042915678, + "grad_norm": 0.19685925543308258, + "learning_rate": 6.307195102837989e-05, + "loss": 2.2512, + "step": 3854 + }, + { + "epoch": 1.6629286176407159, + "grad_norm": 0.18864823877811432, + "learning_rate": 6.303825868206156e-05, + "loss": 2.1788, + "step": 3855 + }, + { + "epoch": 1.6633599309898641, + "grad_norm": 0.2153341919183731, + "learning_rate": 6.300456881255261e-05, + "loss": 2.3064, + "step": 3856 + }, + { + "epoch": 1.6637912443390124, + "grad_norm": 0.17508377134799957, + "learning_rate": 6.297088142682887e-05, + "loss": 1.8921, + "step": 3857 + }, + { + "epoch": 1.6642225576881604, + "grad_norm": 0.198838472366333, + "learning_rate": 6.293719653186569e-05, + "loss": 2.0898, + "step": 3858 + }, + { + "epoch": 1.6646538710373084, + "grad_norm": 0.17895318567752838, + "learning_rate": 6.290351413463782e-05, + "loss": 2.1175, + "step": 3859 + }, + { + "epoch": 1.6650851843864567, + "grad_norm": 0.1883961409330368, + "learning_rate": 6.28698342421197e-05, + "loss": 2.4141, + "step": 3860 + }, + { + "epoch": 1.665516497735605, + "grad_norm": 0.1782238483428955, + "learning_rate": 6.283615686128505e-05, + "loss": 2.0513, + "step": 3861 + }, + { + "epoch": 1.6659478110847532, + "grad_norm": 0.16665376722812653, + "learning_rate": 6.28024819991071e-05, + "loss": 2.0893, + "step": 3862 + }, + { + "epoch": 1.6663791244339012, + "grad_norm": 0.16384795308113098, + "learning_rate": 6.276880966255868e-05, + "loss": 1.988, + "step": 3863 + }, + { + "epoch": 1.6668104377830493, + "grad_norm": 0.17633698880672455, + "learning_rate": 6.273513985861196e-05, + "loss": 2.0995, + "step": 3864 + }, + { + "epoch": 1.6672417511321975, + "grad_norm": 0.4395500123500824, + "learning_rate": 6.270147259423868e-05, + "loss": 2.1884, + "step": 3865 + }, + { + "epoch": 1.6676730644813458, + "grad_norm": 0.20222854614257812, + "learning_rate": 6.266780787640995e-05, + "loss": 2.2195, + "step": 3866 + }, + { + "epoch": 1.6681043778304938, + "grad_norm": 0.16490493714809418, + "learning_rate": 6.26341457120965e-05, + "loss": 2.0757, + "step": 3867 + }, + { + "epoch": 1.6685356911796418, + "grad_norm": 0.17670033872127533, + "learning_rate": 6.26004861082684e-05, + "loss": 2.1008, + "step": 3868 + }, + { + "epoch": 1.66896700452879, + "grad_norm": 0.22973522543907166, + "learning_rate": 6.256682907189525e-05, + "loss": 1.952, + "step": 3869 + }, + { + "epoch": 1.6693983178779384, + "grad_norm": 0.18370388448238373, + "learning_rate": 6.253317460994611e-05, + "loss": 2.169, + "step": 3870 + }, + { + "epoch": 1.6698296312270866, + "grad_norm": 0.1987435668706894, + "learning_rate": 6.24995227293895e-05, + "loss": 2.1858, + "step": 3871 + }, + { + "epoch": 1.6702609445762346, + "grad_norm": 0.18324199318885803, + "learning_rate": 6.246587343719343e-05, + "loss": 2.0375, + "step": 3872 + }, + { + "epoch": 1.6706922579253827, + "grad_norm": 0.18455226719379425, + "learning_rate": 6.243222674032529e-05, + "loss": 2.232, + "step": 3873 + }, + { + "epoch": 1.671123571274531, + "grad_norm": 0.19305765628814697, + "learning_rate": 6.239858264575209e-05, + "loss": 2.2791, + "step": 3874 + }, + { + "epoch": 1.6715548846236792, + "grad_norm": 0.21840623021125793, + "learning_rate": 6.236494116044012e-05, + "loss": 1.8552, + "step": 3875 + }, + { + "epoch": 1.6715548846236792, + "eval_loss": 2.0930511951446533, + "eval_runtime": 195.0966, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3875 + }, + { + "epoch": 1.6719861979728272, + "grad_norm": 0.20301920175552368, + "learning_rate": 6.23313022913553e-05, + "loss": 2.2956, + "step": 3876 + }, + { + "epoch": 1.6724175113219755, + "grad_norm": 0.27590247988700867, + "learning_rate": 6.229766604546285e-05, + "loss": 2.2569, + "step": 3877 + }, + { + "epoch": 1.6728488246711235, + "grad_norm": 0.19490353763103485, + "learning_rate": 6.226403242972755e-05, + "loss": 2.2655, + "step": 3878 + }, + { + "epoch": 1.6732801380202718, + "grad_norm": 0.1732746809720993, + "learning_rate": 6.223040145111365e-05, + "loss": 2.1003, + "step": 3879 + }, + { + "epoch": 1.67371145136942, + "grad_norm": 0.18505042791366577, + "learning_rate": 6.219677311658473e-05, + "loss": 1.9627, + "step": 3880 + }, + { + "epoch": 1.674142764718568, + "grad_norm": 0.20846937596797943, + "learning_rate": 6.216314743310398e-05, + "loss": 2.2654, + "step": 3881 + }, + { + "epoch": 1.674574078067716, + "grad_norm": 0.17472702264785767, + "learning_rate": 6.212952440763394e-05, + "loss": 2.0093, + "step": 3882 + }, + { + "epoch": 1.6750053914168643, + "grad_norm": 0.17258580029010773, + "learning_rate": 6.209590404713663e-05, + "loss": 1.9649, + "step": 3883 + }, + { + "epoch": 1.6754367047660126, + "grad_norm": 0.1834079474210739, + "learning_rate": 6.206228635857345e-05, + "loss": 2.0188, + "step": 3884 + }, + { + "epoch": 1.6758680181151606, + "grad_norm": 0.1837594360113144, + "learning_rate": 6.20286713489054e-05, + "loss": 2.0756, + "step": 3885 + }, + { + "epoch": 1.6762993314643089, + "grad_norm": 0.19340825080871582, + "learning_rate": 6.199505902509283e-05, + "loss": 2.1349, + "step": 3886 + }, + { + "epoch": 1.676730644813457, + "grad_norm": 0.18126502633094788, + "learning_rate": 6.196144939409543e-05, + "loss": 2.2139, + "step": 3887 + }, + { + "epoch": 1.6771619581626052, + "grad_norm": 0.18060217797756195, + "learning_rate": 6.19278424628726e-05, + "loss": 2.0478, + "step": 3888 + }, + { + "epoch": 1.6775932715117534, + "grad_norm": 0.19164370000362396, + "learning_rate": 6.18942382383829e-05, + "loss": 2.3741, + "step": 3889 + }, + { + "epoch": 1.6780245848609014, + "grad_norm": 0.7620417475700378, + "learning_rate": 6.186063672758451e-05, + "loss": 2.0286, + "step": 3890 + }, + { + "epoch": 1.6784558982100495, + "grad_norm": 0.19031478464603424, + "learning_rate": 6.182703793743495e-05, + "loss": 2.2822, + "step": 3891 + }, + { + "epoch": 1.6788872115591977, + "grad_norm": 0.19598186016082764, + "learning_rate": 6.179344187489126e-05, + "loss": 2.1966, + "step": 3892 + }, + { + "epoch": 1.679318524908346, + "grad_norm": 0.1745317429304123, + "learning_rate": 6.175984854690988e-05, + "loss": 1.9842, + "step": 3893 + }, + { + "epoch": 1.679749838257494, + "grad_norm": 0.17918013036251068, + "learning_rate": 6.172625796044661e-05, + "loss": 2.2293, + "step": 3894 + }, + { + "epoch": 1.6801811516066423, + "grad_norm": 0.18294218182563782, + "learning_rate": 6.169267012245681e-05, + "loss": 2.1592, + "step": 3895 + }, + { + "epoch": 1.6806124649557903, + "grad_norm": 0.16389308869838715, + "learning_rate": 6.165908503989518e-05, + "loss": 2.0937, + "step": 3896 + }, + { + "epoch": 1.6810437783049386, + "grad_norm": 0.2014763355255127, + "learning_rate": 6.162550271971589e-05, + "loss": 2.0043, + "step": 3897 + }, + { + "epoch": 1.6814750916540868, + "grad_norm": 0.17000114917755127, + "learning_rate": 6.159192316887251e-05, + "loss": 2.0175, + "step": 3898 + }, + { + "epoch": 1.6819064050032349, + "grad_norm": 0.20296160876750946, + "learning_rate": 6.155834639431809e-05, + "loss": 2.135, + "step": 3899 + }, + { + "epoch": 1.6823377183523829, + "grad_norm": 0.16654804348945618, + "learning_rate": 6.152477240300504e-05, + "loss": 1.8905, + "step": 3900 + }, + { + "epoch": 1.6823377183523829, + "eval_loss": 2.0927248001098633, + "eval_runtime": 194.9672, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3900 + }, + { + "epoch": 1.6827690317015311, + "grad_norm": 0.1887587010860443, + "learning_rate": 6.14912012018852e-05, + "loss": 2.2891, + "step": 3901 + }, + { + "epoch": 1.6832003450506794, + "grad_norm": 0.174739271402359, + "learning_rate": 6.14576327979099e-05, + "loss": 2.0998, + "step": 3902 + }, + { + "epoch": 1.6836316583998274, + "grad_norm": 0.20673100650310516, + "learning_rate": 6.142406719802983e-05, + "loss": 2.1349, + "step": 3903 + }, + { + "epoch": 1.6840629717489757, + "grad_norm": 0.1663362681865692, + "learning_rate": 6.139050440919512e-05, + "loss": 2.0095, + "step": 3904 + }, + { + "epoch": 1.6844942850981237, + "grad_norm": 0.18062900006771088, + "learning_rate": 6.135694443835525e-05, + "loss": 2.2951, + "step": 3905 + }, + { + "epoch": 1.684925598447272, + "grad_norm": 0.1740148812532425, + "learning_rate": 6.13233872924593e-05, + "loss": 2.2085, + "step": 3906 + }, + { + "epoch": 1.6853569117964202, + "grad_norm": 0.19164739549160004, + "learning_rate": 6.128983297845554e-05, + "loss": 2.1054, + "step": 3907 + }, + { + "epoch": 1.6857882251455683, + "grad_norm": 0.18385522067546844, + "learning_rate": 6.12562815032918e-05, + "loss": 2.0579, + "step": 3908 + }, + { + "epoch": 1.6862195384947163, + "grad_norm": 0.17914317548274994, + "learning_rate": 6.122273287391527e-05, + "loss": 2.0341, + "step": 3909 + }, + { + "epoch": 1.6866508518438645, + "grad_norm": 0.1921696662902832, + "learning_rate": 6.118918709727257e-05, + "loss": 2.2172, + "step": 3910 + }, + { + "epoch": 1.6870821651930128, + "grad_norm": 0.20252445340156555, + "learning_rate": 6.115564418030973e-05, + "loss": 2.2343, + "step": 3911 + }, + { + "epoch": 1.6875134785421608, + "grad_norm": 0.18932776153087616, + "learning_rate": 6.112210412997212e-05, + "loss": 2.2026, + "step": 3912 + }, + { + "epoch": 1.687944791891309, + "grad_norm": 0.2137858271598816, + "learning_rate": 6.108856695320465e-05, + "loss": 2.3142, + "step": 3913 + }, + { + "epoch": 1.6883761052404571, + "grad_norm": 0.18023236095905304, + "learning_rate": 6.105503265695152e-05, + "loss": 2.1168, + "step": 3914 + }, + { + "epoch": 1.6888074185896054, + "grad_norm": 0.18548575043678284, + "learning_rate": 6.1021501248156375e-05, + "loss": 2.1705, + "step": 3915 + }, + { + "epoch": 1.6892387319387536, + "grad_norm": 0.18816471099853516, + "learning_rate": 6.098797273376224e-05, + "loss": 2.1692, + "step": 3916 + }, + { + "epoch": 1.6896700452879017, + "grad_norm": 0.1845218688249588, + "learning_rate": 6.095444712071161e-05, + "loss": 2.1266, + "step": 3917 + }, + { + "epoch": 1.6901013586370497, + "grad_norm": 0.19871877133846283, + "learning_rate": 6.0920924415946324e-05, + "loss": 2.2414, + "step": 3918 + }, + { + "epoch": 1.690532671986198, + "grad_norm": 0.17879559099674225, + "learning_rate": 6.088740462640754e-05, + "loss": 1.9735, + "step": 3919 + }, + { + "epoch": 1.6909639853353462, + "grad_norm": 0.1785094290971756, + "learning_rate": 6.085388775903602e-05, + "loss": 2.3052, + "step": 3920 + }, + { + "epoch": 1.6913952986844942, + "grad_norm": 0.19720037281513214, + "learning_rate": 6.082037382077171e-05, + "loss": 2.0464, + "step": 3921 + }, + { + "epoch": 1.6918266120336425, + "grad_norm": 0.18563510477542877, + "learning_rate": 6.078686281855408e-05, + "loss": 2.1952, + "step": 3922 + }, + { + "epoch": 1.6922579253827905, + "grad_norm": 0.19016218185424805, + "learning_rate": 6.0753354759321914e-05, + "loss": 2.162, + "step": 3923 + }, + { + "epoch": 1.6926892387319388, + "grad_norm": 0.1768539696931839, + "learning_rate": 6.071984965001346e-05, + "loss": 2.277, + "step": 3924 + }, + { + "epoch": 1.693120552081087, + "grad_norm": 0.18435247242450714, + "learning_rate": 6.068634749756632e-05, + "loss": 2.1464, + "step": 3925 + }, + { + "epoch": 1.693120552081087, + "eval_loss": 2.092581033706665, + "eval_runtime": 194.7985, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3925 + }, + { + "epoch": 1.693551865430235, + "grad_norm": 0.17748647928237915, + "learning_rate": 6.065284830891741e-05, + "loss": 2.134, + "step": 3926 + }, + { + "epoch": 1.693983178779383, + "grad_norm": 0.16803781688213348, + "learning_rate": 6.061935209100321e-05, + "loss": 1.9717, + "step": 3927 + }, + { + "epoch": 1.6944144921285313, + "grad_norm": 0.2016642689704895, + "learning_rate": 6.0585858850759395e-05, + "loss": 2.3664, + "step": 3928 + }, + { + "epoch": 1.6948458054776796, + "grad_norm": 0.17812767624855042, + "learning_rate": 6.055236859512114e-05, + "loss": 2.073, + "step": 3929 + }, + { + "epoch": 1.6952771188268279, + "grad_norm": 0.18797719478607178, + "learning_rate": 6.051888133102296e-05, + "loss": 2.1412, + "step": 3930 + }, + { + "epoch": 1.6957084321759759, + "grad_norm": 0.19550077617168427, + "learning_rate": 6.0485397065398764e-05, + "loss": 2.1034, + "step": 3931 + }, + { + "epoch": 1.696139745525124, + "grad_norm": 0.1779249608516693, + "learning_rate": 6.045191580518186e-05, + "loss": 1.975, + "step": 3932 + }, + { + "epoch": 1.6965710588742722, + "grad_norm": 0.1735992133617401, + "learning_rate": 6.0418437557304845e-05, + "loss": 2.1543, + "step": 3933 + }, + { + "epoch": 1.6970023722234204, + "grad_norm": 0.18777874112129211, + "learning_rate": 6.038496232869982e-05, + "loss": 2.3913, + "step": 3934 + }, + { + "epoch": 1.6974336855725685, + "grad_norm": 0.19871315360069275, + "learning_rate": 6.035149012629816e-05, + "loss": 2.2304, + "step": 3935 + }, + { + "epoch": 1.6978649989217165, + "grad_norm": 0.1709122359752655, + "learning_rate": 6.0318020957030674e-05, + "loss": 2.0261, + "step": 3936 + }, + { + "epoch": 1.6982963122708647, + "grad_norm": 0.18706877529621124, + "learning_rate": 6.0284554827827495e-05, + "loss": 2.1231, + "step": 3937 + }, + { + "epoch": 1.698727625620013, + "grad_norm": 0.18627259135246277, + "learning_rate": 6.025109174561818e-05, + "loss": 2.2471, + "step": 3938 + }, + { + "epoch": 1.6991589389691613, + "grad_norm": 0.19945907592773438, + "learning_rate": 6.0217631717331606e-05, + "loss": 2.0648, + "step": 3939 + }, + { + "epoch": 1.6995902523183093, + "grad_norm": 0.19371074438095093, + "learning_rate": 6.018417474989602e-05, + "loss": 2.0709, + "step": 3940 + }, + { + "epoch": 1.7000215656674573, + "grad_norm": 0.19727398455142975, + "learning_rate": 6.01507208502391e-05, + "loss": 2.2179, + "step": 3941 + }, + { + "epoch": 1.7004528790166056, + "grad_norm": 0.1947556436061859, + "learning_rate": 6.011727002528781e-05, + "loss": 2.1538, + "step": 3942 + }, + { + "epoch": 1.7008841923657538, + "grad_norm": 0.17219044268131256, + "learning_rate": 6.0083822281968536e-05, + "loss": 2.1982, + "step": 3943 + }, + { + "epoch": 1.7013155057149019, + "grad_norm": 0.18319495022296906, + "learning_rate": 6.005037762720692e-05, + "loss": 2.2547, + "step": 3944 + }, + { + "epoch": 1.70174681906405, + "grad_norm": 0.1841065138578415, + "learning_rate": 6.001693606792816e-05, + "loss": 2.2998, + "step": 3945 + }, + { + "epoch": 1.7021781324131982, + "grad_norm": 0.180063396692276, + "learning_rate": 5.9983497611056633e-05, + "loss": 2.1797, + "step": 3946 + }, + { + "epoch": 1.7026094457623464, + "grad_norm": 0.1915334016084671, + "learning_rate": 5.9950062263516114e-05, + "loss": 2.1179, + "step": 3947 + }, + { + "epoch": 1.7030407591114947, + "grad_norm": 0.19555287063121796, + "learning_rate": 5.9916630032229804e-05, + "loss": 2.2253, + "step": 3948 + }, + { + "epoch": 1.7034720724606427, + "grad_norm": 0.1670524626970291, + "learning_rate": 5.98832009241202e-05, + "loss": 1.9075, + "step": 3949 + }, + { + "epoch": 1.7039033858097907, + "grad_norm": 0.18049435317516327, + "learning_rate": 5.9849774946109164e-05, + "loss": 2.2191, + "step": 3950 + }, + { + "epoch": 1.7039033858097907, + "eval_loss": 2.092341184616089, + "eval_runtime": 195.676, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3950 + }, + { + "epoch": 1.704334699158939, + "grad_norm": 0.18992845714092255, + "learning_rate": 5.9816352105117874e-05, + "loss": 2.1745, + "step": 3951 + }, + { + "epoch": 1.7047660125080872, + "grad_norm": 0.1593848168849945, + "learning_rate": 5.978293240806696e-05, + "loss": 1.9007, + "step": 3952 + }, + { + "epoch": 1.7051973258572353, + "grad_norm": 0.1983330100774765, + "learning_rate": 5.974951586187629e-05, + "loss": 2.1191, + "step": 3953 + }, + { + "epoch": 1.7056286392063833, + "grad_norm": 0.18438118696212769, + "learning_rate": 5.971610247346513e-05, + "loss": 2.2795, + "step": 3954 + }, + { + "epoch": 1.7060599525555316, + "grad_norm": 0.18407520651817322, + "learning_rate": 5.968269224975209e-05, + "loss": 2.0768, + "step": 3955 + }, + { + "epoch": 1.7064912659046798, + "grad_norm": 0.18830512464046478, + "learning_rate": 5.964928519765515e-05, + "loss": 2.1758, + "step": 3956 + }, + { + "epoch": 1.706922579253828, + "grad_norm": 0.16916070878505707, + "learning_rate": 5.961588132409158e-05, + "loss": 2.2333, + "step": 3957 + }, + { + "epoch": 1.707353892602976, + "grad_norm": 0.19552044570446014, + "learning_rate": 5.9582480635977984e-05, + "loss": 2.2937, + "step": 3958 + }, + { + "epoch": 1.7077852059521241, + "grad_norm": 0.18797554075717926, + "learning_rate": 5.954908314023043e-05, + "loss": 2.0564, + "step": 3959 + }, + { + "epoch": 1.7082165193012724, + "grad_norm": 0.20710021257400513, + "learning_rate": 5.951568884376415e-05, + "loss": 2.0914, + "step": 3960 + }, + { + "epoch": 1.7086478326504206, + "grad_norm": 0.191316157579422, + "learning_rate": 5.948229775349383e-05, + "loss": 2.3974, + "step": 3961 + }, + { + "epoch": 1.7090791459995687, + "grad_norm": 0.1908051073551178, + "learning_rate": 5.944890987633344e-05, + "loss": 2.2372, + "step": 3962 + }, + { + "epoch": 1.7095104593487167, + "grad_norm": 0.1851019263267517, + "learning_rate": 5.941552521919635e-05, + "loss": 2.31, + "step": 3963 + }, + { + "epoch": 1.709941772697865, + "grad_norm": 1.414542317390442, + "learning_rate": 5.93821437889952e-05, + "loss": 2.346, + "step": 3964 + }, + { + "epoch": 1.7103730860470132, + "grad_norm": 0.1777050644159317, + "learning_rate": 5.9348765592641924e-05, + "loss": 2.3289, + "step": 3965 + }, + { + "epoch": 1.7108043993961615, + "grad_norm": 0.20037689805030823, + "learning_rate": 5.931539063704795e-05, + "loss": 1.9037, + "step": 3966 + }, + { + "epoch": 1.7112357127453095, + "grad_norm": 0.18143126368522644, + "learning_rate": 5.9282018929123845e-05, + "loss": 2.2531, + "step": 3967 + }, + { + "epoch": 1.7116670260944575, + "grad_norm": 0.19299474358558655, + "learning_rate": 5.924865047577961e-05, + "loss": 2.2543, + "step": 3968 + }, + { + "epoch": 1.7120983394436058, + "grad_norm": 0.20400649309158325, + "learning_rate": 5.921528528392453e-05, + "loss": 2.0489, + "step": 3969 + }, + { + "epoch": 1.712529652792754, + "grad_norm": 0.18670541048049927, + "learning_rate": 5.9181923360467264e-05, + "loss": 2.0749, + "step": 3970 + }, + { + "epoch": 1.712960966141902, + "grad_norm": 0.1974756121635437, + "learning_rate": 5.914856471231577e-05, + "loss": 2.1085, + "step": 3971 + }, + { + "epoch": 1.71339227949105, + "grad_norm": 0.1814204305410385, + "learning_rate": 5.911520934637727e-05, + "loss": 1.8131, + "step": 3972 + }, + { + "epoch": 1.7138235928401984, + "grad_norm": 0.1980465054512024, + "learning_rate": 5.9081857269558406e-05, + "loss": 2.2717, + "step": 3973 + }, + { + "epoch": 1.7142549061893466, + "grad_norm": 0.1712466925382614, + "learning_rate": 5.9048508488765066e-05, + "loss": 2.0018, + "step": 3974 + }, + { + "epoch": 1.7146862195384949, + "grad_norm": 0.17620418965816498, + "learning_rate": 5.901516301090249e-05, + "loss": 2.2338, + "step": 3975 + }, + { + "epoch": 1.7146862195384949, + "eval_loss": 2.092210054397583, + "eval_runtime": 195.6295, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 3975 + }, + { + "epoch": 1.715117532887643, + "grad_norm": 0.18851706385612488, + "learning_rate": 5.898182084287521e-05, + "loss": 2.1636, + "step": 3976 + }, + { + "epoch": 1.715548846236791, + "grad_norm": 0.16978512704372406, + "learning_rate": 5.894848199158712e-05, + "loss": 2.191, + "step": 3977 + }, + { + "epoch": 1.7159801595859392, + "grad_norm": 0.1951020061969757, + "learning_rate": 5.891514646394135e-05, + "loss": 2.2491, + "step": 3978 + }, + { + "epoch": 1.7164114729350874, + "grad_norm": 0.2247585654258728, + "learning_rate": 5.88818142668404e-05, + "loss": 2.1196, + "step": 3979 + }, + { + "epoch": 1.7168427862842355, + "grad_norm": 0.2103835642337799, + "learning_rate": 5.884848540718608e-05, + "loss": 2.0868, + "step": 3980 + }, + { + "epoch": 1.7172740996333835, + "grad_norm": 0.1812957227230072, + "learning_rate": 5.881515989187949e-05, + "loss": 2.2188, + "step": 3981 + }, + { + "epoch": 1.7177054129825318, + "grad_norm": 0.17861905694007874, + "learning_rate": 5.878183772782105e-05, + "loss": 2.0906, + "step": 3982 + }, + { + "epoch": 1.71813672633168, + "grad_norm": 0.16796641051769257, + "learning_rate": 5.8748518921910434e-05, + "loss": 1.9122, + "step": 3983 + }, + { + "epoch": 1.7185680396808283, + "grad_norm": 0.18065188825130463, + "learning_rate": 5.8715203481046726e-05, + "loss": 2.0403, + "step": 3984 + }, + { + "epoch": 1.7189993530299763, + "grad_norm": 0.18287047743797302, + "learning_rate": 5.8681891412128224e-05, + "loss": 2.2585, + "step": 3985 + }, + { + "epoch": 1.7194306663791243, + "grad_norm": 0.18636803328990936, + "learning_rate": 5.864858272205253e-05, + "loss": 2.1291, + "step": 3986 + }, + { + "epoch": 1.7198619797282726, + "grad_norm": 0.1851028949022293, + "learning_rate": 5.861527741771663e-05, + "loss": 2.2074, + "step": 3987 + }, + { + "epoch": 1.7202932930774208, + "grad_norm": 0.18895862996578217, + "learning_rate": 5.858197550601671e-05, + "loss": 2.2836, + "step": 3988 + }, + { + "epoch": 1.7207246064265689, + "grad_norm": 0.1789833903312683, + "learning_rate": 5.854867699384832e-05, + "loss": 2.0446, + "step": 3989 + }, + { + "epoch": 1.721155919775717, + "grad_norm": 0.17047496140003204, + "learning_rate": 5.851538188810622e-05, + "loss": 2.2997, + "step": 3990 + }, + { + "epoch": 1.7215872331248652, + "grad_norm": 0.19940529763698578, + "learning_rate": 5.848209019568463e-05, + "loss": 2.1693, + "step": 3991 + }, + { + "epoch": 1.7220185464740134, + "grad_norm": 0.16889867186546326, + "learning_rate": 5.8448801923476876e-05, + "loss": 2.3546, + "step": 3992 + }, + { + "epoch": 1.7224498598231617, + "grad_norm": 0.16920462250709534, + "learning_rate": 5.841551707837567e-05, + "loss": 2.225, + "step": 3993 + }, + { + "epoch": 1.7228811731723097, + "grad_norm": 0.18416011333465576, + "learning_rate": 5.8382235667273034e-05, + "loss": 2.1297, + "step": 3994 + }, + { + "epoch": 1.7233124865214577, + "grad_norm": 0.18310239911079407, + "learning_rate": 5.834895769706024e-05, + "loss": 2.3231, + "step": 3995 + }, + { + "epoch": 1.723743799870606, + "grad_norm": 0.1977590173482895, + "learning_rate": 5.831568317462785e-05, + "loss": 2.2033, + "step": 3996 + }, + { + "epoch": 1.7241751132197543, + "grad_norm": 0.1837632954120636, + "learning_rate": 5.828241210686568e-05, + "loss": 2.5726, + "step": 3997 + }, + { + "epoch": 1.7246064265689023, + "grad_norm": 0.1981734335422516, + "learning_rate": 5.8249144500662945e-05, + "loss": 2.1677, + "step": 3998 + }, + { + "epoch": 1.7250377399180503, + "grad_norm": 0.1921195387840271, + "learning_rate": 5.821588036290801e-05, + "loss": 2.1464, + "step": 3999 + }, + { + "epoch": 1.7254690532671986, + "grad_norm": 0.20350435376167297, + "learning_rate": 5.8182619700488576e-05, + "loss": 2.0763, + "step": 4000 + }, + { + "epoch": 1.7254690532671986, + "eval_loss": 2.091747283935547, + "eval_runtime": 207.3983, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 4000 + }, + { + "epoch": 1.7259003666163468, + "grad_norm": 0.18566665053367615, + "learning_rate": 5.814936252029167e-05, + "loss": 2.1219, + "step": 4001 + }, + { + "epoch": 1.726331679965495, + "grad_norm": 0.1934313178062439, + "learning_rate": 5.811610882920352e-05, + "loss": 2.3388, + "step": 4002 + }, + { + "epoch": 1.7267629933146431, + "grad_norm": 0.18000420928001404, + "learning_rate": 5.808285863410969e-05, + "loss": 2.2346, + "step": 4003 + }, + { + "epoch": 1.7271943066637911, + "grad_norm": 0.20419535040855408, + "learning_rate": 5.804961194189495e-05, + "loss": 2.2095, + "step": 4004 + }, + { + "epoch": 1.7276256200129394, + "grad_norm": 0.1848215013742447, + "learning_rate": 5.8016368759443454e-05, + "loss": 2.267, + "step": 4005 + }, + { + "epoch": 1.7280569333620877, + "grad_norm": 0.191674143075943, + "learning_rate": 5.7983129093638527e-05, + "loss": 1.9493, + "step": 4006 + }, + { + "epoch": 1.7284882467112357, + "grad_norm": 0.19241303205490112, + "learning_rate": 5.794989295136282e-05, + "loss": 2.0437, + "step": 4007 + }, + { + "epoch": 1.7289195600603837, + "grad_norm": 0.19676679372787476, + "learning_rate": 5.791666033949821e-05, + "loss": 2.0497, + "step": 4008 + }, + { + "epoch": 1.729350873409532, + "grad_norm": 0.17912554740905762, + "learning_rate": 5.788343126492591e-05, + "loss": 2.3007, + "step": 4009 + }, + { + "epoch": 1.7297821867586802, + "grad_norm": 0.19069208204746246, + "learning_rate": 5.7850205734526376e-05, + "loss": 2.2575, + "step": 4010 + }, + { + "epoch": 1.7302135001078285, + "grad_norm": 0.2015782594680786, + "learning_rate": 5.7816983755179237e-05, + "loss": 2.197, + "step": 4011 + }, + { + "epoch": 1.7306448134569765, + "grad_norm": 0.20349441468715668, + "learning_rate": 5.778376533376357e-05, + "loss": 2.1082, + "step": 4012 + }, + { + "epoch": 1.7310761268061245, + "grad_norm": 0.19610483944416046, + "learning_rate": 5.775055047715755e-05, + "loss": 2.3583, + "step": 4013 + }, + { + "epoch": 1.7315074401552728, + "grad_norm": 0.19321462512016296, + "learning_rate": 5.771733919223869e-05, + "loss": 2.2541, + "step": 4014 + }, + { + "epoch": 1.731938753504421, + "grad_norm": 0.18723595142364502, + "learning_rate": 5.7684131485883744e-05, + "loss": 2.2065, + "step": 4015 + }, + { + "epoch": 1.732370066853569, + "grad_norm": 0.18089692294597626, + "learning_rate": 5.765092736496874e-05, + "loss": 1.8975, + "step": 4016 + }, + { + "epoch": 1.7328013802027171, + "grad_norm": 0.18110182881355286, + "learning_rate": 5.761772683636897e-05, + "loss": 2.2158, + "step": 4017 + }, + { + "epoch": 1.7332326935518654, + "grad_norm": 0.19009125232696533, + "learning_rate": 5.758452990695892e-05, + "loss": 2.1776, + "step": 4018 + }, + { + "epoch": 1.7336640069010136, + "grad_norm": 0.2081516981124878, + "learning_rate": 5.755133658361242e-05, + "loss": 2.197, + "step": 4019 + }, + { + "epoch": 1.7340953202501619, + "grad_norm": 0.18723157048225403, + "learning_rate": 5.75181468732025e-05, + "loss": 2.1137, + "step": 4020 + }, + { + "epoch": 1.73452663359931, + "grad_norm": 0.18066422641277313, + "learning_rate": 5.748496078260144e-05, + "loss": 2.1427, + "step": 4021 + }, + { + "epoch": 1.734957946948458, + "grad_norm": 0.19186154007911682, + "learning_rate": 5.7451778318680775e-05, + "loss": 2.1782, + "step": 4022 + }, + { + "epoch": 1.7353892602976062, + "grad_norm": 0.1925586760044098, + "learning_rate": 5.741859948831135e-05, + "loss": 2.3438, + "step": 4023 + }, + { + "epoch": 1.7358205736467545, + "grad_norm": 0.17659415304660797, + "learning_rate": 5.738542429836315e-05, + "loss": 2.0247, + "step": 4024 + }, + { + "epoch": 1.7362518869959025, + "grad_norm": 0.18985515832901, + "learning_rate": 5.735225275570547e-05, + "loss": 2.2613, + "step": 4025 + }, + { + "epoch": 1.7362518869959025, + "eval_loss": 2.0919628143310547, + "eval_runtime": 201.674, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 4025 + }, + { + "epoch": 1.7366832003450505, + "grad_norm": 0.19430814683437347, + "learning_rate": 5.731908486720685e-05, + "loss": 2.1622, + "step": 4026 + }, + { + "epoch": 1.7371145136941988, + "grad_norm": 0.18821769952774048, + "learning_rate": 5.728592063973508e-05, + "loss": 1.995, + "step": 4027 + }, + { + "epoch": 1.737545827043347, + "grad_norm": 0.2075040340423584, + "learning_rate": 5.725276008015716e-05, + "loss": 2.2193, + "step": 4028 + }, + { + "epoch": 1.7379771403924953, + "grad_norm": 0.18539956212043762, + "learning_rate": 5.721960319533931e-05, + "loss": 2.2047, + "step": 4029 + }, + { + "epoch": 1.7384084537416433, + "grad_norm": 0.22715280950069427, + "learning_rate": 5.71864499921471e-05, + "loss": 2.1533, + "step": 4030 + }, + { + "epoch": 1.7388397670907914, + "grad_norm": 0.21007855236530304, + "learning_rate": 5.715330047744522e-05, + "loss": 2.1163, + "step": 4031 + }, + { + "epoch": 1.7392710804399396, + "grad_norm": 0.1632479578256607, + "learning_rate": 5.712015465809762e-05, + "loss": 2.1047, + "step": 4032 + }, + { + "epoch": 1.7397023937890879, + "grad_norm": 0.19230559468269348, + "learning_rate": 5.708701254096755e-05, + "loss": 2.1609, + "step": 4033 + }, + { + "epoch": 1.740133707138236, + "grad_norm": 0.19357123970985413, + "learning_rate": 5.705387413291742e-05, + "loss": 2.1058, + "step": 4034 + }, + { + "epoch": 1.740565020487384, + "grad_norm": 0.17949312925338745, + "learning_rate": 5.702073944080893e-05, + "loss": 2.0622, + "step": 4035 + }, + { + "epoch": 1.7409963338365322, + "grad_norm": 0.202627494931221, + "learning_rate": 5.6987608471502894e-05, + "loss": 2.1767, + "step": 4036 + }, + { + "epoch": 1.7414276471856804, + "grad_norm": 0.19418296217918396, + "learning_rate": 5.695448123185956e-05, + "loss": 2.2173, + "step": 4037 + }, + { + "epoch": 1.7418589605348287, + "grad_norm": 0.172317773103714, + "learning_rate": 5.6921357728738226e-05, + "loss": 2.1346, + "step": 4038 + }, + { + "epoch": 1.7422902738839767, + "grad_norm": 0.1859188973903656, + "learning_rate": 5.6888237968997455e-05, + "loss": 2.1686, + "step": 4039 + }, + { + "epoch": 1.7427215872331248, + "grad_norm": 0.18734541535377502, + "learning_rate": 5.685512195949508e-05, + "loss": 2.3108, + "step": 4040 + }, + { + "epoch": 1.743152900582273, + "grad_norm": 0.18585248291492462, + "learning_rate": 5.682200970708815e-05, + "loss": 2.2407, + "step": 4041 + }, + { + "epoch": 1.7435842139314213, + "grad_norm": 0.21692951023578644, + "learning_rate": 5.678890121863291e-05, + "loss": 2.4208, + "step": 4042 + }, + { + "epoch": 1.7440155272805693, + "grad_norm": 0.18898186087608337, + "learning_rate": 5.6755796500984784e-05, + "loss": 2.0094, + "step": 4043 + }, + { + "epoch": 1.7444468406297173, + "grad_norm": 0.21968823671340942, + "learning_rate": 5.672269556099855e-05, + "loss": 2.0242, + "step": 4044 + }, + { + "epoch": 1.7448781539788656, + "grad_norm": 0.17570222914218903, + "learning_rate": 5.668959840552807e-05, + "loss": 2.1131, + "step": 4045 + }, + { + "epoch": 1.7453094673280138, + "grad_norm": 0.17798030376434326, + "learning_rate": 5.665650504142645e-05, + "loss": 2.0615, + "step": 4046 + }, + { + "epoch": 1.745740780677162, + "grad_norm": 0.18202351033687592, + "learning_rate": 5.6623415475546094e-05, + "loss": 2.1592, + "step": 4047 + }, + { + "epoch": 1.7461720940263101, + "grad_norm": 0.2013104110956192, + "learning_rate": 5.659032971473851e-05, + "loss": 2.194, + "step": 4048 + }, + { + "epoch": 1.7466034073754582, + "grad_norm": 0.1691199541091919, + "learning_rate": 5.6557247765854505e-05, + "loss": 2.3038, + "step": 4049 + }, + { + "epoch": 1.7470347207246064, + "grad_norm": 0.18335652351379395, + "learning_rate": 5.652416963574399e-05, + "loss": 2.1304, + "step": 4050 + }, + { + "epoch": 1.7470347207246064, + "eval_loss": 2.091346502304077, + "eval_runtime": 201.6741, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 4050 + }, + { + "epoch": 1.7474660340737547, + "grad_norm": 0.17796850204467773, + "learning_rate": 5.6491095331256254e-05, + "loss": 2.3191, + "step": 4051 + }, + { + "epoch": 1.7478973474229027, + "grad_norm": 0.1794384866952896, + "learning_rate": 5.645802485923962e-05, + "loss": 1.8804, + "step": 4052 + }, + { + "epoch": 1.7483286607720507, + "grad_norm": 0.17685826122760773, + "learning_rate": 5.6424958226541694e-05, + "loss": 2.0451, + "step": 4053 + }, + { + "epoch": 1.748759974121199, + "grad_norm": 0.1787034571170807, + "learning_rate": 5.639189544000931e-05, + "loss": 2.0513, + "step": 4054 + }, + { + "epoch": 1.7491912874703472, + "grad_norm": 0.1818806678056717, + "learning_rate": 5.635883650648848e-05, + "loss": 2.4097, + "step": 4055 + }, + { + "epoch": 1.7496226008194955, + "grad_norm": 0.18707096576690674, + "learning_rate": 5.632578143282442e-05, + "loss": 2.195, + "step": 4056 + }, + { + "epoch": 1.7500539141686435, + "grad_norm": 0.2021271139383316, + "learning_rate": 5.62927302258615e-05, + "loss": 2.1611, + "step": 4057 + }, + { + "epoch": 1.7504852275177916, + "grad_norm": 0.18080992996692657, + "learning_rate": 5.625968289244338e-05, + "loss": 2.2268, + "step": 4058 + }, + { + "epoch": 1.7509165408669398, + "grad_norm": 0.18082956969738007, + "learning_rate": 5.622663943941286e-05, + "loss": 2.3477, + "step": 4059 + }, + { + "epoch": 1.751347854216088, + "grad_norm": 0.18515385687351227, + "learning_rate": 5.619359987361194e-05, + "loss": 2.2363, + "step": 4060 + }, + { + "epoch": 1.751779167565236, + "grad_norm": 0.17915287613868713, + "learning_rate": 5.616056420188182e-05, + "loss": 2.1316, + "step": 4061 + }, + { + "epoch": 1.7522104809143841, + "grad_norm": 0.18371932208538055, + "learning_rate": 5.6127532431062936e-05, + "loss": 2.0856, + "step": 4062 + }, + { + "epoch": 1.7526417942635324, + "grad_norm": 0.18611958622932434, + "learning_rate": 5.609450456799483e-05, + "loss": 2.0434, + "step": 4063 + }, + { + "epoch": 1.7530731076126806, + "grad_norm": 0.17234119772911072, + "learning_rate": 5.606148061951629e-05, + "loss": 1.9636, + "step": 4064 + }, + { + "epoch": 1.753504420961829, + "grad_norm": 0.1878260374069214, + "learning_rate": 5.6028460592465304e-05, + "loss": 2.1463, + "step": 4065 + }, + { + "epoch": 1.753935734310977, + "grad_norm": 0.2029743492603302, + "learning_rate": 5.5995444493679035e-05, + "loss": 2.32, + "step": 4066 + }, + { + "epoch": 1.754367047660125, + "grad_norm": 0.23618969321250916, + "learning_rate": 5.5962432329993817e-05, + "loss": 2.1887, + "step": 4067 + }, + { + "epoch": 1.7547983610092732, + "grad_norm": 0.2027716189622879, + "learning_rate": 5.592942410824513e-05, + "loss": 2.2154, + "step": 4068 + }, + { + "epoch": 1.7552296743584215, + "grad_norm": 0.23110395669937134, + "learning_rate": 5.589641983526779e-05, + "loss": 1.9544, + "step": 4069 + }, + { + "epoch": 1.7556609877075695, + "grad_norm": 0.17785879969596863, + "learning_rate": 5.586341951789561e-05, + "loss": 2.127, + "step": 4070 + }, + { + "epoch": 1.7560923010567175, + "grad_norm": 0.18353521823883057, + "learning_rate": 5.583042316296169e-05, + "loss": 2.1133, + "step": 4071 + }, + { + "epoch": 1.7565236144058658, + "grad_norm": 0.20097488164901733, + "learning_rate": 5.5797430777298294e-05, + "loss": 2.2941, + "step": 4072 + }, + { + "epoch": 1.756954927755014, + "grad_norm": 0.1747606247663498, + "learning_rate": 5.576444236773685e-05, + "loss": 2.0431, + "step": 4073 + }, + { + "epoch": 1.7573862411041623, + "grad_norm": 0.17923399806022644, + "learning_rate": 5.573145794110799e-05, + "loss": 2.0249, + "step": 4074 + }, + { + "epoch": 1.7578175544533103, + "grad_norm": 0.2070382833480835, + "learning_rate": 5.569847750424144e-05, + "loss": 2.2384, + "step": 4075 + }, + { + "epoch": 1.7578175544533103, + "eval_loss": 2.09136962890625, + "eval_runtime": 201.606, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 4075 + }, + { + "epoch": 1.7582488678024584, + "grad_norm": 0.18448781967163086, + "learning_rate": 5.566550106396625e-05, + "loss": 2.0828, + "step": 4076 + }, + { + "epoch": 1.7586801811516066, + "grad_norm": 0.19364839792251587, + "learning_rate": 5.563252862711046e-05, + "loss": 2.1173, + "step": 4077 + }, + { + "epoch": 1.7591114945007549, + "grad_norm": 0.17255939543247223, + "learning_rate": 5.559956020050143e-05, + "loss": 2.0035, + "step": 4078 + }, + { + "epoch": 1.759542807849903, + "grad_norm": 0.17625278234481812, + "learning_rate": 5.5566595790965625e-05, + "loss": 2.1868, + "step": 4079 + }, + { + "epoch": 1.759974121199051, + "grad_norm": 0.18831560015678406, + "learning_rate": 5.553363540532868e-05, + "loss": 2.0214, + "step": 4080 + }, + { + "epoch": 1.7604054345481992, + "grad_norm": 0.19190707802772522, + "learning_rate": 5.5500679050415426e-05, + "loss": 2.2865, + "step": 4081 + }, + { + "epoch": 1.7608367478973475, + "grad_norm": 0.17967812716960907, + "learning_rate": 5.5467726733049765e-05, + "loss": 2.1373, + "step": 4082 + }, + { + "epoch": 1.7612680612464957, + "grad_norm": 0.19684270024299622, + "learning_rate": 5.543477846005493e-05, + "loss": 2.1682, + "step": 4083 + }, + { + "epoch": 1.7616993745956437, + "grad_norm": 0.18483242392539978, + "learning_rate": 5.5401834238253165e-05, + "loss": 2.2375, + "step": 4084 + }, + { + "epoch": 1.7621306879447918, + "grad_norm": 0.16816097497940063, + "learning_rate": 5.536889407446593e-05, + "loss": 2.146, + "step": 4085 + }, + { + "epoch": 1.76256200129394, + "grad_norm": 0.1955161839723587, + "learning_rate": 5.533595797551386e-05, + "loss": 1.9715, + "step": 4086 + }, + { + "epoch": 1.7629933146430883, + "grad_norm": 0.19922764599323273, + "learning_rate": 5.530302594821674e-05, + "loss": 2.0799, + "step": 4087 + }, + { + "epoch": 1.7634246279922363, + "grad_norm": 0.18744394183158875, + "learning_rate": 5.527009799939353e-05, + "loss": 2.2148, + "step": 4088 + }, + { + "epoch": 1.7638559413413846, + "grad_norm": 0.18498487770557404, + "learning_rate": 5.523717413586223e-05, + "loss": 2.1171, + "step": 4089 + }, + { + "epoch": 1.7642872546905326, + "grad_norm": 0.18250973522663116, + "learning_rate": 5.52042543644402e-05, + "loss": 2.2352, + "step": 4090 + }, + { + "epoch": 1.7647185680396809, + "grad_norm": 0.1901228427886963, + "learning_rate": 5.517133869194378e-05, + "loss": 2.2953, + "step": 4091 + }, + { + "epoch": 1.7651498813888291, + "grad_norm": 0.1902785301208496, + "learning_rate": 5.513842712518849e-05, + "loss": 2.2597, + "step": 4092 + }, + { + "epoch": 1.7655811947379771, + "grad_norm": 0.18489375710487366, + "learning_rate": 5.51055196709891e-05, + "loss": 1.8765, + "step": 4093 + }, + { + "epoch": 1.7660125080871252, + "grad_norm": 0.19047106802463531, + "learning_rate": 5.5072616336159413e-05, + "loss": 2.216, + "step": 4094 + }, + { + "epoch": 1.7664438214362734, + "grad_norm": 0.19863182306289673, + "learning_rate": 5.5039717127512456e-05, + "loss": 2.0272, + "step": 4095 + }, + { + "epoch": 1.7668751347854217, + "grad_norm": 0.19716544449329376, + "learning_rate": 5.500682205186032e-05, + "loss": 2.0992, + "step": 4096 + }, + { + "epoch": 1.7673064481345697, + "grad_norm": 0.1879461258649826, + "learning_rate": 5.497393111601433e-05, + "loss": 2.2904, + "step": 4097 + }, + { + "epoch": 1.767737761483718, + "grad_norm": 0.17728249728679657, + "learning_rate": 5.4941044326784906e-05, + "loss": 2.1857, + "step": 4098 + }, + { + "epoch": 1.768169074832866, + "grad_norm": 0.18926134705543518, + "learning_rate": 5.49081616909816e-05, + "loss": 2.161, + "step": 4099 + }, + { + "epoch": 1.7686003881820143, + "grad_norm": 0.18339209258556366, + "learning_rate": 5.487528321541315e-05, + "loss": 2.2346, + "step": 4100 + }, + { + "epoch": 1.7686003881820143, + "eval_loss": 2.0910797119140625, + "eval_runtime": 200.6694, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 4100 + }, + { + "epoch": 1.7690317015311625, + "grad_norm": 0.17934048175811768, + "learning_rate": 5.4842408906887416e-05, + "loss": 2.1667, + "step": 4101 + }, + { + "epoch": 1.7694630148803105, + "grad_norm": 0.20105083286762238, + "learning_rate": 5.480953877221134e-05, + "loss": 2.0091, + "step": 4102 + }, + { + "epoch": 1.7698943282294586, + "grad_norm": 0.1738869994878769, + "learning_rate": 5.477667281819103e-05, + "loss": 2.0166, + "step": 4103 + }, + { + "epoch": 1.7703256415786068, + "grad_norm": 0.18088917434215546, + "learning_rate": 5.47438110516318e-05, + "loss": 2.0975, + "step": 4104 + }, + { + "epoch": 1.770756954927755, + "grad_norm": 0.17673984169960022, + "learning_rate": 5.4710953479338e-05, + "loss": 2.0978, + "step": 4105 + }, + { + "epoch": 1.7711882682769031, + "grad_norm": 0.18527352809906006, + "learning_rate": 5.4678100108113186e-05, + "loss": 2.0892, + "step": 4106 + }, + { + "epoch": 1.7716195816260514, + "grad_norm": 0.19644999504089355, + "learning_rate": 5.464525094475993e-05, + "loss": 2.1712, + "step": 4107 + }, + { + "epoch": 1.7720508949751994, + "grad_norm": 0.1868610829114914, + "learning_rate": 5.461240599608011e-05, + "loss": 2.1838, + "step": 4108 + }, + { + "epoch": 1.7724822083243477, + "grad_norm": 0.1908998340368271, + "learning_rate": 5.457956526887456e-05, + "loss": 2.2289, + "step": 4109 + }, + { + "epoch": 1.772913521673496, + "grad_norm": 0.22750572860240936, + "learning_rate": 5.4546728769943334e-05, + "loss": 2.1652, + "step": 4110 + }, + { + "epoch": 1.773344835022644, + "grad_norm": 0.2120228111743927, + "learning_rate": 5.451389650608559e-05, + "loss": 2.177, + "step": 4111 + }, + { + "epoch": 1.773776148371792, + "grad_norm": 0.19006375968456268, + "learning_rate": 5.448106848409962e-05, + "loss": 2.1953, + "step": 4112 + }, + { + "epoch": 1.7742074617209402, + "grad_norm": 0.18009048700332642, + "learning_rate": 5.444824471078282e-05, + "loss": 2.212, + "step": 4113 + }, + { + "epoch": 1.7746387750700885, + "grad_norm": 0.18991193175315857, + "learning_rate": 5.441542519293165e-05, + "loss": 2.2226, + "step": 4114 + }, + { + "epoch": 1.7750700884192365, + "grad_norm": 0.1789187490940094, + "learning_rate": 5.438260993734187e-05, + "loss": 2.1696, + "step": 4115 + }, + { + "epoch": 1.7755014017683848, + "grad_norm": 0.18606701493263245, + "learning_rate": 5.4349798950808124e-05, + "loss": 2.3047, + "step": 4116 + }, + { + "epoch": 1.7759327151175328, + "grad_norm": 0.18812114000320435, + "learning_rate": 5.431699224012434e-05, + "loss": 1.9963, + "step": 4117 + }, + { + "epoch": 1.776364028466681, + "grad_norm": 0.17936258018016815, + "learning_rate": 5.428418981208351e-05, + "loss": 2.1407, + "step": 4118 + }, + { + "epoch": 1.7767953418158293, + "grad_norm": 0.20092041790485382, + "learning_rate": 5.425139167347772e-05, + "loss": 2.2003, + "step": 4119 + }, + { + "epoch": 1.7772266551649774, + "grad_norm": 0.1748567521572113, + "learning_rate": 5.42185978310982e-05, + "loss": 2.2023, + "step": 4120 + }, + { + "epoch": 1.7776579685141254, + "grad_norm": 0.18740083277225494, + "learning_rate": 5.418580829173522e-05, + "loss": 2.1708, + "step": 4121 + }, + { + "epoch": 1.7780892818632736, + "grad_norm": 0.2029028683900833, + "learning_rate": 5.4153023062178294e-05, + "loss": 1.9945, + "step": 4122 + }, + { + "epoch": 1.778520595212422, + "grad_norm": 0.21911142766475677, + "learning_rate": 5.412024214921591e-05, + "loss": 1.8323, + "step": 4123 + }, + { + "epoch": 1.77895190856157, + "grad_norm": 0.1845363825559616, + "learning_rate": 5.408746555963571e-05, + "loss": 2.1028, + "step": 4124 + }, + { + "epoch": 1.7793832219107182, + "grad_norm": 0.2083241194486618, + "learning_rate": 5.4054693300224476e-05, + "loss": 2.2871, + "step": 4125 + }, + { + "epoch": 1.7793832219107182, + "eval_loss": 2.0913138389587402, + "eval_runtime": 201.3844, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 4125 + }, + { + "epoch": 1.7798145352598662, + "grad_norm": 0.1898990422487259, + "learning_rate": 5.4021925377768055e-05, + "loss": 2.2323, + "step": 4126 + }, + { + "epoch": 1.7802458486090145, + "grad_norm": 0.182303786277771, + "learning_rate": 5.3989161799051405e-05, + "loss": 2.1606, + "step": 4127 + }, + { + "epoch": 1.7806771619581627, + "grad_norm": 0.1937088519334793, + "learning_rate": 5.395640257085853e-05, + "loss": 2.2274, + "step": 4128 + }, + { + "epoch": 1.7811084753073108, + "grad_norm": 0.18278345465660095, + "learning_rate": 5.3923647699972674e-05, + "loss": 2.0109, + "step": 4129 + }, + { + "epoch": 1.7815397886564588, + "grad_norm": 0.19565333425998688, + "learning_rate": 5.389089719317604e-05, + "loss": 2.1422, + "step": 4130 + }, + { + "epoch": 1.781971102005607, + "grad_norm": 0.1918390989303589, + "learning_rate": 5.385815105724997e-05, + "loss": 2.3274, + "step": 4131 + }, + { + "epoch": 1.7824024153547553, + "grad_norm": 0.20723852515220642, + "learning_rate": 5.382540929897494e-05, + "loss": 2.2422, + "step": 4132 + }, + { + "epoch": 1.7828337287039033, + "grad_norm": 0.17734549939632416, + "learning_rate": 5.379267192513048e-05, + "loss": 2.167, + "step": 4133 + }, + { + "epoch": 1.7832650420530516, + "grad_norm": 0.1883150041103363, + "learning_rate": 5.375993894249523e-05, + "loss": 2.1357, + "step": 4134 + }, + { + "epoch": 1.7836963554021996, + "grad_norm": 0.18035978078842163, + "learning_rate": 5.372721035784686e-05, + "loss": 2.2501, + "step": 4135 + }, + { + "epoch": 1.7841276687513479, + "grad_norm": 0.1856493055820465, + "learning_rate": 5.369448617796226e-05, + "loss": 2.2456, + "step": 4136 + }, + { + "epoch": 1.7845589821004961, + "grad_norm": 0.19670169055461884, + "learning_rate": 5.3661766409617275e-05, + "loss": 2.1014, + "step": 4137 + }, + { + "epoch": 1.7849902954496442, + "grad_norm": 0.1871079057455063, + "learning_rate": 5.362905105958689e-05, + "loss": 2.247, + "step": 4138 + }, + { + "epoch": 1.7854216087987922, + "grad_norm": 0.2041262686252594, + "learning_rate": 5.359634013464521e-05, + "loss": 2.1843, + "step": 4139 + }, + { + "epoch": 1.7858529221479404, + "grad_norm": 0.1845085173845291, + "learning_rate": 5.356363364156536e-05, + "loss": 2.3258, + "step": 4140 + }, + { + "epoch": 1.7862842354970887, + "grad_norm": 0.18137259781360626, + "learning_rate": 5.3530931587119624e-05, + "loss": 1.9705, + "step": 4141 + }, + { + "epoch": 1.786715548846237, + "grad_norm": 0.20324157178401947, + "learning_rate": 5.349823397807925e-05, + "loss": 2.3582, + "step": 4142 + }, + { + "epoch": 1.787146862195385, + "grad_norm": 0.19847539067268372, + "learning_rate": 5.346554082121467e-05, + "loss": 2.3731, + "step": 4143 + }, + { + "epoch": 1.787578175544533, + "grad_norm": 0.18964673578739166, + "learning_rate": 5.3432852123295366e-05, + "loss": 2.2977, + "step": 4144 + }, + { + "epoch": 1.7880094888936813, + "grad_norm": 0.1747114062309265, + "learning_rate": 5.3400167891089876e-05, + "loss": 2.0051, + "step": 4145 + }, + { + "epoch": 1.7884408022428295, + "grad_norm": 0.19617091119289398, + "learning_rate": 5.3367488131365846e-05, + "loss": 2.1045, + "step": 4146 + }, + { + "epoch": 1.7888721155919776, + "grad_norm": 0.18542206287384033, + "learning_rate": 5.333481285088999e-05, + "loss": 1.9911, + "step": 4147 + }, + { + "epoch": 1.7893034289411256, + "grad_norm": 0.18740200996398926, + "learning_rate": 5.3302142056428035e-05, + "loss": 2.2449, + "step": 4148 + }, + { + "epoch": 1.7897347422902739, + "grad_norm": 0.22210511565208435, + "learning_rate": 5.326947575474483e-05, + "loss": 2.0402, + "step": 4149 + }, + { + "epoch": 1.790166055639422, + "grad_norm": 0.19126732647418976, + "learning_rate": 5.323681395260434e-05, + "loss": 2.2384, + "step": 4150 + }, + { + "epoch": 1.790166055639422, + "eval_loss": 2.0913338661193848, + "eval_runtime": 200.6128, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 4150 + }, + { + "epoch": 1.7905973689885704, + "grad_norm": 0.19874311983585358, + "learning_rate": 5.3204156656769514e-05, + "loss": 2.2883, + "step": 4151 + }, + { + "epoch": 1.7910286823377184, + "grad_norm": 0.18604867160320282, + "learning_rate": 5.317150387400243e-05, + "loss": 2.143, + "step": 4152 + }, + { + "epoch": 1.7914599956868664, + "grad_norm": 0.1757567971944809, + "learning_rate": 5.313885561106412e-05, + "loss": 2.0399, + "step": 4153 + }, + { + "epoch": 1.7918913090360147, + "grad_norm": 0.19099228084087372, + "learning_rate": 5.3106211874714885e-05, + "loss": 2.1222, + "step": 4154 + }, + { + "epoch": 1.792322622385163, + "grad_norm": 0.18796351552009583, + "learning_rate": 5.307357267171389e-05, + "loss": 2.2579, + "step": 4155 + }, + { + "epoch": 1.792753935734311, + "grad_norm": 0.17198990285396576, + "learning_rate": 5.3040938008819435e-05, + "loss": 2.106, + "step": 4156 + }, + { + "epoch": 1.793185249083459, + "grad_norm": 0.16483992338180542, + "learning_rate": 5.3008307892788926e-05, + "loss": 1.8541, + "step": 4157 + }, + { + "epoch": 1.7936165624326073, + "grad_norm": 0.18197329342365265, + "learning_rate": 5.297568233037877e-05, + "loss": 2.3625, + "step": 4158 + }, + { + "epoch": 1.7940478757817555, + "grad_norm": 0.18919214606285095, + "learning_rate": 5.294306132834445e-05, + "loss": 2.1912, + "step": 4159 + }, + { + "epoch": 1.7944791891309038, + "grad_norm": 0.19854745268821716, + "learning_rate": 5.291044489344044e-05, + "loss": 2.1172, + "step": 4160 + }, + { + "epoch": 1.7949105024800518, + "grad_norm": 0.17984916269779205, + "learning_rate": 5.2877833032420444e-05, + "loss": 1.9253, + "step": 4161 + }, + { + "epoch": 1.7953418158291998, + "grad_norm": 0.1974160075187683, + "learning_rate": 5.284522575203702e-05, + "loss": 2.0063, + "step": 4162 + }, + { + "epoch": 1.795773129178348, + "grad_norm": 0.1801133155822754, + "learning_rate": 5.281262305904186e-05, + "loss": 2.0987, + "step": 4163 + }, + { + "epoch": 1.7962044425274963, + "grad_norm": 0.1818169206380844, + "learning_rate": 5.278002496018575e-05, + "loss": 2.2302, + "step": 4164 + }, + { + "epoch": 1.7966357558766444, + "grad_norm": 0.16762112081050873, + "learning_rate": 5.274743146221848e-05, + "loss": 2.4302, + "step": 4165 + }, + { + "epoch": 1.7970670692257924, + "grad_norm": 0.19300805032253265, + "learning_rate": 5.271484257188888e-05, + "loss": 2.1999, + "step": 4166 + }, + { + "epoch": 1.7974983825749407, + "grad_norm": 0.19982370734214783, + "learning_rate": 5.268225829594479e-05, + "loss": 2.5799, + "step": 4167 + }, + { + "epoch": 1.797929695924089, + "grad_norm": 0.18215444684028625, + "learning_rate": 5.264967864113322e-05, + "loss": 2.21, + "step": 4168 + }, + { + "epoch": 1.7983610092732372, + "grad_norm": 0.20935408771038055, + "learning_rate": 5.261710361420009e-05, + "loss": 2.074, + "step": 4169 + }, + { + "epoch": 1.7987923226223852, + "grad_norm": 0.20121844112873077, + "learning_rate": 5.258453322189043e-05, + "loss": 2.3008, + "step": 4170 + }, + { + "epoch": 1.7992236359715332, + "grad_norm": 0.18649135529994965, + "learning_rate": 5.255196747094832e-05, + "loss": 2.2496, + "step": 4171 + }, + { + "epoch": 1.7996549493206815, + "grad_norm": 0.18619664013385773, + "learning_rate": 5.2519406368116806e-05, + "loss": 2.0526, + "step": 4172 + }, + { + "epoch": 1.8000862626698297, + "grad_norm": 0.18811942636966705, + "learning_rate": 5.2486849920138085e-05, + "loss": 2.2005, + "step": 4173 + }, + { + "epoch": 1.8005175760189778, + "grad_norm": 0.19535639882087708, + "learning_rate": 5.245429813375322e-05, + "loss": 2.0817, + "step": 4174 + }, + { + "epoch": 1.8009488893681258, + "grad_norm": 0.19291633367538452, + "learning_rate": 5.2421751015702554e-05, + "loss": 2.1703, + "step": 4175 + }, + { + "epoch": 1.8009488893681258, + "eval_loss": 2.090921401977539, + "eval_runtime": 201.6385, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 4175 + }, + { + "epoch": 1.801380202717274, + "grad_norm": 0.19690266251564026, + "learning_rate": 5.238920857272522e-05, + "loss": 2.1343, + "step": 4176 + }, + { + "epoch": 1.8018115160664223, + "grad_norm": 0.17731203138828278, + "learning_rate": 5.23566708115595e-05, + "loss": 2.0657, + "step": 4177 + }, + { + "epoch": 1.8022428294155706, + "grad_norm": 0.20055049657821655, + "learning_rate": 5.232413773894274e-05, + "loss": 2.0814, + "step": 4178 + }, + { + "epoch": 1.8026741427647186, + "grad_norm": 0.20533232390880585, + "learning_rate": 5.229160936161123e-05, + "loss": 2.1925, + "step": 4179 + }, + { + "epoch": 1.8031054561138666, + "grad_norm": 0.1787910759449005, + "learning_rate": 5.2259085686300356e-05, + "loss": 2.3697, + "step": 4180 + }, + { + "epoch": 1.8035367694630149, + "grad_norm": 0.18992182612419128, + "learning_rate": 5.2226566719744454e-05, + "loss": 2.2862, + "step": 4181 + }, + { + "epoch": 1.8039680828121631, + "grad_norm": 0.21222200989723206, + "learning_rate": 5.2194052468676975e-05, + "loss": 2.162, + "step": 4182 + }, + { + "epoch": 1.8043993961613112, + "grad_norm": 0.1847771257162094, + "learning_rate": 5.216154293983032e-05, + "loss": 2.0813, + "step": 4183 + }, + { + "epoch": 1.8048307095104592, + "grad_norm": 0.18472546339035034, + "learning_rate": 5.2129038139935954e-05, + "loss": 2.1162, + "step": 4184 + }, + { + "epoch": 1.8052620228596075, + "grad_norm": 0.16956888139247894, + "learning_rate": 5.209653807572437e-05, + "loss": 2.176, + "step": 4185 + }, + { + "epoch": 1.8056933362087557, + "grad_norm": 0.21091744303703308, + "learning_rate": 5.206404275392505e-05, + "loss": 1.8348, + "step": 4186 + }, + { + "epoch": 1.806124649557904, + "grad_norm": 0.1923355609178543, + "learning_rate": 5.2031552181266494e-05, + "loss": 2.2377, + "step": 4187 + }, + { + "epoch": 1.806555962907052, + "grad_norm": 0.19391633570194244, + "learning_rate": 5.199906636447622e-05, + "loss": 2.2979, + "step": 4188 + }, + { + "epoch": 1.8069872762562, + "grad_norm": 0.20279642939567566, + "learning_rate": 5.19665853102808e-05, + "loss": 2.104, + "step": 4189 + }, + { + "epoch": 1.8074185896053483, + "grad_norm": 0.1883164495229721, + "learning_rate": 5.1934109025405784e-05, + "loss": 2.2072, + "step": 4190 + }, + { + "epoch": 1.8078499029544965, + "grad_norm": 0.1902153640985489, + "learning_rate": 5.190163751657573e-05, + "loss": 2.2251, + "step": 4191 + }, + { + "epoch": 1.8082812163036446, + "grad_norm": 0.17991870641708374, + "learning_rate": 5.186917079051424e-05, + "loss": 2.0905, + "step": 4192 + }, + { + "epoch": 1.8087125296527926, + "grad_norm": 0.17719264328479767, + "learning_rate": 5.1836708853943896e-05, + "loss": 2.2346, + "step": 4193 + }, + { + "epoch": 1.8091438430019409, + "grad_norm": 0.1893806755542755, + "learning_rate": 5.1804251713586295e-05, + "loss": 2.0722, + "step": 4194 + }, + { + "epoch": 1.8095751563510891, + "grad_norm": 0.1863139271736145, + "learning_rate": 5.177179937616201e-05, + "loss": 2.128, + "step": 4195 + }, + { + "epoch": 1.8100064697002374, + "grad_norm": 0.1845807284116745, + "learning_rate": 5.173935184839071e-05, + "loss": 2.1744, + "step": 4196 + }, + { + "epoch": 1.8104377830493854, + "grad_norm": 0.17936205863952637, + "learning_rate": 5.170690913699099e-05, + "loss": 2.0823, + "step": 4197 + }, + { + "epoch": 1.8108690963985334, + "grad_norm": 0.17481139302253723, + "learning_rate": 5.167447124868043e-05, + "loss": 2.2196, + "step": 4198 + }, + { + "epoch": 1.8113004097476817, + "grad_norm": 0.2126890867948532, + "learning_rate": 5.1642038190175704e-05, + "loss": 1.7506, + "step": 4199 + }, + { + "epoch": 1.81173172309683, + "grad_norm": 0.1977444440126419, + "learning_rate": 5.1609609968192424e-05, + "loss": 2.1454, + "step": 4200 + }, + { + "epoch": 1.81173172309683, + "eval_loss": 2.090848684310913, + "eval_runtime": 201.5534, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 4200 + }, + { + "epoch": 1.812163036445978, + "grad_norm": 0.2021864801645279, + "learning_rate": 5.1577186589445186e-05, + "loss": 2.1619, + "step": 4201 + }, + { + "epoch": 1.812594349795126, + "grad_norm": 0.1859135925769806, + "learning_rate": 5.154476806064758e-05, + "loss": 2.1467, + "step": 4202 + }, + { + "epoch": 1.8130256631442743, + "grad_norm": 0.2111302763223648, + "learning_rate": 5.1512354388512274e-05, + "loss": 2.1874, + "step": 4203 + }, + { + "epoch": 1.8134569764934225, + "grad_norm": 0.19402331113815308, + "learning_rate": 5.1479945579750845e-05, + "loss": 2.2364, + "step": 4204 + }, + { + "epoch": 1.8138882898425708, + "grad_norm": 0.19910752773284912, + "learning_rate": 5.1447541641073915e-05, + "loss": 2.2734, + "step": 4205 + }, + { + "epoch": 1.8143196031917188, + "grad_norm": 0.1855575144290924, + "learning_rate": 5.1415142579191016e-05, + "loss": 2.3032, + "step": 4206 + }, + { + "epoch": 1.8147509165408668, + "grad_norm": 0.20500385761260986, + "learning_rate": 5.1382748400810796e-05, + "loss": 2.0675, + "step": 4207 + }, + { + "epoch": 1.815182229890015, + "grad_norm": 0.22097423672676086, + "learning_rate": 5.135035911264078e-05, + "loss": 2.3183, + "step": 4208 + }, + { + "epoch": 1.8156135432391634, + "grad_norm": 0.18906009197235107, + "learning_rate": 5.131797472138752e-05, + "loss": 2.1094, + "step": 4209 + }, + { + "epoch": 1.8160448565883114, + "grad_norm": 0.1831335723400116, + "learning_rate": 5.128559523375661e-05, + "loss": 2.0186, + "step": 4210 + }, + { + "epoch": 1.8164761699374594, + "grad_norm": 0.19466155767440796, + "learning_rate": 5.125322065645253e-05, + "loss": 2.1229, + "step": 4211 + }, + { + "epoch": 1.8169074832866077, + "grad_norm": 0.4434211850166321, + "learning_rate": 5.122085099617882e-05, + "loss": 2.2268, + "step": 4212 + }, + { + "epoch": 1.817338796635756, + "grad_norm": 0.1960372030735016, + "learning_rate": 5.11884862596379e-05, + "loss": 2.1383, + "step": 4213 + }, + { + "epoch": 1.8177701099849042, + "grad_norm": 0.1842081993818283, + "learning_rate": 5.115612645353136e-05, + "loss": 2.2059, + "step": 4214 + }, + { + "epoch": 1.8182014233340522, + "grad_norm": 0.18694734573364258, + "learning_rate": 5.112377158455957e-05, + "loss": 2.1145, + "step": 4215 + }, + { + "epoch": 1.8186327366832002, + "grad_norm": 0.2071239948272705, + "learning_rate": 5.109142165942196e-05, + "loss": 2.2563, + "step": 4216 + }, + { + "epoch": 1.8190640500323485, + "grad_norm": 0.17906248569488525, + "learning_rate": 5.105907668481697e-05, + "loss": 2.1458, + "step": 4217 + }, + { + "epoch": 1.8194953633814968, + "grad_norm": 0.18535931408405304, + "learning_rate": 5.1026736667441975e-05, + "loss": 2.1868, + "step": 4218 + }, + { + "epoch": 1.8199266767306448, + "grad_norm": 0.19665364921092987, + "learning_rate": 5.099440161399333e-05, + "loss": 2.2715, + "step": 4219 + }, + { + "epoch": 1.8203579900797928, + "grad_norm": 0.20698903501033783, + "learning_rate": 5.0962071531166316e-05, + "loss": 2.2083, + "step": 4220 + }, + { + "epoch": 1.820789303428941, + "grad_norm": 0.19022543728351593, + "learning_rate": 5.0929746425655286e-05, + "loss": 1.999, + "step": 4221 + }, + { + "epoch": 1.8212206167780893, + "grad_norm": 0.18219402432441711, + "learning_rate": 5.089742630415349e-05, + "loss": 2.2458, + "step": 4222 + }, + { + "epoch": 1.8216519301272376, + "grad_norm": 0.19657008349895477, + "learning_rate": 5.086511117335313e-05, + "loss": 2.1712, + "step": 4223 + }, + { + "epoch": 1.8220832434763856, + "grad_norm": 0.19631534814834595, + "learning_rate": 5.0832801039945466e-05, + "loss": 2.0941, + "step": 4224 + }, + { + "epoch": 1.8225145568255336, + "grad_norm": 0.2034425437450409, + "learning_rate": 5.080049591062065e-05, + "loss": 2.0156, + "step": 4225 + }, + { + "epoch": 1.8225145568255336, + "eval_loss": 2.090664863586426, + "eval_runtime": 210.7797, + "eval_samples_per_second": 0.152, + "eval_steps_per_second": 0.152, + "step": 4225 + }, + { + "epoch": 1.822945870174682, + "grad_norm": 0.20252689719200134, + "learning_rate": 5.076819579206777e-05, + "loss": 2.1967, + "step": 4226 + }, + { + "epoch": 1.8233771835238302, + "grad_norm": 0.17087893187999725, + "learning_rate": 5.073590069097494e-05, + "loss": 1.9148, + "step": 4227 + }, + { + "epoch": 1.8238084968729782, + "grad_norm": 0.18986919522285461, + "learning_rate": 5.0703610614029234e-05, + "loss": 2.2916, + "step": 4228 + }, + { + "epoch": 1.8242398102221262, + "grad_norm": 0.20265305042266846, + "learning_rate": 5.0671325567916644e-05, + "loss": 2.177, + "step": 4229 + }, + { + "epoch": 1.8246711235712745, + "grad_norm": 0.183542400598526, + "learning_rate": 5.063904555932214e-05, + "loss": 2.1599, + "step": 4230 + }, + { + "epoch": 1.8251024369204227, + "grad_norm": 0.20302563905715942, + "learning_rate": 5.060677059492967e-05, + "loss": 2.1233, + "step": 4231 + }, + { + "epoch": 1.825533750269571, + "grad_norm": 0.15920883417129517, + "learning_rate": 5.057450068142214e-05, + "loss": 1.9773, + "step": 4232 + }, + { + "epoch": 1.825965063618719, + "grad_norm": 0.17501461505889893, + "learning_rate": 5.054223582548131e-05, + "loss": 2.1985, + "step": 4233 + }, + { + "epoch": 1.826396376967867, + "grad_norm": 0.16960765421390533, + "learning_rate": 5.050997603378803e-05, + "loss": 2.2563, + "step": 4234 + }, + { + "epoch": 1.8268276903170153, + "grad_norm": 0.19847039878368378, + "learning_rate": 5.047772131302204e-05, + "loss": 2.1497, + "step": 4235 + }, + { + "epoch": 1.8272590036661636, + "grad_norm": 0.18529687821865082, + "learning_rate": 5.044547166986202e-05, + "loss": 2.1033, + "step": 4236 + }, + { + "epoch": 1.8276903170153116, + "grad_norm": 0.1980198174715042, + "learning_rate": 5.0413227110985596e-05, + "loss": 2.0504, + "step": 4237 + }, + { + "epoch": 1.8281216303644596, + "grad_norm": 0.17270773649215698, + "learning_rate": 5.0380987643069394e-05, + "loss": 2.0891, + "step": 4238 + }, + { + "epoch": 1.8285529437136079, + "grad_norm": 0.19012048840522766, + "learning_rate": 5.0348753272788946e-05, + "loss": 2.1669, + "step": 4239 + }, + { + "epoch": 1.8289842570627561, + "grad_norm": 0.1755671203136444, + "learning_rate": 5.03165240068187e-05, + "loss": 2.0635, + "step": 4240 + }, + { + "epoch": 1.8294155704119044, + "grad_norm": 0.1877172291278839, + "learning_rate": 5.028429985183208e-05, + "loss": 2.1066, + "step": 4241 + }, + { + "epoch": 1.8298468837610524, + "grad_norm": 0.18054725229740143, + "learning_rate": 5.0252080814501466e-05, + "loss": 2.0344, + "step": 4242 + }, + { + "epoch": 1.8302781971102005, + "grad_norm": 0.17783662676811218, + "learning_rate": 5.0219866901498166e-05, + "loss": 1.9928, + "step": 4243 + }, + { + "epoch": 1.8307095104593487, + "grad_norm": 0.19931326806545258, + "learning_rate": 5.0187658119492394e-05, + "loss": 1.9977, + "step": 4244 + }, + { + "epoch": 1.831140823808497, + "grad_norm": 0.19601735472679138, + "learning_rate": 5.0155454475153376e-05, + "loss": 2.187, + "step": 4245 + }, + { + "epoch": 1.831572137157645, + "grad_norm": 0.19454285502433777, + "learning_rate": 5.012325597514922e-05, + "loss": 2.204, + "step": 4246 + }, + { + "epoch": 1.832003450506793, + "grad_norm": 0.1874435693025589, + "learning_rate": 5.0091062626146936e-05, + "loss": 1.9728, + "step": 4247 + }, + { + "epoch": 1.8324347638559413, + "grad_norm": 0.1848047524690628, + "learning_rate": 5.0058874434812524e-05, + "loss": 2.1505, + "step": 4248 + }, + { + "epoch": 1.8328660772050895, + "grad_norm": 0.2200363278388977, + "learning_rate": 5.0026691407810926e-05, + "loss": 1.9998, + "step": 4249 + }, + { + "epoch": 1.8332973905542378, + "grad_norm": 0.20545153319835663, + "learning_rate": 4.9994513551805976e-05, + "loss": 2.1764, + "step": 4250 + }, + { + "epoch": 1.8332973905542378, + "eval_loss": 2.0905096530914307, + "eval_runtime": 204.2211, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 4250 + }, + { + "epoch": 1.8337287039033858, + "grad_norm": 0.19514457881450653, + "learning_rate": 4.996234087346047e-05, + "loss": 2.2134, + "step": 4251 + }, + { + "epoch": 1.8341600172525339, + "grad_norm": 0.18694037199020386, + "learning_rate": 4.9930173379436043e-05, + "loss": 2.2111, + "step": 4252 + }, + { + "epoch": 1.8345913306016821, + "grad_norm": 0.1747187077999115, + "learning_rate": 4.9898011076393426e-05, + "loss": 2.2134, + "step": 4253 + }, + { + "epoch": 1.8350226439508304, + "grad_norm": 0.1897505521774292, + "learning_rate": 4.9865853970992104e-05, + "loss": 2.201, + "step": 4254 + }, + { + "epoch": 1.8354539572999784, + "grad_norm": 0.1944442093372345, + "learning_rate": 4.9833702069890575e-05, + "loss": 2.1418, + "step": 4255 + }, + { + "epoch": 1.8358852706491264, + "grad_norm": 0.21999591588974, + "learning_rate": 4.9801555379746254e-05, + "loss": 2.1245, + "step": 4256 + }, + { + "epoch": 1.8363165839982747, + "grad_norm": 0.19068877398967743, + "learning_rate": 4.976941390721546e-05, + "loss": 2.124, + "step": 4257 + }, + { + "epoch": 1.836747897347423, + "grad_norm": 0.19536010921001434, + "learning_rate": 4.973727765895345e-05, + "loss": 2.2138, + "step": 4258 + }, + { + "epoch": 1.8371792106965712, + "grad_norm": 0.20152810215950012, + "learning_rate": 4.970514664161432e-05, + "loss": 2.1205, + "step": 4259 + }, + { + "epoch": 1.8376105240457192, + "grad_norm": 0.18851511180400848, + "learning_rate": 4.9673020861851245e-05, + "loss": 2.0916, + "step": 4260 + }, + { + "epoch": 1.8380418373948673, + "grad_norm": 0.19676554203033447, + "learning_rate": 4.964090032631616e-05, + "loss": 2.2678, + "step": 4261 + }, + { + "epoch": 1.8384731507440155, + "grad_norm": 0.18774870038032532, + "learning_rate": 4.960878504165996e-05, + "loss": 2.3108, + "step": 4262 + }, + { + "epoch": 1.8389044640931638, + "grad_norm": 0.18802188336849213, + "learning_rate": 4.957667501453252e-05, + "loss": 2.1035, + "step": 4263 + }, + { + "epoch": 1.8393357774423118, + "grad_norm": 0.18654939532279968, + "learning_rate": 4.954457025158253e-05, + "loss": 2.0947, + "step": 4264 + }, + { + "epoch": 1.8397670907914598, + "grad_norm": 0.17369955778121948, + "learning_rate": 4.951247075945767e-05, + "loss": 2.1399, + "step": 4265 + }, + { + "epoch": 1.840198404140608, + "grad_norm": 0.1797541379928589, + "learning_rate": 4.948037654480443e-05, + "loss": 2.13, + "step": 4266 + }, + { + "epoch": 1.8406297174897563, + "grad_norm": 0.19826960563659668, + "learning_rate": 4.944828761426832e-05, + "loss": 2.2791, + "step": 4267 + }, + { + "epoch": 1.8410610308389046, + "grad_norm": 0.21253716945648193, + "learning_rate": 4.9416203974493684e-05, + "loss": 2.2161, + "step": 4268 + }, + { + "epoch": 1.8414923441880526, + "grad_norm": 0.177699014544487, + "learning_rate": 4.9384125632123784e-05, + "loss": 2.127, + "step": 4269 + }, + { + "epoch": 1.8419236575372007, + "grad_norm": 0.19456614553928375, + "learning_rate": 4.935205259380082e-05, + "loss": 2.1979, + "step": 4270 + }, + { + "epoch": 1.842354970886349, + "grad_norm": 0.21789413690567017, + "learning_rate": 4.9319984866165855e-05, + "loss": 2.1581, + "step": 4271 + }, + { + "epoch": 1.8427862842354972, + "grad_norm": 0.18620431423187256, + "learning_rate": 4.928792245585886e-05, + "loss": 2.0833, + "step": 4272 + }, + { + "epoch": 1.8432175975846452, + "grad_norm": 0.2038848102092743, + "learning_rate": 4.925586536951867e-05, + "loss": 2.199, + "step": 4273 + }, + { + "epoch": 1.8436489109337932, + "grad_norm": 0.18201008439064026, + "learning_rate": 4.92238136137831e-05, + "loss": 2.1769, + "step": 4274 + }, + { + "epoch": 1.8440802242829415, + "grad_norm": 0.18450991809368134, + "learning_rate": 4.9191767195288815e-05, + "loss": 2.3893, + "step": 4275 + }, + { + "epoch": 1.8440802242829415, + "eval_loss": 2.0907812118530273, + "eval_runtime": 203.8949, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 4275 + }, + { + "epoch": 1.8445115376320897, + "grad_norm": 0.18853482604026794, + "learning_rate": 4.9159726120671343e-05, + "loss": 2.1397, + "step": 4276 + }, + { + "epoch": 1.844942850981238, + "grad_norm": 0.22452791035175323, + "learning_rate": 4.912769039656519e-05, + "loss": 2.3544, + "step": 4277 + }, + { + "epoch": 1.845374164330386, + "grad_norm": 0.20442944765090942, + "learning_rate": 4.909566002960369e-05, + "loss": 2.2584, + "step": 4278 + }, + { + "epoch": 1.845805477679534, + "grad_norm": 0.2112889438867569, + "learning_rate": 4.906363502641904e-05, + "loss": 2.221, + "step": 4279 + }, + { + "epoch": 1.8462367910286823, + "grad_norm": 0.19078004360198975, + "learning_rate": 4.9031615393642375e-05, + "loss": 2.2339, + "step": 4280 + }, + { + "epoch": 1.8466681043778306, + "grad_norm": 0.17326442897319794, + "learning_rate": 4.899960113790376e-05, + "loss": 2.1394, + "step": 4281 + }, + { + "epoch": 1.8470994177269786, + "grad_norm": 0.2283521592617035, + "learning_rate": 4.8967592265832054e-05, + "loss": 2.2435, + "step": 4282 + }, + { + "epoch": 1.8475307310761266, + "grad_norm": 0.18705476820468903, + "learning_rate": 4.893558878405504e-05, + "loss": 2.4744, + "step": 4283 + }, + { + "epoch": 1.847962044425275, + "grad_norm": 0.1761569231748581, + "learning_rate": 4.8903590699199416e-05, + "loss": 2.0534, + "step": 4284 + }, + { + "epoch": 1.8483933577744232, + "grad_norm": 0.19114083051681519, + "learning_rate": 4.887159801789074e-05, + "loss": 2.2539, + "step": 4285 + }, + { + "epoch": 1.8488246711235714, + "grad_norm": 0.1969086229801178, + "learning_rate": 4.883961074675343e-05, + "loss": 2.2408, + "step": 4286 + }, + { + "epoch": 1.8492559844727194, + "grad_norm": 0.19349128007888794, + "learning_rate": 4.880762889241076e-05, + "loss": 2.0226, + "step": 4287 + }, + { + "epoch": 1.8496872978218675, + "grad_norm": 0.1920590102672577, + "learning_rate": 4.877565246148499e-05, + "loss": 2.2275, + "step": 4288 + }, + { + "epoch": 1.8501186111710157, + "grad_norm": 0.17939484119415283, + "learning_rate": 4.8743681460597164e-05, + "loss": 2.1999, + "step": 4289 + }, + { + "epoch": 1.850549924520164, + "grad_norm": 0.19642594456672668, + "learning_rate": 4.8711715896367206e-05, + "loss": 2.1037, + "step": 4290 + }, + { + "epoch": 1.850981237869312, + "grad_norm": 0.1994245946407318, + "learning_rate": 4.867975577541398e-05, + "loss": 2.0785, + "step": 4291 + }, + { + "epoch": 1.85141255121846, + "grad_norm": 0.22636477649211884, + "learning_rate": 4.8647801104355165e-05, + "loss": 2.1379, + "step": 4292 + }, + { + "epoch": 1.8518438645676083, + "grad_norm": 0.18629512190818787, + "learning_rate": 4.8615851889807294e-05, + "loss": 2.2186, + "step": 4293 + }, + { + "epoch": 1.8522751779167566, + "grad_norm": 0.1882411539554596, + "learning_rate": 4.85839081383858e-05, + "loss": 2.1205, + "step": 4294 + }, + { + "epoch": 1.8527064912659048, + "grad_norm": 0.17799854278564453, + "learning_rate": 4.855196985670504e-05, + "loss": 2.143, + "step": 4295 + }, + { + "epoch": 1.8531378046150528, + "grad_norm": 0.19559095799922943, + "learning_rate": 4.852003705137815e-05, + "loss": 2.1317, + "step": 4296 + }, + { + "epoch": 1.8535691179642009, + "grad_norm": 0.18567374348640442, + "learning_rate": 4.848810972901717e-05, + "loss": 2.1988, + "step": 4297 + }, + { + "epoch": 1.8540004313133491, + "grad_norm": 0.21284270286560059, + "learning_rate": 4.845618789623296e-05, + "loss": 2.1892, + "step": 4298 + }, + { + "epoch": 1.8544317446624974, + "grad_norm": 0.18262144923210144, + "learning_rate": 4.842427155963537e-05, + "loss": 2.0618, + "step": 4299 + }, + { + "epoch": 1.8548630580116454, + "grad_norm": 0.19378851354122162, + "learning_rate": 4.839236072583297e-05, + "loss": 2.1478, + "step": 4300 + }, + { + "epoch": 1.8548630580116454, + "eval_loss": 2.0903193950653076, + "eval_runtime": 202.772, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 4300 + }, + { + "epoch": 1.8552943713607937, + "grad_norm": 0.19272704422473907, + "learning_rate": 4.8360455401433236e-05, + "loss": 2.0959, + "step": 4301 + }, + { + "epoch": 1.8557256847099417, + "grad_norm": 0.21725627779960632, + "learning_rate": 4.832855559304256e-05, + "loss": 2.2652, + "step": 4302 + }, + { + "epoch": 1.85615699805909, + "grad_norm": 0.18098394572734833, + "learning_rate": 4.829666130726611e-05, + "loss": 2.2715, + "step": 4303 + }, + { + "epoch": 1.8565883114082382, + "grad_norm": 0.17119595408439636, + "learning_rate": 4.8264772550707994e-05, + "loss": 2.1291, + "step": 4304 + }, + { + "epoch": 1.8570196247573862, + "grad_norm": 0.19038599729537964, + "learning_rate": 4.823288932997104e-05, + "loss": 1.9872, + "step": 4305 + }, + { + "epoch": 1.8574509381065343, + "grad_norm": 0.2354617565870285, + "learning_rate": 4.82010116516571e-05, + "loss": 2.251, + "step": 4306 + }, + { + "epoch": 1.8578822514556825, + "grad_norm": 0.18846596777439117, + "learning_rate": 4.8169139522366756e-05, + "loss": 2.1968, + "step": 4307 + }, + { + "epoch": 1.8583135648048308, + "grad_norm": 0.1935591846704483, + "learning_rate": 4.813727294869949e-05, + "loss": 2.2544, + "step": 4308 + }, + { + "epoch": 1.8587448781539788, + "grad_norm": 0.19950206577777863, + "learning_rate": 4.810541193725364e-05, + "loss": 2.2831, + "step": 4309 + }, + { + "epoch": 1.859176191503127, + "grad_norm": 0.1986803561449051, + "learning_rate": 4.8073556494626366e-05, + "loss": 2.2389, + "step": 4310 + }, + { + "epoch": 1.859607504852275, + "grad_norm": 0.1936737447977066, + "learning_rate": 4.8041706627413676e-05, + "loss": 2.1537, + "step": 4311 + }, + { + "epoch": 1.8600388182014234, + "grad_norm": 0.17519107460975647, + "learning_rate": 4.800986234221041e-05, + "loss": 2.1312, + "step": 4312 + }, + { + "epoch": 1.8604701315505716, + "grad_norm": 0.1902090162038803, + "learning_rate": 4.797802364561033e-05, + "loss": 2.3116, + "step": 4313 + }, + { + "epoch": 1.8609014448997196, + "grad_norm": 0.1804785579442978, + "learning_rate": 4.794619054420597e-05, + "loss": 2.0768, + "step": 4314 + }, + { + "epoch": 1.8613327582488677, + "grad_norm": 0.19297802448272705, + "learning_rate": 4.7914363044588684e-05, + "loss": 1.7029, + "step": 4315 + }, + { + "epoch": 1.861764071598016, + "grad_norm": 0.19850334525108337, + "learning_rate": 4.788254115334876e-05, + "loss": 2.0629, + "step": 4316 + }, + { + "epoch": 1.8621953849471642, + "grad_norm": 0.2027156800031662, + "learning_rate": 4.785072487707525e-05, + "loss": 2.1974, + "step": 4317 + }, + { + "epoch": 1.8626266982963122, + "grad_norm": 0.2021334320306778, + "learning_rate": 4.7818914222356044e-05, + "loss": 2.285, + "step": 4318 + }, + { + "epoch": 1.8630580116454605, + "grad_norm": 0.1869433969259262, + "learning_rate": 4.7787109195777876e-05, + "loss": 2.2128, + "step": 4319 + }, + { + "epoch": 1.8634893249946085, + "grad_norm": 0.1924419105052948, + "learning_rate": 4.775530980392636e-05, + "loss": 2.1366, + "step": 4320 + }, + { + "epoch": 1.8639206383437568, + "grad_norm": 0.17339825630187988, + "learning_rate": 4.772351605338589e-05, + "loss": 2.0719, + "step": 4321 + }, + { + "epoch": 1.864351951692905, + "grad_norm": 0.1681831032037735, + "learning_rate": 4.76917279507397e-05, + "loss": 1.7802, + "step": 4322 + }, + { + "epoch": 1.864783265042053, + "grad_norm": 0.19009758532047272, + "learning_rate": 4.765994550256989e-05, + "loss": 2.1927, + "step": 4323 + }, + { + "epoch": 1.865214578391201, + "grad_norm": 0.17810070514678955, + "learning_rate": 4.762816871545736e-05, + "loss": 2.1567, + "step": 4324 + }, + { + "epoch": 1.8656458917403493, + "grad_norm": 0.17919707298278809, + "learning_rate": 4.759639759598183e-05, + "loss": 2.0385, + "step": 4325 + }, + { + "epoch": 1.8656458917403493, + "eval_loss": 2.0903007984161377, + "eval_runtime": 206.9154, + "eval_samples_per_second": 0.155, + "eval_steps_per_second": 0.155, + "step": 4325 + }, + { + "epoch": 1.8660772050894976, + "grad_norm": 0.21091590821743011, + "learning_rate": 4.756463215072183e-05, + "loss": 2.3909, + "step": 4326 + }, + { + "epoch": 1.8665085184386456, + "grad_norm": 0.20307664573192596, + "learning_rate": 4.7532872386254786e-05, + "loss": 2.2274, + "step": 4327 + }, + { + "epoch": 1.8669398317877939, + "grad_norm": 0.1939563900232315, + "learning_rate": 4.7501118309156896e-05, + "loss": 2.103, + "step": 4328 + }, + { + "epoch": 1.867371145136942, + "grad_norm": 0.1929791271686554, + "learning_rate": 4.746936992600317e-05, + "loss": 2.2962, + "step": 4329 + }, + { + "epoch": 1.8678024584860902, + "grad_norm": 0.17812767624855042, + "learning_rate": 4.7437627243367475e-05, + "loss": 2.1675, + "step": 4330 + }, + { + "epoch": 1.8682337718352384, + "grad_norm": 0.18598608672618866, + "learning_rate": 4.74058902678225e-05, + "loss": 2.042, + "step": 4331 + }, + { + "epoch": 1.8686650851843865, + "grad_norm": 0.19546982645988464, + "learning_rate": 4.737415900593969e-05, + "loss": 2.2392, + "step": 4332 + }, + { + "epoch": 1.8690963985335345, + "grad_norm": 0.16199365258216858, + "learning_rate": 4.734243346428935e-05, + "loss": 2.0524, + "step": 4333 + }, + { + "epoch": 1.8695277118826827, + "grad_norm": 0.18136630952358246, + "learning_rate": 4.731071364944064e-05, + "loss": 2.1181, + "step": 4334 + }, + { + "epoch": 1.869959025231831, + "grad_norm": 0.20470373332500458, + "learning_rate": 4.727899956796148e-05, + "loss": 2.237, + "step": 4335 + }, + { + "epoch": 1.870390338580979, + "grad_norm": 0.21079720556735992, + "learning_rate": 4.724729122641859e-05, + "loss": 1.9939, + "step": 4336 + }, + { + "epoch": 1.8708216519301273, + "grad_norm": 0.19446231424808502, + "learning_rate": 4.721558863137756e-05, + "loss": 2.2238, + "step": 4337 + }, + { + "epoch": 1.8712529652792753, + "grad_norm": 0.19449461996555328, + "learning_rate": 4.718389178940277e-05, + "loss": 2.1381, + "step": 4338 + }, + { + "epoch": 1.8716842786284236, + "grad_norm": 0.18257984519004822, + "learning_rate": 4.715220070705737e-05, + "loss": 2.2377, + "step": 4339 + }, + { + "epoch": 1.8721155919775718, + "grad_norm": 0.18961714208126068, + "learning_rate": 4.7120515390903325e-05, + "loss": 2.245, + "step": 4340 + }, + { + "epoch": 1.8725469053267199, + "grad_norm": 0.20834636688232422, + "learning_rate": 4.708883584750147e-05, + "loss": 2.1548, + "step": 4341 + }, + { + "epoch": 1.872978218675868, + "grad_norm": 0.20628146827220917, + "learning_rate": 4.705716208341138e-05, + "loss": 2.2773, + "step": 4342 + }, + { + "epoch": 1.8734095320250161, + "grad_norm": 0.19520138204097748, + "learning_rate": 4.7025494105191486e-05, + "loss": 2.1877, + "step": 4343 + }, + { + "epoch": 1.8738408453741644, + "grad_norm": 0.20871102809906006, + "learning_rate": 4.6993831919398925e-05, + "loss": 2.2529, + "step": 4344 + }, + { + "epoch": 1.8742721587233124, + "grad_norm": 0.19515718519687653, + "learning_rate": 4.696217553258975e-05, + "loss": 2.2391, + "step": 4345 + }, + { + "epoch": 1.8747034720724607, + "grad_norm": 0.1878291219472885, + "learning_rate": 4.6930524951318735e-05, + "loss": 2.1282, + "step": 4346 + }, + { + "epoch": 1.8751347854216087, + "grad_norm": 0.19603882730007172, + "learning_rate": 4.68988801821395e-05, + "loss": 2.1458, + "step": 4347 + }, + { + "epoch": 1.875566098770757, + "grad_norm": 0.21738454699516296, + "learning_rate": 4.6867241231604425e-05, + "loss": 2.2133, + "step": 4348 + }, + { + "epoch": 1.8759974121199052, + "grad_norm": 0.22887803614139557, + "learning_rate": 4.683560810626473e-05, + "loss": 2.1376, + "step": 4349 + }, + { + "epoch": 1.8764287254690533, + "grad_norm": 0.18480657041072845, + "learning_rate": 4.680398081267037e-05, + "loss": 1.9866, + "step": 4350 + }, + { + "epoch": 1.8764287254690533, + "eval_loss": 2.0904784202575684, + "eval_runtime": 205.3133, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 4350 + }, + { + "epoch": 1.8768600388182013, + "grad_norm": 0.1917465478181839, + "learning_rate": 4.677235935737011e-05, + "loss": 2.1556, + "step": 4351 + }, + { + "epoch": 1.8772913521673495, + "grad_norm": 0.1883143186569214, + "learning_rate": 4.6740743746911554e-05, + "loss": 2.0558, + "step": 4352 + }, + { + "epoch": 1.8777226655164978, + "grad_norm": 0.19136305153369904, + "learning_rate": 4.670913398784104e-05, + "loss": 2.3204, + "step": 4353 + }, + { + "epoch": 1.878153978865646, + "grad_norm": 0.1742907613515854, + "learning_rate": 4.66775300867037e-05, + "loss": 2.0352, + "step": 4354 + }, + { + "epoch": 1.878585292214794, + "grad_norm": 0.18656355142593384, + "learning_rate": 4.66459320500435e-05, + "loss": 2.1635, + "step": 4355 + }, + { + "epoch": 1.8790166055639421, + "grad_norm": 0.1927529126405716, + "learning_rate": 4.6614339884403165e-05, + "loss": 2.1339, + "step": 4356 + }, + { + "epoch": 1.8794479189130904, + "grad_norm": 0.17686046659946442, + "learning_rate": 4.658275359632416e-05, + "loss": 2.1561, + "step": 4357 + }, + { + "epoch": 1.8798792322622386, + "grad_norm": 0.20081140100955963, + "learning_rate": 4.655117319234677e-05, + "loss": 2.1218, + "step": 4358 + }, + { + "epoch": 1.8803105456113867, + "grad_norm": 0.20209652185440063, + "learning_rate": 4.65195986790101e-05, + "loss": 2.2096, + "step": 4359 + }, + { + "epoch": 1.8807418589605347, + "grad_norm": 0.1781439483165741, + "learning_rate": 4.648803006285198e-05, + "loss": 2.1468, + "step": 4360 + }, + { + "epoch": 1.881173172309683, + "grad_norm": 0.2034546285867691, + "learning_rate": 4.645646735040902e-05, + "loss": 2.2163, + "step": 4361 + }, + { + "epoch": 1.8816044856588312, + "grad_norm": 0.18482056260108948, + "learning_rate": 4.642491054821667e-05, + "loss": 2.1812, + "step": 4362 + }, + { + "epoch": 1.8820357990079795, + "grad_norm": 0.2234092801809311, + "learning_rate": 4.639335966280908e-05, + "loss": 1.8202, + "step": 4363 + }, + { + "epoch": 1.8824671123571275, + "grad_norm": 0.24373753368854523, + "learning_rate": 4.63618147007192e-05, + "loss": 2.0694, + "step": 4364 + }, + { + "epoch": 1.8828984257062755, + "grad_norm": 0.20102065801620483, + "learning_rate": 4.633027566847875e-05, + "loss": 2.048, + "step": 4365 + }, + { + "epoch": 1.8833297390554238, + "grad_norm": 0.19503894448280334, + "learning_rate": 4.6298742572618266e-05, + "loss": 2.228, + "step": 4366 + }, + { + "epoch": 1.883761052404572, + "grad_norm": 0.18882206082344055, + "learning_rate": 4.626721541966701e-05, + "loss": 2.1524, + "step": 4367 + }, + { + "epoch": 1.88419236575372, + "grad_norm": 0.20258937776088715, + "learning_rate": 4.6235694216152995e-05, + "loss": 2.344, + "step": 4368 + }, + { + "epoch": 1.884623679102868, + "grad_norm": 0.1838570535182953, + "learning_rate": 4.620417896860307e-05, + "loss": 2.1242, + "step": 4369 + }, + { + "epoch": 1.8850549924520164, + "grad_norm": 0.1915067583322525, + "learning_rate": 4.6172669683542816e-05, + "loss": 2.2123, + "step": 4370 + }, + { + "epoch": 1.8854863058011646, + "grad_norm": 0.20054484903812408, + "learning_rate": 4.614116636749654e-05, + "loss": 2.2716, + "step": 4371 + }, + { + "epoch": 1.8859176191503129, + "grad_norm": 0.1780989170074463, + "learning_rate": 4.6109669026987336e-05, + "loss": 2.0959, + "step": 4372 + }, + { + "epoch": 1.886348932499461, + "grad_norm": 0.19114157557487488, + "learning_rate": 4.607817766853712e-05, + "loss": 2.2411, + "step": 4373 + }, + { + "epoch": 1.886780245848609, + "grad_norm": 0.20129813253879547, + "learning_rate": 4.6046692298666506e-05, + "loss": 2.1702, + "step": 4374 + }, + { + "epoch": 1.8872115591977572, + "grad_norm": 0.18966692686080933, + "learning_rate": 4.6015212923894864e-05, + "loss": 2.175, + "step": 4375 + }, + { + "epoch": 1.8872115591977572, + "eval_loss": 2.0902810096740723, + "eval_runtime": 206.0401, + "eval_samples_per_second": 0.155, + "eval_steps_per_second": 0.155, + "step": 4375 + }, + { + "epoch": 1.8876428725469054, + "grad_norm": 0.18641971051692963, + "learning_rate": 4.598373955074037e-05, + "loss": 1.9328, + "step": 4376 + }, + { + "epoch": 1.8880741858960535, + "grad_norm": 0.17950253188610077, + "learning_rate": 4.595227218571994e-05, + "loss": 2.2209, + "step": 4377 + }, + { + "epoch": 1.8885054992452015, + "grad_norm": 0.18584321439266205, + "learning_rate": 4.592081083534919e-05, + "loss": 2.1834, + "step": 4378 + }, + { + "epoch": 1.8889368125943498, + "grad_norm": 0.18755564093589783, + "learning_rate": 4.588935550614255e-05, + "loss": 2.0961, + "step": 4379 + }, + { + "epoch": 1.889368125943498, + "grad_norm": 0.19564920663833618, + "learning_rate": 4.585790620461322e-05, + "loss": 2.3325, + "step": 4380 + }, + { + "epoch": 1.8897994392926463, + "grad_norm": 0.1829662322998047, + "learning_rate": 4.58264629372731e-05, + "loss": 2.0914, + "step": 4381 + }, + { + "epoch": 1.8902307526417943, + "grad_norm": 0.19613660871982574, + "learning_rate": 4.579502571063284e-05, + "loss": 2.2863, + "step": 4382 + }, + { + "epoch": 1.8906620659909423, + "grad_norm": 0.17947323620319366, + "learning_rate": 4.5763594531201914e-05, + "loss": 2.162, + "step": 4383 + }, + { + "epoch": 1.8910933793400906, + "grad_norm": 0.18751266598701477, + "learning_rate": 4.573216940548847e-05, + "loss": 1.9898, + "step": 4384 + }, + { + "epoch": 1.8915246926892388, + "grad_norm": 0.18965020775794983, + "learning_rate": 4.5700750339999416e-05, + "loss": 2.1875, + "step": 4385 + }, + { + "epoch": 1.8919560060383869, + "grad_norm": 0.18930767476558685, + "learning_rate": 4.56693373412404e-05, + "loss": 2.2875, + "step": 4386 + }, + { + "epoch": 1.892387319387535, + "grad_norm": 0.1917998045682907, + "learning_rate": 4.563793041571585e-05, + "loss": 2.2625, + "step": 4387 + }, + { + "epoch": 1.8928186327366832, + "grad_norm": 0.19999705255031586, + "learning_rate": 4.56065295699289e-05, + "loss": 2.2053, + "step": 4388 + }, + { + "epoch": 1.8932499460858314, + "grad_norm": 0.1889103204011917, + "learning_rate": 4.557513481038144e-05, + "loss": 2.023, + "step": 4389 + }, + { + "epoch": 1.8936812594349797, + "grad_norm": 0.19728577136993408, + "learning_rate": 4.5543746143574126e-05, + "loss": 2.0746, + "step": 4390 + }, + { + "epoch": 1.8941125727841277, + "grad_norm": 0.18072137236595154, + "learning_rate": 4.55123635760063e-05, + "loss": 1.9347, + "step": 4391 + }, + { + "epoch": 1.8945438861332757, + "grad_norm": 0.24109086394309998, + "learning_rate": 4.548098711417605e-05, + "loss": 2.2294, + "step": 4392 + }, + { + "epoch": 1.894975199482424, + "grad_norm": 0.20103009045124054, + "learning_rate": 4.544961676458022e-05, + "loss": 2.0669, + "step": 4393 + }, + { + "epoch": 1.8954065128315722, + "grad_norm": 0.21188940107822418, + "learning_rate": 4.541825253371442e-05, + "loss": 2.0499, + "step": 4394 + }, + { + "epoch": 1.8958378261807203, + "grad_norm": 0.2471272051334381, + "learning_rate": 4.538689442807294e-05, + "loss": 2.0948, + "step": 4395 + }, + { + "epoch": 1.8962691395298683, + "grad_norm": 0.18075866997241974, + "learning_rate": 4.5355542454148795e-05, + "loss": 2.2735, + "step": 4396 + }, + { + "epoch": 1.8967004528790166, + "grad_norm": 0.22322553396224976, + "learning_rate": 4.532419661843376e-05, + "loss": 2.3035, + "step": 4397 + }, + { + "epoch": 1.8971317662281648, + "grad_norm": 0.20870225131511688, + "learning_rate": 4.529285692741836e-05, + "loss": 2.199, + "step": 4398 + }, + { + "epoch": 1.897563079577313, + "grad_norm": 0.18207068741321564, + "learning_rate": 4.5261523387591804e-05, + "loss": 2.1874, + "step": 4399 + }, + { + "epoch": 1.897994392926461, + "grad_norm": 0.17733348906040192, + "learning_rate": 4.523019600544203e-05, + "loss": 2.2675, + "step": 4400 + }, + { + "epoch": 1.897994392926461, + "eval_loss": 2.0905203819274902, + "eval_runtime": 205.3714, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 4400 + }, + { + "epoch": 1.8984257062756091, + "grad_norm": 0.1851731687784195, + "learning_rate": 4.519887478745575e-05, + "loss": 2.0445, + "step": 4401 + }, + { + "epoch": 1.8988570196247574, + "grad_norm": 0.1968875527381897, + "learning_rate": 4.5167559740118354e-05, + "loss": 2.1068, + "step": 4402 + }, + { + "epoch": 1.8992883329739056, + "grad_norm": 0.19119924306869507, + "learning_rate": 4.5136250869913936e-05, + "loss": 2.074, + "step": 4403 + }, + { + "epoch": 1.8997196463230537, + "grad_norm": 0.20993638038635254, + "learning_rate": 4.5104948183325354e-05, + "loss": 2.2871, + "step": 4404 + }, + { + "epoch": 1.9001509596722017, + "grad_norm": 0.17726275324821472, + "learning_rate": 4.5073651686834197e-05, + "loss": 2.2556, + "step": 4405 + }, + { + "epoch": 1.90058227302135, + "grad_norm": 0.1941853165626526, + "learning_rate": 4.5042361386920714e-05, + "loss": 2.4722, + "step": 4406 + }, + { + "epoch": 1.9010135863704982, + "grad_norm": 0.19786939024925232, + "learning_rate": 4.501107729006391e-05, + "loss": 2.3993, + "step": 4407 + }, + { + "epoch": 1.9014448997196465, + "grad_norm": 0.21278390288352966, + "learning_rate": 4.4979799402741517e-05, + "loss": 1.9771, + "step": 4408 + }, + { + "epoch": 1.9018762130687945, + "grad_norm": 0.17766079306602478, + "learning_rate": 4.494852773142998e-05, + "loss": 2.1706, + "step": 4409 + }, + { + "epoch": 1.9023075264179425, + "grad_norm": 0.1998787224292755, + "learning_rate": 4.4917262282604385e-05, + "loss": 2.093, + "step": 4410 + }, + { + "epoch": 1.9027388397670908, + "grad_norm": 0.18379029631614685, + "learning_rate": 4.4886003062738604e-05, + "loss": 2.2796, + "step": 4411 + }, + { + "epoch": 1.903170153116239, + "grad_norm": 0.19698943197727203, + "learning_rate": 4.485475007830523e-05, + "loss": 2.2868, + "step": 4412 + }, + { + "epoch": 1.903601466465387, + "grad_norm": 0.18271850049495697, + "learning_rate": 4.48235033357755e-05, + "loss": 2.1838, + "step": 4413 + }, + { + "epoch": 1.9040327798145351, + "grad_norm": 0.19257652759552002, + "learning_rate": 4.479226284161941e-05, + "loss": 2.2262, + "step": 4414 + }, + { + "epoch": 1.9044640931636834, + "grad_norm": 0.18733306229114532, + "learning_rate": 4.476102860230565e-05, + "loss": 2.0258, + "step": 4415 + }, + { + "epoch": 1.9048954065128316, + "grad_norm": 0.19409288465976715, + "learning_rate": 4.472980062430162e-05, + "loss": 2.2714, + "step": 4416 + }, + { + "epoch": 1.9053267198619799, + "grad_norm": 0.19599054753780365, + "learning_rate": 4.4698578914073395e-05, + "loss": 2.1417, + "step": 4417 + }, + { + "epoch": 1.905758033211128, + "grad_norm": 0.18327564001083374, + "learning_rate": 4.466736347808575e-05, + "loss": 2.2224, + "step": 4418 + }, + { + "epoch": 1.906189346560276, + "grad_norm": 0.2031586766242981, + "learning_rate": 4.4636154322802236e-05, + "loss": 2.3707, + "step": 4419 + }, + { + "epoch": 1.9066206599094242, + "grad_norm": 0.21735779941082, + "learning_rate": 4.460495145468501e-05, + "loss": 2.0034, + "step": 4420 + }, + { + "epoch": 1.9070519732585725, + "grad_norm": 0.2004113644361496, + "learning_rate": 4.4573754880194976e-05, + "loss": 2.3207, + "step": 4421 + }, + { + "epoch": 1.9074832866077205, + "grad_norm": 0.19796982407569885, + "learning_rate": 4.4542564605791734e-05, + "loss": 1.9953, + "step": 4422 + }, + { + "epoch": 1.9079145999568685, + "grad_norm": 0.1805429458618164, + "learning_rate": 4.4511380637933595e-05, + "loss": 2.1355, + "step": 4423 + }, + { + "epoch": 1.9083459133060168, + "grad_norm": 0.17156881093978882, + "learning_rate": 4.448020298307749e-05, + "loss": 2.0615, + "step": 4424 + }, + { + "epoch": 1.908777226655165, + "grad_norm": 0.19201727211475372, + "learning_rate": 4.44490316476791e-05, + "loss": 2.2494, + "step": 4425 + }, + { + "epoch": 1.908777226655165, + "eval_loss": 2.0903611183166504, + "eval_runtime": 202.0722, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 4425 + }, + { + "epoch": 1.9092085400043133, + "grad_norm": 0.2967647910118103, + "learning_rate": 4.441786663819282e-05, + "loss": 2.3066, + "step": 4426 + }, + { + "epoch": 1.9096398533534613, + "grad_norm": 0.19616886973381042, + "learning_rate": 4.43867079610717e-05, + "loss": 2.0931, + "step": 4427 + }, + { + "epoch": 1.9100711667026093, + "grad_norm": 0.2792205810546875, + "learning_rate": 4.435555562276746e-05, + "loss": 2.0895, + "step": 4428 + }, + { + "epoch": 1.9105024800517576, + "grad_norm": 0.2176106572151184, + "learning_rate": 4.432440962973059e-05, + "loss": 2.2637, + "step": 4429 + }, + { + "epoch": 1.9109337934009059, + "grad_norm": 0.19296325743198395, + "learning_rate": 4.429326998841014e-05, + "loss": 2.2174, + "step": 4430 + }, + { + "epoch": 1.9113651067500539, + "grad_norm": 0.18656092882156372, + "learning_rate": 4.4262136705253946e-05, + "loss": 2.0757, + "step": 4431 + }, + { + "epoch": 1.911796420099202, + "grad_norm": 0.19575899839401245, + "learning_rate": 4.423100978670847e-05, + "loss": 2.1884, + "step": 4432 + }, + { + "epoch": 1.9122277334483502, + "grad_norm": 0.20120398700237274, + "learning_rate": 4.419988923921893e-05, + "loss": 2.3305, + "step": 4433 + }, + { + "epoch": 1.9126590467974984, + "grad_norm": 0.1956479698419571, + "learning_rate": 4.4168775069229156e-05, + "loss": 2.154, + "step": 4434 + }, + { + "epoch": 1.9130903601466467, + "grad_norm": 0.19334623217582703, + "learning_rate": 4.413766728318162e-05, + "loss": 2.1075, + "step": 4435 + }, + { + "epoch": 1.9135216734957947, + "grad_norm": 0.19544517993927002, + "learning_rate": 4.410656588751762e-05, + "loss": 2.1744, + "step": 4436 + }, + { + "epoch": 1.9139529868449427, + "grad_norm": 0.20631077885627747, + "learning_rate": 4.407547088867699e-05, + "loss": 2.272, + "step": 4437 + }, + { + "epoch": 1.914384300194091, + "grad_norm": 0.18286816775798798, + "learning_rate": 4.4044382293098294e-05, + "loss": 2.2174, + "step": 4438 + }, + { + "epoch": 1.9148156135432393, + "grad_norm": 0.20003879070281982, + "learning_rate": 4.401330010721876e-05, + "loss": 2.1827, + "step": 4439 + }, + { + "epoch": 1.9152469268923873, + "grad_norm": 0.20353899896144867, + "learning_rate": 4.398222433747432e-05, + "loss": 2.2601, + "step": 4440 + }, + { + "epoch": 1.9156782402415353, + "grad_norm": 0.1923852264881134, + "learning_rate": 4.395115499029954e-05, + "loss": 2.0675, + "step": 4441 + }, + { + "epoch": 1.9161095535906836, + "grad_norm": 0.20454512536525726, + "learning_rate": 4.392009207212766e-05, + "loss": 2.289, + "step": 4442 + }, + { + "epoch": 1.9165408669398318, + "grad_norm": 0.20002296566963196, + "learning_rate": 4.388903558939057e-05, + "loss": 2.2905, + "step": 4443 + }, + { + "epoch": 1.91697218028898, + "grad_norm": 0.2085457295179367, + "learning_rate": 4.385798554851891e-05, + "loss": 2.2252, + "step": 4444 + }, + { + "epoch": 1.9174034936381281, + "grad_norm": 0.18940648436546326, + "learning_rate": 4.3826941955941914e-05, + "loss": 2.1704, + "step": 4445 + }, + { + "epoch": 1.9178348069872762, + "grad_norm": 0.20197077095508575, + "learning_rate": 4.3795904818087456e-05, + "loss": 2.2808, + "step": 4446 + }, + { + "epoch": 1.9182661203364244, + "grad_norm": 0.1914563924074173, + "learning_rate": 4.376487414138219e-05, + "loss": 2.1631, + "step": 4447 + }, + { + "epoch": 1.9186974336855727, + "grad_norm": 0.19904504716396332, + "learning_rate": 4.373384993225129e-05, + "loss": 2.0973, + "step": 4448 + }, + { + "epoch": 1.9191287470347207, + "grad_norm": 0.1938726156949997, + "learning_rate": 4.3702832197118694e-05, + "loss": 2.1442, + "step": 4449 + }, + { + "epoch": 1.9195600603838687, + "grad_norm": 0.176131471991539, + "learning_rate": 4.367182094240691e-05, + "loss": 2.3222, + "step": 4450 + }, + { + "epoch": 1.9195600603838687, + "eval_loss": 2.089779853820801, + "eval_runtime": 196.8906, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 4450 + }, + { + "epoch": 1.919991373733017, + "grad_norm": 0.2034505158662796, + "learning_rate": 4.364081617453727e-05, + "loss": 2.1326, + "step": 4451 + }, + { + "epoch": 1.9204226870821652, + "grad_norm": 0.20447219908237457, + "learning_rate": 4.360981789992955e-05, + "loss": 2.2005, + "step": 4452 + }, + { + "epoch": 1.9208540004313135, + "grad_norm": 0.19651800394058228, + "learning_rate": 4.357882612500227e-05, + "loss": 2.1302, + "step": 4453 + }, + { + "epoch": 1.9212853137804615, + "grad_norm": 0.1718902885913849, + "learning_rate": 4.354784085617269e-05, + "loss": 2.0705, + "step": 4454 + }, + { + "epoch": 1.9217166271296096, + "grad_norm": 0.18895719945430756, + "learning_rate": 4.3516862099856625e-05, + "loss": 2.1459, + "step": 4455 + }, + { + "epoch": 1.9221479404787578, + "grad_norm": 0.20136317610740662, + "learning_rate": 4.348588986246857e-05, + "loss": 2.432, + "step": 4456 + }, + { + "epoch": 1.922579253827906, + "grad_norm": 0.21606945991516113, + "learning_rate": 4.345492415042161e-05, + "loss": 2.1322, + "step": 4457 + }, + { + "epoch": 1.923010567177054, + "grad_norm": 0.2078695446252823, + "learning_rate": 4.3423964970127595e-05, + "loss": 2.0868, + "step": 4458 + }, + { + "epoch": 1.9234418805262021, + "grad_norm": 0.1945011019706726, + "learning_rate": 4.339301232799694e-05, + "loss": 2.2264, + "step": 4459 + }, + { + "epoch": 1.9238731938753504, + "grad_norm": 0.18762318789958954, + "learning_rate": 4.336206623043873e-05, + "loss": 2.0857, + "step": 4460 + }, + { + "epoch": 1.9243045072244986, + "grad_norm": 0.21926428377628326, + "learning_rate": 4.3331126683860704e-05, + "loss": 2.0618, + "step": 4461 + }, + { + "epoch": 1.924735820573647, + "grad_norm": 0.20801591873168945, + "learning_rate": 4.330019369466921e-05, + "loss": 2.3102, + "step": 4462 + }, + { + "epoch": 1.925167133922795, + "grad_norm": 0.20444561541080475, + "learning_rate": 4.3269267269269286e-05, + "loss": 2.0529, + "step": 4463 + }, + { + "epoch": 1.925598447271943, + "grad_norm": 0.19824370741844177, + "learning_rate": 4.323834741406454e-05, + "loss": 2.2004, + "step": 4464 + }, + { + "epoch": 1.9260297606210912, + "grad_norm": 0.21752721071243286, + "learning_rate": 4.320743413545736e-05, + "loss": 2.2265, + "step": 4465 + }, + { + "epoch": 1.9264610739702395, + "grad_norm": 0.21213920414447784, + "learning_rate": 4.31765274398486e-05, + "loss": 2.2037, + "step": 4466 + }, + { + "epoch": 1.9268923873193875, + "grad_norm": 0.1710169017314911, + "learning_rate": 4.314562733363781e-05, + "loss": 2.1363, + "step": 4467 + }, + { + "epoch": 1.9273237006685355, + "grad_norm": 0.19569765031337738, + "learning_rate": 4.3114733823223264e-05, + "loss": 2.1503, + "step": 4468 + }, + { + "epoch": 1.9277550140176838, + "grad_norm": 0.17711645364761353, + "learning_rate": 4.308384691500178e-05, + "loss": 1.9612, + "step": 4469 + }, + { + "epoch": 1.928186327366832, + "grad_norm": 0.21835118532180786, + "learning_rate": 4.305296661536884e-05, + "loss": 1.9699, + "step": 4470 + }, + { + "epoch": 1.9286176407159803, + "grad_norm": 0.20168553292751312, + "learning_rate": 4.302209293071847e-05, + "loss": 2.1138, + "step": 4471 + }, + { + "epoch": 1.9290489540651283, + "grad_norm": 0.18217918276786804, + "learning_rate": 4.29912258674435e-05, + "loss": 2.1849, + "step": 4472 + }, + { + "epoch": 1.9294802674142764, + "grad_norm": 0.18976938724517822, + "learning_rate": 4.2960365431935257e-05, + "loss": 2.1309, + "step": 4473 + }, + { + "epoch": 1.9299115807634246, + "grad_norm": 0.1749623417854309, + "learning_rate": 4.2929511630583726e-05, + "loss": 1.9406, + "step": 4474 + }, + { + "epoch": 1.9303428941125729, + "grad_norm": 0.18413615226745605, + "learning_rate": 4.289866446977752e-05, + "loss": 2.1305, + "step": 4475 + }, + { + "epoch": 1.9303428941125729, + "eval_loss": 2.0894153118133545, + "eval_runtime": 196.8798, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 4475 + }, + { + "epoch": 1.930774207461721, + "grad_norm": 0.18336045742034912, + "learning_rate": 4.286782395590391e-05, + "loss": 2.1981, + "step": 4476 + }, + { + "epoch": 1.931205520810869, + "grad_norm": 0.17901524901390076, + "learning_rate": 4.283699009534872e-05, + "loss": 2.0308, + "step": 4477 + }, + { + "epoch": 1.9316368341600172, + "grad_norm": 0.17348520457744598, + "learning_rate": 4.2806162894496445e-05, + "loss": 2.0078, + "step": 4478 + }, + { + "epoch": 1.9320681475091654, + "grad_norm": 0.2009512186050415, + "learning_rate": 4.277534235973026e-05, + "loss": 1.9249, + "step": 4479 + }, + { + "epoch": 1.9324994608583137, + "grad_norm": 0.19998937845230103, + "learning_rate": 4.274452849743182e-05, + "loss": 2.1278, + "step": 4480 + }, + { + "epoch": 1.9329307742074617, + "grad_norm": 0.17956745624542236, + "learning_rate": 4.271372131398147e-05, + "loss": 1.9026, + "step": 4481 + }, + { + "epoch": 1.9333620875566098, + "grad_norm": 0.2049476057291031, + "learning_rate": 4.268292081575821e-05, + "loss": 2.2686, + "step": 4482 + }, + { + "epoch": 1.933793400905758, + "grad_norm": 0.18582724034786224, + "learning_rate": 4.265212700913962e-05, + "loss": 2.2011, + "step": 4483 + }, + { + "epoch": 1.9342247142549063, + "grad_norm": 0.18372566998004913, + "learning_rate": 4.2621339900501914e-05, + "loss": 2.0969, + "step": 4484 + }, + { + "epoch": 1.9346560276040543, + "grad_norm": 0.20212997496128082, + "learning_rate": 4.2590559496219805e-05, + "loss": 2.1321, + "step": 4485 + }, + { + "epoch": 1.9350873409532023, + "grad_norm": 0.1954246163368225, + "learning_rate": 4.25597858026668e-05, + "loss": 2.2599, + "step": 4486 + }, + { + "epoch": 1.9355186543023506, + "grad_norm": 0.22167876362800598, + "learning_rate": 4.2529018826214925e-05, + "loss": 2.2438, + "step": 4487 + }, + { + "epoch": 1.9359499676514988, + "grad_norm": 0.1907162368297577, + "learning_rate": 4.249825857323477e-05, + "loss": 2.3102, + "step": 4488 + }, + { + "epoch": 1.936381281000647, + "grad_norm": 0.20281952619552612, + "learning_rate": 4.246750505009563e-05, + "loss": 2.2093, + "step": 4489 + }, + { + "epoch": 1.9368125943497951, + "grad_norm": 0.19904431700706482, + "learning_rate": 4.2436758263165336e-05, + "loss": 2.0995, + "step": 4490 + }, + { + "epoch": 1.9372439076989432, + "grad_norm": 0.19451448321342468, + "learning_rate": 4.240601821881036e-05, + "loss": 2.2836, + "step": 4491 + }, + { + "epoch": 1.9376752210480914, + "grad_norm": 0.1946668177843094, + "learning_rate": 4.2375284923395736e-05, + "loss": 2.1484, + "step": 4492 + }, + { + "epoch": 1.9381065343972397, + "grad_norm": 0.18578791618347168, + "learning_rate": 4.2344558383285146e-05, + "loss": 2.2732, + "step": 4493 + }, + { + "epoch": 1.9385378477463877, + "grad_norm": 0.16853195428848267, + "learning_rate": 4.231383860484086e-05, + "loss": 2.1543, + "step": 4494 + }, + { + "epoch": 1.9389691610955357, + "grad_norm": 0.2038288712501526, + "learning_rate": 4.228312559442375e-05, + "loss": 2.1296, + "step": 4495 + }, + { + "epoch": 1.939400474444684, + "grad_norm": 0.28761857748031616, + "learning_rate": 4.225241935839323e-05, + "loss": 2.3585, + "step": 4496 + }, + { + "epoch": 1.9398317877938323, + "grad_norm": 0.19457559287548065, + "learning_rate": 4.2221719903107465e-05, + "loss": 2.0445, + "step": 4497 + }, + { + "epoch": 1.9402631011429805, + "grad_norm": 0.3036458492279053, + "learning_rate": 4.2191027234923026e-05, + "loss": 2.0874, + "step": 4498 + }, + { + "epoch": 1.9406944144921285, + "grad_norm": 0.19075198471546173, + "learning_rate": 4.216034136019516e-05, + "loss": 2.3152, + "step": 4499 + }, + { + "epoch": 1.9411257278412766, + "grad_norm": 0.2023703008890152, + "learning_rate": 4.2129662285277776e-05, + "loss": 2.0717, + "step": 4500 + }, + { + "epoch": 1.9411257278412766, + "eval_loss": 2.089297294616699, + "eval_runtime": 196.9909, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 4500 + }, + { + "epoch": 1.9415570411904248, + "grad_norm": 0.18044228851795197, + "learning_rate": 4.209899001652327e-05, + "loss": 2.2405, + "step": 4501 + }, + { + "epoch": 1.941988354539573, + "grad_norm": 0.18647083640098572, + "learning_rate": 4.206832456028272e-05, + "loss": 2.0873, + "step": 4502 + }, + { + "epoch": 1.9424196678887211, + "grad_norm": 0.18730871379375458, + "learning_rate": 4.203766592290564e-05, + "loss": 2.2108, + "step": 4503 + }, + { + "epoch": 1.9428509812378691, + "grad_norm": 0.19313322007656097, + "learning_rate": 4.200701411074034e-05, + "loss": 2.0227, + "step": 4504 + }, + { + "epoch": 1.9432822945870174, + "grad_norm": 0.7987073063850403, + "learning_rate": 4.197636913013356e-05, + "loss": 2.1318, + "step": 4505 + }, + { + "epoch": 1.9437136079361657, + "grad_norm": 0.19378934800624847, + "learning_rate": 4.1945730987430694e-05, + "loss": 2.1356, + "step": 4506 + }, + { + "epoch": 1.944144921285314, + "grad_norm": 0.18642853200435638, + "learning_rate": 4.191509968897571e-05, + "loss": 2.2687, + "step": 4507 + }, + { + "epoch": 1.944576234634462, + "grad_norm": 0.1928897500038147, + "learning_rate": 4.188447524111114e-05, + "loss": 2.3043, + "step": 4508 + }, + { + "epoch": 1.94500754798361, + "grad_norm": 0.18140451610088348, + "learning_rate": 4.185385765017811e-05, + "loss": 2.1565, + "step": 4509 + }, + { + "epoch": 1.9454388613327582, + "grad_norm": 0.1956416219472885, + "learning_rate": 4.1823246922516324e-05, + "loss": 2.0902, + "step": 4510 + }, + { + "epoch": 1.9458701746819065, + "grad_norm": 0.19609729945659637, + "learning_rate": 4.179264306446412e-05, + "loss": 2.3285, + "step": 4511 + }, + { + "epoch": 1.9463014880310545, + "grad_norm": 0.18456441164016724, + "learning_rate": 4.176204608235829e-05, + "loss": 2.0629, + "step": 4512 + }, + { + "epoch": 1.9467328013802028, + "grad_norm": 0.1965542435646057, + "learning_rate": 4.173145598253429e-05, + "loss": 2.2371, + "step": 4513 + }, + { + "epoch": 1.9471641147293508, + "grad_norm": 0.18580764532089233, + "learning_rate": 4.170087277132616e-05, + "loss": 2.1333, + "step": 4514 + }, + { + "epoch": 1.947595428078499, + "grad_norm": 0.18453587591648102, + "learning_rate": 4.167029645506648e-05, + "loss": 2.0632, + "step": 4515 + }, + { + "epoch": 1.9480267414276473, + "grad_norm": 0.19088754057884216, + "learning_rate": 4.163972704008643e-05, + "loss": 2.1317, + "step": 4516 + }, + { + "epoch": 1.9484580547767953, + "grad_norm": 0.20419196784496307, + "learning_rate": 4.160916453271567e-05, + "loss": 2.2218, + "step": 4517 + }, + { + "epoch": 1.9488893681259434, + "grad_norm": 0.20570674538612366, + "learning_rate": 4.157860893928257e-05, + "loss": 2.3841, + "step": 4518 + }, + { + "epoch": 1.9493206814750916, + "grad_norm": 0.18321706354618073, + "learning_rate": 4.154806026611399e-05, + "loss": 2.1808, + "step": 4519 + }, + { + "epoch": 1.9497519948242399, + "grad_norm": 0.1876799464225769, + "learning_rate": 4.151751851953535e-05, + "loss": 2.3955, + "step": 4520 + }, + { + "epoch": 1.950183308173388, + "grad_norm": 0.34781232476234436, + "learning_rate": 4.1486983705870656e-05, + "loss": 2.1793, + "step": 4521 + }, + { + "epoch": 1.9506146215225362, + "grad_norm": 0.18850237131118774, + "learning_rate": 4.145645583144248e-05, + "loss": 2.2796, + "step": 4522 + }, + { + "epoch": 1.9510459348716842, + "grad_norm": 0.18012240529060364, + "learning_rate": 4.1425934902571955e-05, + "loss": 2.2152, + "step": 4523 + }, + { + "epoch": 1.9514772482208325, + "grad_norm": 0.21512307226657867, + "learning_rate": 4.139542092557873e-05, + "loss": 2.1978, + "step": 4524 + }, + { + "epoch": 1.9519085615699807, + "grad_norm": 0.1743171215057373, + "learning_rate": 4.1364913906781164e-05, + "loss": 2.2178, + "step": 4525 + }, + { + "epoch": 1.9519085615699807, + "eval_loss": 2.089406967163086, + "eval_runtime": 196.9204, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 4525 + }, + { + "epoch": 1.9523398749191287, + "grad_norm": 0.18968987464904785, + "learning_rate": 4.133441385249597e-05, + "loss": 2.2091, + "step": 4526 + }, + { + "epoch": 1.9527711882682768, + "grad_norm": 0.1950238198041916, + "learning_rate": 4.130392076903851e-05, + "loss": 2.1411, + "step": 4527 + }, + { + "epoch": 1.953202501617425, + "grad_norm": 0.20082992315292358, + "learning_rate": 4.127343466272278e-05, + "loss": 2.1609, + "step": 4528 + }, + { + "epoch": 1.9536338149665733, + "grad_norm": 0.19735755026340485, + "learning_rate": 4.1242955539861235e-05, + "loss": 2.164, + "step": 4529 + }, + { + "epoch": 1.9540651283157213, + "grad_norm": 0.18233181536197662, + "learning_rate": 4.121248340676493e-05, + "loss": 1.7721, + "step": 4530 + }, + { + "epoch": 1.9544964416648696, + "grad_norm": 0.18331976234912872, + "learning_rate": 4.118201826974338e-05, + "loss": 2.1153, + "step": 4531 + }, + { + "epoch": 1.9549277550140176, + "grad_norm": 0.2071094512939453, + "learning_rate": 4.11515601351048e-05, + "loss": 2.2119, + "step": 4532 + }, + { + "epoch": 1.9553590683631659, + "grad_norm": 0.19779692590236664, + "learning_rate": 4.1121109009155857e-05, + "loss": 2.2707, + "step": 4533 + }, + { + "epoch": 1.9557903817123141, + "grad_norm": 0.19060441851615906, + "learning_rate": 4.109066489820178e-05, + "loss": 2.0028, + "step": 4534 + }, + { + "epoch": 1.9562216950614622, + "grad_norm": 0.19514651596546173, + "learning_rate": 4.106022780854636e-05, + "loss": 2.3064, + "step": 4535 + }, + { + "epoch": 1.9566530084106102, + "grad_norm": 0.19676922261714935, + "learning_rate": 4.102979774649195e-05, + "loss": 2.183, + "step": 4536 + }, + { + "epoch": 1.9570843217597584, + "grad_norm": 0.19450150430202484, + "learning_rate": 4.09993747183394e-05, + "loss": 2.3104, + "step": 4537 + }, + { + "epoch": 1.9575156351089067, + "grad_norm": 0.19470670819282532, + "learning_rate": 4.096895873038813e-05, + "loss": 2.3458, + "step": 4538 + }, + { + "epoch": 1.9579469484580547, + "grad_norm": 0.19570592045783997, + "learning_rate": 4.093854978893612e-05, + "loss": 2.1843, + "step": 4539 + }, + { + "epoch": 1.958378261807203, + "grad_norm": 0.19192062318325043, + "learning_rate": 4.0908147900279855e-05, + "loss": 2.0946, + "step": 4540 + }, + { + "epoch": 1.958809575156351, + "grad_norm": 0.19365684688091278, + "learning_rate": 4.087775307071439e-05, + "loss": 2.0085, + "step": 4541 + }, + { + "epoch": 1.9592408885054993, + "grad_norm": 0.20169289410114288, + "learning_rate": 4.084736530653329e-05, + "loss": 2.323, + "step": 4542 + }, + { + "epoch": 1.9596722018546475, + "grad_norm": 0.18891608715057373, + "learning_rate": 4.081698461402875e-05, + "loss": 2.1272, + "step": 4543 + }, + { + "epoch": 1.9601035152037956, + "grad_norm": 0.17692779004573822, + "learning_rate": 4.078661099949132e-05, + "loss": 2.1207, + "step": 4544 + }, + { + "epoch": 1.9605348285529436, + "grad_norm": 0.1865505576133728, + "learning_rate": 4.0756244469210216e-05, + "loss": 2.2669, + "step": 4545 + }, + { + "epoch": 1.9609661419020918, + "grad_norm": 0.19029918313026428, + "learning_rate": 4.07258850294732e-05, + "loss": 2.181, + "step": 4546 + }, + { + "epoch": 1.96139745525124, + "grad_norm": 0.19787052273750305, + "learning_rate": 4.069553268656652e-05, + "loss": 2.1802, + "step": 4547 + }, + { + "epoch": 1.9618287686003881, + "grad_norm": 0.21842846274375916, + "learning_rate": 4.066518744677495e-05, + "loss": 2.1489, + "step": 4548 + }, + { + "epoch": 1.9622600819495364, + "grad_norm": 0.18567824363708496, + "learning_rate": 4.0634849316381754e-05, + "loss": 2.2396, + "step": 4549 + }, + { + "epoch": 1.9626913952986844, + "grad_norm": 0.19945816695690155, + "learning_rate": 4.060451830166885e-05, + "loss": 2.3239, + "step": 4550 + }, + { + "epoch": 1.9626913952986844, + "eval_loss": 2.0894079208374023, + "eval_runtime": 196.9192, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 4550 + }, + { + "epoch": 1.9631227086478327, + "grad_norm": 0.16943685710430145, + "learning_rate": 4.0574194408916565e-05, + "loss": 1.8699, + "step": 4551 + }, + { + "epoch": 1.963554021996981, + "grad_norm": 0.1928304135799408, + "learning_rate": 4.054387764440381e-05, + "loss": 2.3189, + "step": 4552 + }, + { + "epoch": 1.963985335346129, + "grad_norm": 0.2010587602853775, + "learning_rate": 4.0513568014407993e-05, + "loss": 2.1832, + "step": 4553 + }, + { + "epoch": 1.964416648695277, + "grad_norm": 0.22043828666210175, + "learning_rate": 4.0483265525205046e-05, + "loss": 2.3322, + "step": 4554 + }, + { + "epoch": 1.9648479620444252, + "grad_norm": 0.1929306536912918, + "learning_rate": 4.045297018306945e-05, + "loss": 2.2782, + "step": 4555 + }, + { + "epoch": 1.9652792753935735, + "grad_norm": 0.20045654475688934, + "learning_rate": 4.0422681994274145e-05, + "loss": 2.3496, + "step": 4556 + }, + { + "epoch": 1.9657105887427215, + "grad_norm": 0.18351252377033234, + "learning_rate": 4.039240096509073e-05, + "loss": 2.1992, + "step": 4557 + }, + { + "epoch": 1.9661419020918698, + "grad_norm": 0.18495838344097137, + "learning_rate": 4.036212710178912e-05, + "loss": 2.2457, + "step": 4558 + }, + { + "epoch": 1.9665732154410178, + "grad_norm": 0.1930808424949646, + "learning_rate": 4.033186041063786e-05, + "loss": 2.1731, + "step": 4559 + }, + { + "epoch": 1.967004528790166, + "grad_norm": 0.1892196387052536, + "learning_rate": 4.030160089790405e-05, + "loss": 2.1982, + "step": 4560 + }, + { + "epoch": 1.9674358421393143, + "grad_norm": 0.27983418107032776, + "learning_rate": 4.0271348569853246e-05, + "loss": 2.2619, + "step": 4561 + }, + { + "epoch": 1.9678671554884624, + "grad_norm": 0.20122027397155762, + "learning_rate": 4.024110343274952e-05, + "loss": 2.2464, + "step": 4562 + }, + { + "epoch": 1.9682984688376104, + "grad_norm": 0.19067595899105072, + "learning_rate": 4.021086549285539e-05, + "loss": 2.0845, + "step": 4563 + }, + { + "epoch": 1.9687297821867586, + "grad_norm": 0.18311598896980286, + "learning_rate": 4.018063475643204e-05, + "loss": 2.0351, + "step": 4564 + }, + { + "epoch": 1.969161095535907, + "grad_norm": 0.1905970424413681, + "learning_rate": 4.015041122973904e-05, + "loss": 2.3344, + "step": 4565 + }, + { + "epoch": 1.9695924088850552, + "grad_norm": 0.1892702430486679, + "learning_rate": 4.01201949190345e-05, + "loss": 2.2001, + "step": 4566 + }, + { + "epoch": 1.9700237222342032, + "grad_norm": 0.17685829102993011, + "learning_rate": 4.0089985830575046e-05, + "loss": 2.0874, + "step": 4567 + }, + { + "epoch": 1.9704550355833512, + "grad_norm": 0.18338528275489807, + "learning_rate": 4.00597839706158e-05, + "loss": 2.2016, + "step": 4568 + }, + { + "epoch": 1.9708863489324995, + "grad_norm": 0.19318626821041107, + "learning_rate": 4.002958934541037e-05, + "loss": 2.2684, + "step": 4569 + }, + { + "epoch": 1.9713176622816477, + "grad_norm": 0.22403284907341003, + "learning_rate": 3.9999401961210906e-05, + "loss": 2.145, + "step": 4570 + }, + { + "epoch": 1.9717489756307958, + "grad_norm": 0.20169135928153992, + "learning_rate": 3.996922182426802e-05, + "loss": 2.148, + "step": 4571 + }, + { + "epoch": 1.9721802889799438, + "grad_norm": 0.19115835428237915, + "learning_rate": 3.993904894083085e-05, + "loss": 2.0836, + "step": 4572 + }, + { + "epoch": 1.972611602329092, + "grad_norm": 0.21998874843120575, + "learning_rate": 3.9908883317146984e-05, + "loss": 1.9864, + "step": 4573 + }, + { + "epoch": 1.9730429156782403, + "grad_norm": 0.19121584296226501, + "learning_rate": 3.987872495946261e-05, + "loss": 2.1121, + "step": 4574 + }, + { + "epoch": 1.9734742290273886, + "grad_norm": 0.1847795993089676, + "learning_rate": 3.984857387402234e-05, + "loss": 2.1973, + "step": 4575 + }, + { + "epoch": 1.9734742290273886, + "eval_loss": 2.0891201496124268, + "eval_runtime": 204.498, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 4575 + }, + { + "epoch": 1.9739055423765366, + "grad_norm": 0.19345210492610931, + "learning_rate": 3.9818430067069234e-05, + "loss": 2.0098, + "step": 4576 + }, + { + "epoch": 1.9743368557256846, + "grad_norm": 0.1937677562236786, + "learning_rate": 3.978829354484488e-05, + "loss": 2.0485, + "step": 4577 + }, + { + "epoch": 1.9747681690748329, + "grad_norm": 0.19139151275157928, + "learning_rate": 3.975816431358946e-05, + "loss": 2.3189, + "step": 4578 + }, + { + "epoch": 1.9751994824239811, + "grad_norm": 0.21577173471450806, + "learning_rate": 3.9728042379541516e-05, + "loss": 2.2798, + "step": 4579 + }, + { + "epoch": 1.9756307957731292, + "grad_norm": 0.17227891087532043, + "learning_rate": 3.969792774893811e-05, + "loss": 2.0565, + "step": 4580 + }, + { + "epoch": 1.9760621091222772, + "grad_norm": 0.19595201313495636, + "learning_rate": 3.966782042801483e-05, + "loss": 2.1484, + "step": 4581 + }, + { + "epoch": 1.9764934224714255, + "grad_norm": 0.1743748039007187, + "learning_rate": 3.963772042300572e-05, + "loss": 2.0228, + "step": 4582 + }, + { + "epoch": 1.9769247358205737, + "grad_norm": 0.20380571484565735, + "learning_rate": 3.960762774014331e-05, + "loss": 2.2646, + "step": 4583 + }, + { + "epoch": 1.977356049169722, + "grad_norm": 0.1720813810825348, + "learning_rate": 3.9577542385658614e-05, + "loss": 2.2076, + "step": 4584 + }, + { + "epoch": 1.97778736251887, + "grad_norm": 0.19041219353675842, + "learning_rate": 3.9547464365781145e-05, + "loss": 2.1829, + "step": 4585 + }, + { + "epoch": 1.978218675868018, + "grad_norm": 0.1973971128463745, + "learning_rate": 3.951739368673889e-05, + "loss": 2.3014, + "step": 4586 + }, + { + "epoch": 1.9786499892171663, + "grad_norm": 0.17989139258861542, + "learning_rate": 3.948733035475829e-05, + "loss": 2.0504, + "step": 4587 + }, + { + "epoch": 1.9790813025663145, + "grad_norm": 0.20466317236423492, + "learning_rate": 3.945727437606428e-05, + "loss": 2.2927, + "step": 4588 + }, + { + "epoch": 1.9795126159154626, + "grad_norm": 0.17659124732017517, + "learning_rate": 3.9427225756880353e-05, + "loss": 2.1761, + "step": 4589 + }, + { + "epoch": 1.9799439292646106, + "grad_norm": 0.1800772100687027, + "learning_rate": 3.939718450342832e-05, + "loss": 2.0339, + "step": 4590 + }, + { + "epoch": 1.9803752426137589, + "grad_norm": 0.20760327577590942, + "learning_rate": 3.9367150621928556e-05, + "loss": 2.0183, + "step": 4591 + }, + { + "epoch": 1.9808065559629071, + "grad_norm": 0.17776834964752197, + "learning_rate": 3.933712411859995e-05, + "loss": 2.1135, + "step": 4592 + }, + { + "epoch": 1.9812378693120554, + "grad_norm": 0.19570772349834442, + "learning_rate": 3.9307104999659813e-05, + "loss": 2.2599, + "step": 4593 + }, + { + "epoch": 1.9816691826612034, + "grad_norm": 119.74149322509766, + "learning_rate": 3.927709327132394e-05, + "loss": 2.166, + "step": 4594 + }, + { + "epoch": 1.9821004960103514, + "grad_norm": 0.22241385281085968, + "learning_rate": 3.92470889398065e-05, + "loss": 1.9044, + "step": 4595 + }, + { + "epoch": 1.9825318093594997, + "grad_norm": 0.19045329093933105, + "learning_rate": 3.9217092011320317e-05, + "loss": 2.0649, + "step": 4596 + }, + { + "epoch": 1.982963122708648, + "grad_norm": 0.19328786432743073, + "learning_rate": 3.918710249207655e-05, + "loss": 2.055, + "step": 4597 + }, + { + "epoch": 1.983394436057796, + "grad_norm": 0.21507321298122406, + "learning_rate": 3.915712038828485e-05, + "loss": 2.068, + "step": 4598 + }, + { + "epoch": 1.983825749406944, + "grad_norm": 0.19425652921199799, + "learning_rate": 3.912714570615335e-05, + "loss": 2.0588, + "step": 4599 + }, + { + "epoch": 1.9842570627560923, + "grad_norm": 0.17666107416152954, + "learning_rate": 3.909717845188863e-05, + "loss": 1.9879, + "step": 4600 + }, + { + "epoch": 1.9842570627560923, + "eval_loss": 2.0891807079315186, + "eval_runtime": 201.3417, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 4600 + }, + { + "epoch": 1.9846883761052405, + "grad_norm": 0.19912296533584595, + "learning_rate": 3.906721863169573e-05, + "loss": 2.2162, + "step": 4601 + }, + { + "epoch": 1.9851196894543888, + "grad_norm": 0.1944214552640915, + "learning_rate": 3.903726625177815e-05, + "loss": 2.1721, + "step": 4602 + }, + { + "epoch": 1.9855510028035368, + "grad_norm": 0.18800340592861176, + "learning_rate": 3.9007321318337925e-05, + "loss": 1.9881, + "step": 4603 + }, + { + "epoch": 1.9859823161526848, + "grad_norm": 0.18974347412586212, + "learning_rate": 3.89773838375754e-05, + "loss": 2.3557, + "step": 4604 + }, + { + "epoch": 1.986413629501833, + "grad_norm": 0.19073079526424408, + "learning_rate": 3.894745381568946e-05, + "loss": 2.3348, + "step": 4605 + }, + { + "epoch": 1.9868449428509813, + "grad_norm": 0.20038451254367828, + "learning_rate": 3.89175312588775e-05, + "loss": 2.1509, + "step": 4606 + }, + { + "epoch": 1.9872762562001294, + "grad_norm": 0.1876484900712967, + "learning_rate": 3.8887616173335285e-05, + "loss": 2.1822, + "step": 4607 + }, + { + "epoch": 1.9877075695492774, + "grad_norm": 0.1894012689590454, + "learning_rate": 3.88577085652571e-05, + "loss": 2.16, + "step": 4608 + }, + { + "epoch": 1.9881388828984257, + "grad_norm": 0.19239047169685364, + "learning_rate": 3.882780844083552e-05, + "loss": 2.1529, + "step": 4609 + }, + { + "epoch": 1.988570196247574, + "grad_norm": 0.20243610441684723, + "learning_rate": 3.879791580626183e-05, + "loss": 2.1936, + "step": 4610 + }, + { + "epoch": 1.9890015095967222, + "grad_norm": 0.20150430500507355, + "learning_rate": 3.876803066772555e-05, + "loss": 2.1183, + "step": 4611 + }, + { + "epoch": 1.9894328229458702, + "grad_norm": 0.17391133308410645, + "learning_rate": 3.873815303141475e-05, + "loss": 2.1931, + "step": 4612 + }, + { + "epoch": 1.9898641362950182, + "grad_norm": 0.18860773742198944, + "learning_rate": 3.870828290351593e-05, + "loss": 2.1888, + "step": 4613 + }, + { + "epoch": 1.9902954496441665, + "grad_norm": 0.18391074240207672, + "learning_rate": 3.8678420290214e-05, + "loss": 1.9288, + "step": 4614 + }, + { + "epoch": 1.9907267629933147, + "grad_norm": 0.19212834537029266, + "learning_rate": 3.864856519769237e-05, + "loss": 2.1522, + "step": 4615 + }, + { + "epoch": 1.9911580763424628, + "grad_norm": 0.18585047125816345, + "learning_rate": 3.861871763213284e-05, + "loss": 2.068, + "step": 4616 + }, + { + "epoch": 1.9915893896916108, + "grad_norm": 0.25356727838516235, + "learning_rate": 3.858887759971568e-05, + "loss": 2.1914, + "step": 4617 + }, + { + "epoch": 1.992020703040759, + "grad_norm": 0.19800281524658203, + "learning_rate": 3.85590451066196e-05, + "loss": 2.313, + "step": 4618 + }, + { + "epoch": 1.9924520163899073, + "grad_norm": 0.18604665994644165, + "learning_rate": 3.852922015902171e-05, + "loss": 1.8551, + "step": 4619 + }, + { + "epoch": 1.9928833297390556, + "grad_norm": 0.18251395225524902, + "learning_rate": 3.849940276309766e-05, + "loss": 1.8771, + "step": 4620 + }, + { + "epoch": 1.9933146430882036, + "grad_norm": 0.18314236402511597, + "learning_rate": 3.8469592925021466e-05, + "loss": 2.107, + "step": 4621 + }, + { + "epoch": 1.9937459564373516, + "grad_norm": 0.18971586227416992, + "learning_rate": 3.8439790650965505e-05, + "loss": 2.347, + "step": 4622 + }, + { + "epoch": 1.9941772697865, + "grad_norm": 0.19586211442947388, + "learning_rate": 3.840999594710068e-05, + "loss": 2.1796, + "step": 4623 + }, + { + "epoch": 1.9946085831356481, + "grad_norm": 0.20417676866054535, + "learning_rate": 3.838020881959636e-05, + "loss": 2.3454, + "step": 4624 + }, + { + "epoch": 1.9950398964847962, + "grad_norm": 0.18715451657772064, + "learning_rate": 3.835042927462029e-05, + "loss": 2.125, + "step": 4625 + }, + { + "epoch": 1.9950398964847962, + "eval_loss": 2.089186429977417, + "eval_runtime": 200.4625, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 4625 + }, + { + "epoch": 1.9954712098339442, + "grad_norm": 0.17249004542827606, + "learning_rate": 3.832065731833861e-05, + "loss": 1.9698, + "step": 4626 + }, + { + "epoch": 1.9959025231830925, + "grad_norm": 0.17829373478889465, + "learning_rate": 3.829089295691597e-05, + "loss": 2.2089, + "step": 4627 + }, + { + "epoch": 1.9963338365322407, + "grad_norm": 0.1930740773677826, + "learning_rate": 3.826113619651537e-05, + "loss": 2.3168, + "step": 4628 + }, + { + "epoch": 1.996765149881389, + "grad_norm": 0.1748417466878891, + "learning_rate": 3.82313870432983e-05, + "loss": 2.3054, + "step": 4629 + }, + { + "epoch": 1.997196463230537, + "grad_norm": 0.19034349918365479, + "learning_rate": 3.820164550342464e-05, + "loss": 2.2194, + "step": 4630 + }, + { + "epoch": 1.997627776579685, + "grad_norm": 0.2013319879770279, + "learning_rate": 3.8171911583052686e-05, + "loss": 2.2437, + "step": 4631 + }, + { + "epoch": 1.9980590899288333, + "grad_norm": 0.1998211145401001, + "learning_rate": 3.814218528833919e-05, + "loss": 2.114, + "step": 4632 + }, + { + "epoch": 1.9984904032779816, + "grad_norm": 0.18663814663887024, + "learning_rate": 3.8112466625439294e-05, + "loss": 2.1535, + "step": 4633 + }, + { + "epoch": 1.9989217166271296, + "grad_norm": 0.18947145342826843, + "learning_rate": 3.808275560050656e-05, + "loss": 2.0403, + "step": 4634 + }, + { + "epoch": 1.9993530299762776, + "grad_norm": 0.20086605846881866, + "learning_rate": 3.805305221969306e-05, + "loss": 2.1506, + "step": 4635 + }, + { + "epoch": 1.9997843433254259, + "grad_norm": 0.1824001967906952, + "learning_rate": 3.802335648914911e-05, + "loss": 2.3072, + "step": 4636 + }, + { + "epoch": 2.0004313133491483, + "grad_norm": 0.3254935145378113, + "learning_rate": 3.7993668415023526e-05, + "loss": 4.2159, + "step": 4637 + }, + { + "epoch": 2.0008626266982965, + "grad_norm": 0.18431530892848969, + "learning_rate": 3.796398800346364e-05, + "loss": 2.1423, + "step": 4638 + }, + { + "epoch": 2.0012939400474443, + "grad_norm": 0.1791626363992691, + "learning_rate": 3.793431526061504e-05, + "loss": 2.0886, + "step": 4639 + }, + { + "epoch": 2.0017252533965926, + "grad_norm": 0.1967422068119049, + "learning_rate": 3.790465019262186e-05, + "loss": 2.0144, + "step": 4640 + }, + { + "epoch": 2.002156566745741, + "grad_norm": 0.19516336917877197, + "learning_rate": 3.7874992805626443e-05, + "loss": 2.1952, + "step": 4641 + }, + { + "epoch": 2.002587880094889, + "grad_norm": 0.1892661154270172, + "learning_rate": 3.7845343105769796e-05, + "loss": 2.0195, + "step": 4642 + }, + { + "epoch": 2.003019193444037, + "grad_norm": 0.1858513206243515, + "learning_rate": 3.7815701099191184e-05, + "loss": 2.1626, + "step": 4643 + }, + { + "epoch": 2.003450506793185, + "grad_norm": 0.18395952880382538, + "learning_rate": 3.778606679202829e-05, + "loss": 2.1628, + "step": 4644 + }, + { + "epoch": 2.0038818201423334, + "grad_norm": 0.18125440180301666, + "learning_rate": 3.775644019041723e-05, + "loss": 2.125, + "step": 4645 + }, + { + "epoch": 2.0043131334914817, + "grad_norm": 0.19737988710403442, + "learning_rate": 3.7726821300492524e-05, + "loss": 2.1642, + "step": 4646 + }, + { + "epoch": 2.00474444684063, + "grad_norm": 0.19009336829185486, + "learning_rate": 3.769721012838708e-05, + "loss": 1.9746, + "step": 4647 + }, + { + "epoch": 2.0051757601897777, + "grad_norm": 0.21443523466587067, + "learning_rate": 3.766760668023218e-05, + "loss": 2.2279, + "step": 4648 + }, + { + "epoch": 2.005607073538926, + "grad_norm": 0.21913030743598938, + "learning_rate": 3.7638010962157635e-05, + "loss": 2.1498, + "step": 4649 + }, + { + "epoch": 2.0060383868880742, + "grad_norm": 0.19212424755096436, + "learning_rate": 3.760842298029148e-05, + "loss": 2.1715, + "step": 4650 + }, + { + "epoch": 2.0060383868880742, + "eval_loss": 2.0894405841827393, + "eval_runtime": 200.6853, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 4650 + }, + { + "epoch": 2.0064697002372225, + "grad_norm": 0.1889164000749588, + "learning_rate": 3.7578842740760224e-05, + "loss": 2.1657, + "step": 4651 + }, + { + "epoch": 2.0069010135863703, + "grad_norm": 0.1890760362148285, + "learning_rate": 3.754927024968885e-05, + "loss": 2.1499, + "step": 4652 + }, + { + "epoch": 2.0073323269355186, + "grad_norm": 0.19354365766048431, + "learning_rate": 3.7519705513200606e-05, + "loss": 2.1976, + "step": 4653 + }, + { + "epoch": 2.007763640284667, + "grad_norm": 0.19646655023097992, + "learning_rate": 3.7490148537417245e-05, + "loss": 1.9383, + "step": 4654 + }, + { + "epoch": 2.008194953633815, + "grad_norm": 0.1919821798801422, + "learning_rate": 3.746059932845878e-05, + "loss": 2.189, + "step": 4655 + }, + { + "epoch": 2.0086262669829633, + "grad_norm": 0.18532203137874603, + "learning_rate": 3.7431057892443777e-05, + "loss": 2.0868, + "step": 4656 + }, + { + "epoch": 2.009057580332111, + "grad_norm": 0.18428903818130493, + "learning_rate": 3.7401524235489074e-05, + "loss": 2.1462, + "step": 4657 + }, + { + "epoch": 2.0094888936812594, + "grad_norm": 0.20539259910583496, + "learning_rate": 3.737199836370996e-05, + "loss": 2.2161, + "step": 4658 + }, + { + "epoch": 2.0099202070304076, + "grad_norm": 0.19485358893871307, + "learning_rate": 3.734248028322007e-05, + "loss": 2.1433, + "step": 4659 + }, + { + "epoch": 2.010351520379556, + "grad_norm": 0.18940050899982452, + "learning_rate": 3.731297000013146e-05, + "loss": 1.9886, + "step": 4660 + }, + { + "epoch": 2.0107828337287037, + "grad_norm": 0.20166797935962677, + "learning_rate": 3.7283467520554546e-05, + "loss": 2.0954, + "step": 4661 + }, + { + "epoch": 2.011214147077852, + "grad_norm": 0.19941836595535278, + "learning_rate": 3.725397285059814e-05, + "loss": 2.1256, + "step": 4662 + }, + { + "epoch": 2.011645460427, + "grad_norm": 0.18807263672351837, + "learning_rate": 3.7224485996369445e-05, + "loss": 2.1517, + "step": 4663 + }, + { + "epoch": 2.0120767737761485, + "grad_norm": 0.22854174673557281, + "learning_rate": 3.7195006963974026e-05, + "loss": 2.2179, + "step": 4664 + }, + { + "epoch": 2.0125080871252967, + "grad_norm": 0.19089582562446594, + "learning_rate": 3.716553575951582e-05, + "loss": 2.1794, + "step": 4665 + }, + { + "epoch": 2.0129394004744445, + "grad_norm": 0.17721687257289886, + "learning_rate": 3.7136072389097204e-05, + "loss": 2.0641, + "step": 4666 + }, + { + "epoch": 2.013370713823593, + "grad_norm": 0.18833722174167633, + "learning_rate": 3.710661685881891e-05, + "loss": 2.0732, + "step": 4667 + }, + { + "epoch": 2.013802027172741, + "grad_norm": 1.9127404689788818, + "learning_rate": 3.7077169174779956e-05, + "loss": 2.1706, + "step": 4668 + }, + { + "epoch": 2.0142333405218893, + "grad_norm": 0.19390110671520233, + "learning_rate": 3.7047729343077814e-05, + "loss": 2.1484, + "step": 4669 + }, + { + "epoch": 2.014664653871037, + "grad_norm": 0.19157013297080994, + "learning_rate": 3.701829736980839e-05, + "loss": 1.9822, + "step": 4670 + }, + { + "epoch": 2.0150959672201854, + "grad_norm": 0.18170066177845, + "learning_rate": 3.698887326106584e-05, + "loss": 1.9444, + "step": 4671 + }, + { + "epoch": 2.0155272805693336, + "grad_norm": 0.18794336915016174, + "learning_rate": 3.6959457022942775e-05, + "loss": 2.2588, + "step": 4672 + }, + { + "epoch": 2.015958593918482, + "grad_norm": 0.21366532146930695, + "learning_rate": 3.6930048661530135e-05, + "loss": 2.2123, + "step": 4673 + }, + { + "epoch": 2.01638990726763, + "grad_norm": 0.19540277123451233, + "learning_rate": 3.690064818291725e-05, + "loss": 2.0826, + "step": 4674 + }, + { + "epoch": 2.016821220616778, + "grad_norm": 0.18262942135334015, + "learning_rate": 3.6871255593191794e-05, + "loss": 2.004, + "step": 4675 + }, + { + "epoch": 2.016821220616778, + "eval_loss": 2.089542865753174, + "eval_runtime": 200.436, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 4675 + }, + { + "epoch": 2.017252533965926, + "grad_norm": 0.1919758915901184, + "learning_rate": 3.6841870898439846e-05, + "loss": 2.3566, + "step": 4676 + }, + { + "epoch": 2.0176838473150744, + "grad_norm": 0.18473289906978607, + "learning_rate": 3.681249410474581e-05, + "loss": 1.8332, + "step": 4677 + }, + { + "epoch": 2.0181151606642227, + "grad_norm": 0.1841718554496765, + "learning_rate": 3.6783125218192494e-05, + "loss": 1.9875, + "step": 4678 + }, + { + "epoch": 2.0185464740133705, + "grad_norm": 0.1876533031463623, + "learning_rate": 3.6753764244861025e-05, + "loss": 2.0937, + "step": 4679 + }, + { + "epoch": 2.0189777873625188, + "grad_norm": 0.19570693373680115, + "learning_rate": 3.67244111908309e-05, + "loss": 2.2245, + "step": 4680 + }, + { + "epoch": 2.019409100711667, + "grad_norm": 0.1707245111465454, + "learning_rate": 3.669506606218007e-05, + "loss": 2.1095, + "step": 4681 + }, + { + "epoch": 2.0198404140608153, + "grad_norm": 0.19620443880558014, + "learning_rate": 3.666572886498467e-05, + "loss": 2.158, + "step": 4682 + }, + { + "epoch": 2.0202717274099635, + "grad_norm": 0.17991629242897034, + "learning_rate": 3.6636399605319307e-05, + "loss": 1.9504, + "step": 4683 + }, + { + "epoch": 2.0207030407591113, + "grad_norm": 0.17959769070148468, + "learning_rate": 3.660707828925696e-05, + "loss": 2.097, + "step": 4684 + }, + { + "epoch": 2.0211343541082596, + "grad_norm": 0.18759562075138092, + "learning_rate": 3.657776492286893e-05, + "loss": 2.3082, + "step": 4685 + }, + { + "epoch": 2.021565667457408, + "grad_norm": 0.18067705631256104, + "learning_rate": 3.654845951222486e-05, + "loss": 2.1922, + "step": 4686 + }, + { + "epoch": 2.021996980806556, + "grad_norm": 0.19888699054718018, + "learning_rate": 3.6519162063392695e-05, + "loss": 2.2701, + "step": 4687 + }, + { + "epoch": 2.022428294155704, + "grad_norm": 0.18732723593711853, + "learning_rate": 3.6489872582438883e-05, + "loss": 2.1285, + "step": 4688 + }, + { + "epoch": 2.022859607504852, + "grad_norm": 0.19551338255405426, + "learning_rate": 3.646059107542808e-05, + "loss": 2.1456, + "step": 4689 + }, + { + "epoch": 2.0232909208540004, + "grad_norm": 0.20126861333847046, + "learning_rate": 3.643131754842336e-05, + "loss": 2.1727, + "step": 4690 + }, + { + "epoch": 2.0237222342031487, + "grad_norm": 0.1837441325187683, + "learning_rate": 3.6402052007486115e-05, + "loss": 2.1804, + "step": 4691 + }, + { + "epoch": 2.024153547552297, + "grad_norm": 0.202824667096138, + "learning_rate": 3.6372794458676115e-05, + "loss": 2.2157, + "step": 4692 + }, + { + "epoch": 2.0245848609014447, + "grad_norm": 0.201461061835289, + "learning_rate": 3.6343544908051434e-05, + "loss": 2.1783, + "step": 4693 + }, + { + "epoch": 2.025016174250593, + "grad_norm": 0.20021897554397583, + "learning_rate": 3.6314303361668514e-05, + "loss": 2.101, + "step": 4694 + }, + { + "epoch": 2.0254474875997412, + "grad_norm": 0.18160514533519745, + "learning_rate": 3.6285069825582154e-05, + "loss": 2.0716, + "step": 4695 + }, + { + "epoch": 2.0258788009488895, + "grad_norm": 0.19927164912223816, + "learning_rate": 3.6255844305845464e-05, + "loss": 2.1218, + "step": 4696 + }, + { + "epoch": 2.0263101142980373, + "grad_norm": 0.21796950697898865, + "learning_rate": 3.6226626808509876e-05, + "loss": 2.1368, + "step": 4697 + }, + { + "epoch": 2.0267414276471856, + "grad_norm": 0.20131273567676544, + "learning_rate": 3.619741733962525e-05, + "loss": 2.2224, + "step": 4698 + }, + { + "epoch": 2.027172740996334, + "grad_norm": 0.2108275443315506, + "learning_rate": 3.616821590523974e-05, + "loss": 2.2149, + "step": 4699 + }, + { + "epoch": 2.027604054345482, + "grad_norm": 0.182052344083786, + "learning_rate": 3.613902251139975e-05, + "loss": 2.1594, + "step": 4700 + }, + { + "epoch": 2.027604054345482, + "eval_loss": 2.089719295501709, + "eval_runtime": 200.7819, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 4700 + }, + { + "epoch": 2.0280353676946303, + "grad_norm": 0.20290504395961761, + "learning_rate": 3.6109837164150105e-05, + "loss": 2.1564, + "step": 4701 + }, + { + "epoch": 2.028466681043778, + "grad_norm": 0.18281979858875275, + "learning_rate": 3.6080659869533994e-05, + "loss": 1.9623, + "step": 4702 + }, + { + "epoch": 2.0288979943929264, + "grad_norm": 0.18263626098632812, + "learning_rate": 3.605149063359287e-05, + "loss": 1.9182, + "step": 4703 + }, + { + "epoch": 2.0293293077420747, + "grad_norm": 0.20167215168476105, + "learning_rate": 3.602232946236654e-05, + "loss": 2.1137, + "step": 4704 + }, + { + "epoch": 2.029760621091223, + "grad_norm": 0.19266636669635773, + "learning_rate": 3.5993176361893154e-05, + "loss": 2.1873, + "step": 4705 + }, + { + "epoch": 2.0301919344403707, + "grad_norm": 0.18675893545150757, + "learning_rate": 3.596403133820917e-05, + "loss": 2.1701, + "step": 4706 + }, + { + "epoch": 2.030623247789519, + "grad_norm": 0.20420628786087036, + "learning_rate": 3.5934894397349394e-05, + "loss": 2.0722, + "step": 4707 + }, + { + "epoch": 2.0310545611386672, + "grad_norm": 0.19602304697036743, + "learning_rate": 3.590576554534694e-05, + "loss": 2.0701, + "step": 4708 + }, + { + "epoch": 2.0314858744878155, + "grad_norm": 0.21883141994476318, + "learning_rate": 3.587664478823325e-05, + "loss": 2.2202, + "step": 4709 + }, + { + "epoch": 2.0319171878369637, + "grad_norm": 0.181867316365242, + "learning_rate": 3.584753213203811e-05, + "loss": 2.0648, + "step": 4710 + }, + { + "epoch": 2.0323485011861115, + "grad_norm": 0.2570772171020508, + "learning_rate": 3.581842758278957e-05, + "loss": 2.0124, + "step": 4711 + }, + { + "epoch": 2.03277981453526, + "grad_norm": 0.1787121295928955, + "learning_rate": 3.578933114651411e-05, + "loss": 2.1435, + "step": 4712 + }, + { + "epoch": 2.033211127884408, + "grad_norm": 0.205344557762146, + "learning_rate": 3.576024282923647e-05, + "loss": 2.4994, + "step": 4713 + }, + { + "epoch": 2.0336424412335563, + "grad_norm": 0.19113320112228394, + "learning_rate": 3.573116263697965e-05, + "loss": 2.1884, + "step": 4714 + }, + { + "epoch": 2.034073754582704, + "grad_norm": 0.19865363836288452, + "learning_rate": 3.5702090575765006e-05, + "loss": 1.9913, + "step": 4715 + }, + { + "epoch": 2.0345050679318524, + "grad_norm": 0.22159428894519806, + "learning_rate": 3.5673026651612295e-05, + "loss": 2.2952, + "step": 4716 + }, + { + "epoch": 2.0349363812810006, + "grad_norm": 0.20246855914592743, + "learning_rate": 3.5643970870539485e-05, + "loss": 2.3079, + "step": 4717 + }, + { + "epoch": 2.035367694630149, + "grad_norm": 0.18345387279987335, + "learning_rate": 3.561492323856289e-05, + "loss": 2.1188, + "step": 4718 + }, + { + "epoch": 2.035799007979297, + "grad_norm": 0.17065024375915527, + "learning_rate": 3.558588376169716e-05, + "loss": 2.0935, + "step": 4719 + }, + { + "epoch": 2.036230321328445, + "grad_norm": 0.2016269415616989, + "learning_rate": 3.555685244595521e-05, + "loss": 2.2215, + "step": 4720 + }, + { + "epoch": 2.036661634677593, + "grad_norm": 0.19801582396030426, + "learning_rate": 3.552782929734831e-05, + "loss": 2.0749, + "step": 4721 + }, + { + "epoch": 2.0370929480267415, + "grad_norm": 0.19953446090221405, + "learning_rate": 3.5498814321886016e-05, + "loss": 2.1919, + "step": 4722 + }, + { + "epoch": 2.0375242613758897, + "grad_norm": 0.18833646178245544, + "learning_rate": 3.546980752557619e-05, + "loss": 2.1415, + "step": 4723 + }, + { + "epoch": 2.0379555747250375, + "grad_norm": 0.18866832554340363, + "learning_rate": 3.544080891442502e-05, + "loss": 1.9907, + "step": 4724 + }, + { + "epoch": 2.0383868880741858, + "grad_norm": 0.1904328614473343, + "learning_rate": 3.541181849443694e-05, + "loss": 2.0671, + "step": 4725 + }, + { + "epoch": 2.0383868880741858, + "eval_loss": 2.089667320251465, + "eval_runtime": 200.4158, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 4725 + }, + { + "epoch": 2.038818201423334, + "grad_norm": 0.19026018679141998, + "learning_rate": 3.538283627161479e-05, + "loss": 2.0147, + "step": 4726 + }, + { + "epoch": 2.0392495147724823, + "grad_norm": 0.19227205216884613, + "learning_rate": 3.535386225195968e-05, + "loss": 2.2522, + "step": 4727 + }, + { + "epoch": 2.0396808281216305, + "grad_norm": 0.24319662153720856, + "learning_rate": 3.5324896441470916e-05, + "loss": 2.0963, + "step": 4728 + }, + { + "epoch": 2.0401121414707784, + "grad_norm": 0.17921534180641174, + "learning_rate": 3.52959388461462e-05, + "loss": 2.0643, + "step": 4729 + }, + { + "epoch": 2.0405434548199266, + "grad_norm": 0.21287348866462708, + "learning_rate": 3.526698947198157e-05, + "loss": 2.1059, + "step": 4730 + }, + { + "epoch": 2.040974768169075, + "grad_norm": 0.2085811197757721, + "learning_rate": 3.523804832497127e-05, + "loss": 1.9961, + "step": 4731 + }, + { + "epoch": 2.041406081518223, + "grad_norm": 0.1964728683233261, + "learning_rate": 3.520911541110794e-05, + "loss": 2.1386, + "step": 4732 + }, + { + "epoch": 2.041837394867371, + "grad_norm": 0.1720174103975296, + "learning_rate": 3.518019073638235e-05, + "loss": 2.0119, + "step": 4733 + }, + { + "epoch": 2.042268708216519, + "grad_norm": 0.17993386089801788, + "learning_rate": 3.515127430678375e-05, + "loss": 2.034, + "step": 4734 + }, + { + "epoch": 2.0427000215656674, + "grad_norm": 0.20251108705997467, + "learning_rate": 3.5122366128299594e-05, + "loss": 2.0783, + "step": 4735 + }, + { + "epoch": 2.0431313349148157, + "grad_norm": 0.1712861955165863, + "learning_rate": 3.509346620691561e-05, + "loss": 2.0154, + "step": 4736 + }, + { + "epoch": 2.043562648263964, + "grad_norm": 0.19915875792503357, + "learning_rate": 3.506457454861587e-05, + "loss": 2.2971, + "step": 4737 + }, + { + "epoch": 2.0439939616131118, + "grad_norm": 0.19039981067180634, + "learning_rate": 3.5035691159382696e-05, + "loss": 2.051, + "step": 4738 + }, + { + "epoch": 2.04442527496226, + "grad_norm": 0.19815868139266968, + "learning_rate": 3.5006816045196704e-05, + "loss": 2.2007, + "step": 4739 + }, + { + "epoch": 2.0448565883114083, + "grad_norm": 0.2041560560464859, + "learning_rate": 3.497794921203682e-05, + "loss": 2.0514, + "step": 4740 + }, + { + "epoch": 2.0452879016605565, + "grad_norm": 0.20165760815143585, + "learning_rate": 3.4949090665880206e-05, + "loss": 2.1084, + "step": 4741 + }, + { + "epoch": 2.0457192150097043, + "grad_norm": 0.19834686815738678, + "learning_rate": 3.492024041270238e-05, + "loss": 2.1179, + "step": 4742 + }, + { + "epoch": 2.0461505283588526, + "grad_norm": 0.1835087090730667, + "learning_rate": 3.489139845847704e-05, + "loss": 2.0642, + "step": 4743 + }, + { + "epoch": 2.046581841708001, + "grad_norm": 0.20027287304401398, + "learning_rate": 3.4862564809176296e-05, + "loss": 2.1164, + "step": 4744 + }, + { + "epoch": 2.047013155057149, + "grad_norm": 0.1918504238128662, + "learning_rate": 3.4833739470770475e-05, + "loss": 2.0904, + "step": 4745 + }, + { + "epoch": 2.0474444684062973, + "grad_norm": 0.173620343208313, + "learning_rate": 3.480492244922812e-05, + "loss": 2.0412, + "step": 4746 + }, + { + "epoch": 2.047875781755445, + "grad_norm": 0.1913483589887619, + "learning_rate": 3.477611375051612e-05, + "loss": 2.0591, + "step": 4747 + }, + { + "epoch": 2.0483070951045934, + "grad_norm": 0.1801992505788803, + "learning_rate": 3.474731338059966e-05, + "loss": 2.0881, + "step": 4748 + }, + { + "epoch": 2.0487384084537417, + "grad_norm": 0.19101691246032715, + "learning_rate": 3.4718521345442174e-05, + "loss": 2.0747, + "step": 4749 + }, + { + "epoch": 2.04916972180289, + "grad_norm": 0.1829538494348526, + "learning_rate": 3.468973765100535e-05, + "loss": 2.0801, + "step": 4750 + }, + { + "epoch": 2.04916972180289, + "eval_loss": 2.0897274017333984, + "eval_runtime": 201.0233, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 4750 + }, + { + "epoch": 2.0496010351520377, + "grad_norm": 0.20499111711978912, + "learning_rate": 3.466096230324916e-05, + "loss": 2.1689, + "step": 4751 + }, + { + "epoch": 2.050032348501186, + "grad_norm": 0.18667557835578918, + "learning_rate": 3.463219530813188e-05, + "loss": 1.9828, + "step": 4752 + }, + { + "epoch": 2.0504636618503342, + "grad_norm": 0.20864151418209076, + "learning_rate": 3.4603436671610005e-05, + "loss": 2.1977, + "step": 4753 + }, + { + "epoch": 2.0508949751994825, + "grad_norm": 0.1942366361618042, + "learning_rate": 3.457468639963835e-05, + "loss": 2.0802, + "step": 4754 + }, + { + "epoch": 2.0513262885486307, + "grad_norm": 0.1807696372270584, + "learning_rate": 3.454594449816995e-05, + "loss": 1.9287, + "step": 4755 + }, + { + "epoch": 2.0517576018977786, + "grad_norm": 0.19067130982875824, + "learning_rate": 3.451721097315614e-05, + "loss": 2.0488, + "step": 4756 + }, + { + "epoch": 2.052188915246927, + "grad_norm": 0.21157202124595642, + "learning_rate": 3.448848583054648e-05, + "loss": 2.1799, + "step": 4757 + }, + { + "epoch": 2.052620228596075, + "grad_norm": 0.1818079799413681, + "learning_rate": 3.445976907628889e-05, + "loss": 2.0869, + "step": 4758 + }, + { + "epoch": 2.0530515419452233, + "grad_norm": 0.198555126786232, + "learning_rate": 3.443106071632948e-05, + "loss": 2.1775, + "step": 4759 + }, + { + "epoch": 2.0534828552943716, + "grad_norm": 0.19541983306407928, + "learning_rate": 3.440236075661258e-05, + "loss": 2.0928, + "step": 4760 + }, + { + "epoch": 2.0539141686435194, + "grad_norm": 0.2012958824634552, + "learning_rate": 3.437366920308081e-05, + "loss": 2.1032, + "step": 4761 + }, + { + "epoch": 2.0543454819926676, + "grad_norm": 0.2103503793478012, + "learning_rate": 3.434498606167515e-05, + "loss": 2.1628, + "step": 4762 + }, + { + "epoch": 2.054776795341816, + "grad_norm": 0.17942892014980316, + "learning_rate": 3.4316311338334726e-05, + "loss": 2.0742, + "step": 4763 + }, + { + "epoch": 2.055208108690964, + "grad_norm": 0.19537143409252167, + "learning_rate": 3.4287645038996946e-05, + "loss": 2.115, + "step": 4764 + }, + { + "epoch": 2.055639422040112, + "grad_norm": 0.18903794884681702, + "learning_rate": 3.425898716959748e-05, + "loss": 2.0925, + "step": 4765 + }, + { + "epoch": 2.05607073538926, + "grad_norm": 0.19615252315998077, + "learning_rate": 3.423033773607026e-05, + "loss": 2.2102, + "step": 4766 + }, + { + "epoch": 2.0565020487384085, + "grad_norm": 0.19369672238826752, + "learning_rate": 3.420169674434746e-05, + "loss": 2.0199, + "step": 4767 + }, + { + "epoch": 2.0569333620875567, + "grad_norm": 0.22473537921905518, + "learning_rate": 3.417306420035951e-05, + "loss": 2.0792, + "step": 4768 + }, + { + "epoch": 2.057364675436705, + "grad_norm": 0.8319330811500549, + "learning_rate": 3.41444401100351e-05, + "loss": 2.124, + "step": 4769 + }, + { + "epoch": 2.057795988785853, + "grad_norm": 0.21528014540672302, + "learning_rate": 3.411582447930115e-05, + "loss": 2.2047, + "step": 4770 + }, + { + "epoch": 2.058227302135001, + "grad_norm": 0.206075057387352, + "learning_rate": 3.408721731408281e-05, + "loss": 1.9907, + "step": 4771 + }, + { + "epoch": 2.0586586154841493, + "grad_norm": 0.20266813039779663, + "learning_rate": 3.405861862030357e-05, + "loss": 2.0341, + "step": 4772 + }, + { + "epoch": 2.0590899288332976, + "grad_norm": 0.19045396149158478, + "learning_rate": 3.4030028403885104e-05, + "loss": 2.172, + "step": 4773 + }, + { + "epoch": 2.0595212421824454, + "grad_norm": 0.18947143852710724, + "learning_rate": 3.400144667074727e-05, + "loss": 2.2121, + "step": 4774 + }, + { + "epoch": 2.0599525555315936, + "grad_norm": 0.18560044467449188, + "learning_rate": 3.397287342680823e-05, + "loss": 2.0714, + "step": 4775 + }, + { + "epoch": 2.0599525555315936, + "eval_loss": 2.089815616607666, + "eval_runtime": 200.7389, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 4775 + }, + { + "epoch": 2.060383868880742, + "grad_norm": 0.18350154161453247, + "learning_rate": 3.394430867798443e-05, + "loss": 2.0451, + "step": 4776 + }, + { + "epoch": 2.06081518222989, + "grad_norm": 0.19061952829360962, + "learning_rate": 3.3915752430190504e-05, + "loss": 2.2662, + "step": 4777 + }, + { + "epoch": 2.0612464955790384, + "grad_norm": 0.19650185108184814, + "learning_rate": 3.3887204689339354e-05, + "loss": 2.059, + "step": 4778 + }, + { + "epoch": 2.061677808928186, + "grad_norm": 0.17481555044651031, + "learning_rate": 3.3858665461342017e-05, + "loss": 2.0273, + "step": 4779 + }, + { + "epoch": 2.0621091222773344, + "grad_norm": 0.2005576342344284, + "learning_rate": 3.383013475210793e-05, + "loss": 1.9869, + "step": 4780 + }, + { + "epoch": 2.0625404356264827, + "grad_norm": 0.18315300345420837, + "learning_rate": 3.380161256754469e-05, + "loss": 2.2135, + "step": 4781 + }, + { + "epoch": 2.062971748975631, + "grad_norm": 0.2013261318206787, + "learning_rate": 3.377309891355808e-05, + "loss": 2.2487, + "step": 4782 + }, + { + "epoch": 2.0634030623247788, + "grad_norm": 0.2194448709487915, + "learning_rate": 3.3744593796052206e-05, + "loss": 1.9857, + "step": 4783 + }, + { + "epoch": 2.063834375673927, + "grad_norm": 0.20069117844104767, + "learning_rate": 3.3716097220929346e-05, + "loss": 2.0198, + "step": 4784 + }, + { + "epoch": 2.0642656890230753, + "grad_norm": 0.1908555030822754, + "learning_rate": 3.368760919409003e-05, + "loss": 2.2062, + "step": 4785 + }, + { + "epoch": 2.0646970023722235, + "grad_norm": 0.18771453201770782, + "learning_rate": 3.365912972143301e-05, + "loss": 2.0383, + "step": 4786 + }, + { + "epoch": 2.065128315721372, + "grad_norm": 0.19914081692695618, + "learning_rate": 3.3630658808855277e-05, + "loss": 2.134, + "step": 4787 + }, + { + "epoch": 2.0655596290705196, + "grad_norm": 0.18912968039512634, + "learning_rate": 3.3602196462252047e-05, + "loss": 2.1419, + "step": 4788 + }, + { + "epoch": 2.065990942419668, + "grad_norm": 0.1924545168876648, + "learning_rate": 3.3573742687516736e-05, + "loss": 2.1747, + "step": 4789 + }, + { + "epoch": 2.066422255768816, + "grad_norm": 0.18185552954673767, + "learning_rate": 3.354529749054105e-05, + "loss": 1.9784, + "step": 4790 + }, + { + "epoch": 2.0668535691179644, + "grad_norm": 0.2076537311077118, + "learning_rate": 3.351686087721489e-05, + "loss": 2.2722, + "step": 4791 + }, + { + "epoch": 2.067284882467112, + "grad_norm": 0.1932411938905716, + "learning_rate": 3.348843285342632e-05, + "loss": 2.1012, + "step": 4792 + }, + { + "epoch": 2.0677161958162604, + "grad_norm": 0.1866043508052826, + "learning_rate": 3.3460013425061655e-05, + "loss": 2.0856, + "step": 4793 + }, + { + "epoch": 2.0681475091654087, + "grad_norm": 0.1825817972421646, + "learning_rate": 3.3431602598005504e-05, + "loss": 2.1824, + "step": 4794 + }, + { + "epoch": 2.068578822514557, + "grad_norm": 0.19377149641513824, + "learning_rate": 3.340320037814063e-05, + "loss": 2.2126, + "step": 4795 + }, + { + "epoch": 2.069010135863705, + "grad_norm": 0.19704613089561462, + "learning_rate": 3.3374806771348006e-05, + "loss": 2.2377, + "step": 4796 + }, + { + "epoch": 2.069441449212853, + "grad_norm": 0.19628547132015228, + "learning_rate": 3.334642178350685e-05, + "loss": 2.0471, + "step": 4797 + }, + { + "epoch": 2.0698727625620013, + "grad_norm": 0.19294996559619904, + "learning_rate": 3.331804542049458e-05, + "loss": 2.223, + "step": 4798 + }, + { + "epoch": 2.0703040759111495, + "grad_norm": 0.20850534737110138, + "learning_rate": 3.328967768818683e-05, + "loss": 2.1281, + "step": 4799 + }, + { + "epoch": 2.0707353892602978, + "grad_norm": 0.18277223408222198, + "learning_rate": 3.3261318592457455e-05, + "loss": 2.0091, + "step": 4800 + }, + { + "epoch": 2.0707353892602978, + "eval_loss": 2.0896167755126953, + "eval_runtime": 197.9135, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 4800 + }, + { + "epoch": 2.0711667026094456, + "grad_norm": 0.1938290148973465, + "learning_rate": 3.323296813917851e-05, + "loss": 2.2023, + "step": 4801 + }, + { + "epoch": 2.071598015958594, + "grad_norm": 0.18563704192638397, + "learning_rate": 3.320462633422026e-05, + "loss": 2.1932, + "step": 4802 + }, + { + "epoch": 2.072029329307742, + "grad_norm": 0.18315260112285614, + "learning_rate": 3.317629318345117e-05, + "loss": 2.0308, + "step": 4803 + }, + { + "epoch": 2.0724606426568903, + "grad_norm": 0.18531833589076996, + "learning_rate": 3.314796869273799e-05, + "loss": 2.0022, + "step": 4804 + }, + { + "epoch": 2.0728919560060386, + "grad_norm": 0.20288553833961487, + "learning_rate": 3.31196528679456e-05, + "loss": 2.2519, + "step": 4805 + }, + { + "epoch": 2.0733232693551864, + "grad_norm": 0.19610491394996643, + "learning_rate": 3.3091345714937044e-05, + "loss": 2.1821, + "step": 4806 + }, + { + "epoch": 2.0737545827043347, + "grad_norm": 0.21335402131080627, + "learning_rate": 3.306304723957364e-05, + "loss": 2.0207, + "step": 4807 + }, + { + "epoch": 2.074185896053483, + "grad_norm": 0.20559577643871307, + "learning_rate": 3.303475744771495e-05, + "loss": 2.0895, + "step": 4808 + }, + { + "epoch": 2.074617209402631, + "grad_norm": 0.2079014629125595, + "learning_rate": 3.300647634521865e-05, + "loss": 2.2938, + "step": 4809 + }, + { + "epoch": 2.075048522751779, + "grad_norm": 0.1839982271194458, + "learning_rate": 3.297820393794067e-05, + "loss": 1.927, + "step": 4810 + }, + { + "epoch": 2.0754798361009272, + "grad_norm": 0.17540310323238373, + "learning_rate": 3.294994023173509e-05, + "loss": 1.9777, + "step": 4811 + }, + { + "epoch": 2.0759111494500755, + "grad_norm": 0.2142418920993805, + "learning_rate": 3.2921685232454246e-05, + "loss": 2.2027, + "step": 4812 + }, + { + "epoch": 2.0763424627992237, + "grad_norm": 0.18835563957691193, + "learning_rate": 3.289343894594864e-05, + "loss": 2.306, + "step": 4813 + }, + { + "epoch": 2.076773776148372, + "grad_norm": 0.2013714760541916, + "learning_rate": 3.286520137806696e-05, + "loss": 2.1524, + "step": 4814 + }, + { + "epoch": 2.07720508949752, + "grad_norm": 0.18839602172374725, + "learning_rate": 3.2836972534656115e-05, + "loss": 2.096, + "step": 4815 + }, + { + "epoch": 2.077636402846668, + "grad_norm": 0.19380220770835876, + "learning_rate": 3.280875242156119e-05, + "loss": 2.0306, + "step": 4816 + }, + { + "epoch": 2.0780677161958163, + "grad_norm": 0.19123317301273346, + "learning_rate": 3.278054104462545e-05, + "loss": 2.1462, + "step": 4817 + }, + { + "epoch": 2.0784990295449646, + "grad_norm": 0.17127066850662231, + "learning_rate": 3.275233840969045e-05, + "loss": 1.9398, + "step": 4818 + }, + { + "epoch": 2.0789303428941124, + "grad_norm": 9.493552207946777, + "learning_rate": 3.2724144522595755e-05, + "loss": 2.2694, + "step": 4819 + }, + { + "epoch": 2.0793616562432606, + "grad_norm": 0.20617228746414185, + "learning_rate": 3.269595938917927e-05, + "loss": 2.2542, + "step": 4820 + }, + { + "epoch": 2.079792969592409, + "grad_norm": 0.1898934245109558, + "learning_rate": 3.266778301527699e-05, + "loss": 2.2002, + "step": 4821 + }, + { + "epoch": 2.080224282941557, + "grad_norm": 5.428417682647705, + "learning_rate": 3.26396154067232e-05, + "loss": 1.76, + "step": 4822 + }, + { + "epoch": 2.0806555962907054, + "grad_norm": 0.19115063548088074, + "learning_rate": 3.2611456569350316e-05, + "loss": 2.2404, + "step": 4823 + }, + { + "epoch": 2.081086909639853, + "grad_norm": 0.20664536952972412, + "learning_rate": 3.2583306508988874e-05, + "loss": 2.3256, + "step": 4824 + }, + { + "epoch": 2.0815182229890015, + "grad_norm": 0.18666943907737732, + "learning_rate": 3.255516523146765e-05, + "loss": 1.9698, + "step": 4825 + }, + { + "epoch": 2.0815182229890015, + "eval_loss": 2.089388370513916, + "eval_runtime": 195.2277, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 4825 + }, + { + "epoch": 2.0819495363381497, + "grad_norm": 0.18007901310920715, + "learning_rate": 3.252703274261367e-05, + "loss": 2.1054, + "step": 4826 + }, + { + "epoch": 2.082380849687298, + "grad_norm": 0.17201073467731476, + "learning_rate": 3.2498909048252024e-05, + "loss": 2.0739, + "step": 4827 + }, + { + "epoch": 2.082812163036446, + "grad_norm": 0.1966872364282608, + "learning_rate": 3.247079415420606e-05, + "loss": 2.2652, + "step": 4828 + }, + { + "epoch": 2.083243476385594, + "grad_norm": 0.19481778144836426, + "learning_rate": 3.244268806629724e-05, + "loss": 2.1652, + "step": 4829 + }, + { + "epoch": 2.0836747897347423, + "grad_norm": 0.18113017082214355, + "learning_rate": 3.2414590790345266e-05, + "loss": 2.0232, + "step": 4830 + }, + { + "epoch": 2.0841061030838905, + "grad_norm": 0.2021803855895996, + "learning_rate": 3.238650233216797e-05, + "loss": 2.2541, + "step": 4831 + }, + { + "epoch": 2.084537416433039, + "grad_norm": 0.19255667924880981, + "learning_rate": 3.235842269758139e-05, + "loss": 2.1073, + "step": 4832 + }, + { + "epoch": 2.0849687297821866, + "grad_norm": 0.18869124352931976, + "learning_rate": 3.23303518923997e-05, + "loss": 2.0478, + "step": 4833 + }, + { + "epoch": 2.085400043131335, + "grad_norm": 1.98809814453125, + "learning_rate": 3.230228992243528e-05, + "loss": 2.1051, + "step": 4834 + }, + { + "epoch": 2.085831356480483, + "grad_norm": 0.22017183899879456, + "learning_rate": 3.227423679349864e-05, + "loss": 2.3927, + "step": 4835 + }, + { + "epoch": 2.0862626698296314, + "grad_norm": 0.18822191655635834, + "learning_rate": 3.224619251139854e-05, + "loss": 2.0913, + "step": 4836 + }, + { + "epoch": 2.086693983178779, + "grad_norm": 0.18337860703468323, + "learning_rate": 3.221815708194185e-05, + "loss": 2.2166, + "step": 4837 + }, + { + "epoch": 2.0871252965279274, + "grad_norm": 0.19429202377796173, + "learning_rate": 3.2190130510933564e-05, + "loss": 2.1312, + "step": 4838 + }, + { + "epoch": 2.0875566098770757, + "grad_norm": 0.18895988166332245, + "learning_rate": 3.2162112804176896e-05, + "loss": 2.1958, + "step": 4839 + }, + { + "epoch": 2.087987923226224, + "grad_norm": 0.18243250250816345, + "learning_rate": 3.213410396747326e-05, + "loss": 2.1509, + "step": 4840 + }, + { + "epoch": 2.088419236575372, + "grad_norm": 0.2050832360982895, + "learning_rate": 3.2106104006622176e-05, + "loss": 2.1202, + "step": 4841 + }, + { + "epoch": 2.08885054992452, + "grad_norm": 0.4805981516838074, + "learning_rate": 3.207811292742135e-05, + "loss": 2.3342, + "step": 4842 + }, + { + "epoch": 2.0892818632736683, + "grad_norm": 0.18699911236763, + "learning_rate": 3.205013073566661e-05, + "loss": 2.0611, + "step": 4843 + }, + { + "epoch": 2.0897131766228165, + "grad_norm": 0.2027273327112198, + "learning_rate": 3.2022157437152e-05, + "loss": 2.3847, + "step": 4844 + }, + { + "epoch": 2.090144489971965, + "grad_norm": 0.18583311140537262, + "learning_rate": 3.199419303766968e-05, + "loss": 2.1031, + "step": 4845 + }, + { + "epoch": 2.0905758033211126, + "grad_norm": 0.18679817020893097, + "learning_rate": 3.196623754301e-05, + "loss": 2.1334, + "step": 4846 + }, + { + "epoch": 2.091007116670261, + "grad_norm": 0.2197234332561493, + "learning_rate": 3.193829095896145e-05, + "loss": 2.1589, + "step": 4847 + }, + { + "epoch": 2.091438430019409, + "grad_norm": 0.19306516647338867, + "learning_rate": 3.191035329131067e-05, + "loss": 2.0514, + "step": 4848 + }, + { + "epoch": 2.0918697433685574, + "grad_norm": 0.19103452563285828, + "learning_rate": 3.1882424545842435e-05, + "loss": 2.2015, + "step": 4849 + }, + { + "epoch": 2.0923010567177056, + "grad_norm": 0.2152070254087448, + "learning_rate": 3.1854504728339754e-05, + "loss": 2.2573, + "step": 4850 + }, + { + "epoch": 2.0923010567177056, + "eval_loss": 2.0900402069091797, + "eval_runtime": 195.8281, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 4850 + }, + { + "epoch": 2.0927323700668534, + "grad_norm": 4.18394136428833, + "learning_rate": 3.182659384458372e-05, + "loss": 1.9889, + "step": 4851 + }, + { + "epoch": 2.0931636834160017, + "grad_norm": 0.2186499387025833, + "learning_rate": 3.179869190035355e-05, + "loss": 2.1699, + "step": 4852 + }, + { + "epoch": 2.09359499676515, + "grad_norm": 0.20570923388004303, + "learning_rate": 3.1770798901426624e-05, + "loss": 2.1021, + "step": 4853 + }, + { + "epoch": 2.094026310114298, + "grad_norm": 0.5099383592605591, + "learning_rate": 3.174291485357856e-05, + "loss": 2.1402, + "step": 4854 + }, + { + "epoch": 2.094457623463446, + "grad_norm": 0.1793668121099472, + "learning_rate": 3.1715039762583034e-05, + "loss": 2.0289, + "step": 4855 + }, + { + "epoch": 2.0948889368125942, + "grad_norm": 0.19396327435970306, + "learning_rate": 3.1687173634211884e-05, + "loss": 2.0263, + "step": 4856 + }, + { + "epoch": 2.0953202501617425, + "grad_norm": 0.2883221507072449, + "learning_rate": 3.165931647423509e-05, + "loss": 2.1575, + "step": 4857 + }, + { + "epoch": 2.0957515635108908, + "grad_norm": 0.2391921877861023, + "learning_rate": 3.1631468288420786e-05, + "loss": 2.113, + "step": 4858 + }, + { + "epoch": 2.096182876860039, + "grad_norm": 0.21463486552238464, + "learning_rate": 3.160362908253524e-05, + "loss": 2.2261, + "step": 4859 + }, + { + "epoch": 2.096614190209187, + "grad_norm": 0.20496898889541626, + "learning_rate": 3.157579886234287e-05, + "loss": 2.1229, + "step": 4860 + }, + { + "epoch": 2.097045503558335, + "grad_norm": 0.21032458543777466, + "learning_rate": 3.1547977633606226e-05, + "loss": 2.1148, + "step": 4861 + }, + { + "epoch": 2.0974768169074833, + "grad_norm": 0.20047256350517273, + "learning_rate": 3.152016540208599e-05, + "loss": 2.2359, + "step": 4862 + }, + { + "epoch": 2.0979081302566316, + "grad_norm": 0.20530466735363007, + "learning_rate": 3.149236217354098e-05, + "loss": 2.1404, + "step": 4863 + }, + { + "epoch": 2.0983394436057794, + "grad_norm": 0.1921357959508896, + "learning_rate": 3.146456795372822e-05, + "loss": 2.2775, + "step": 4864 + }, + { + "epoch": 2.0987707569549277, + "grad_norm": 0.18893566727638245, + "learning_rate": 3.143678274840273e-05, + "loss": 2.0688, + "step": 4865 + }, + { + "epoch": 2.099202070304076, + "grad_norm": 0.19544082880020142, + "learning_rate": 3.140900656331778e-05, + "loss": 2.3392, + "step": 4866 + }, + { + "epoch": 2.099633383653224, + "grad_norm": 0.20501480996608734, + "learning_rate": 3.138123940422469e-05, + "loss": 2.3066, + "step": 4867 + }, + { + "epoch": 2.1000646970023724, + "grad_norm": 0.20116248726844788, + "learning_rate": 3.135348127687302e-05, + "loss": 2.1698, + "step": 4868 + }, + { + "epoch": 2.1004960103515202, + "grad_norm": 0.18571919202804565, + "learning_rate": 3.1325732187010385e-05, + "loss": 2.0816, + "step": 4869 + }, + { + "epoch": 2.1009273237006685, + "grad_norm": 0.22345946729183197, + "learning_rate": 3.1297992140382486e-05, + "loss": 2.177, + "step": 4870 + }, + { + "epoch": 2.1013586370498167, + "grad_norm": 0.24064777791500092, + "learning_rate": 3.127026114273321e-05, + "loss": 1.808, + "step": 4871 + }, + { + "epoch": 2.101789950398965, + "grad_norm": 0.19728608429431915, + "learning_rate": 3.124253919980461e-05, + "loss": 2.3509, + "step": 4872 + }, + { + "epoch": 2.102221263748113, + "grad_norm": 0.20240743458271027, + "learning_rate": 3.121482631733678e-05, + "loss": 2.2624, + "step": 4873 + }, + { + "epoch": 2.102652577097261, + "grad_norm": 65.099853515625, + "learning_rate": 3.1187122501068e-05, + "loss": 1.9778, + "step": 4874 + }, + { + "epoch": 2.1030838904464093, + "grad_norm": 0.19699519872665405, + "learning_rate": 3.115942775673464e-05, + "loss": 2.1841, + "step": 4875 + }, + { + "epoch": 2.1030838904464093, + "eval_loss": 2.0889768600463867, + "eval_runtime": 195.1623, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 4875 + }, + { + "epoch": 2.1035152037955576, + "grad_norm": 0.18935461342334747, + "learning_rate": 3.113174209007118e-05, + "loss": 2.1658, + "step": 4876 + }, + { + "epoch": 2.103946517144706, + "grad_norm": 0.19691212475299835, + "learning_rate": 3.110406550681027e-05, + "loss": 2.2538, + "step": 4877 + }, + { + "epoch": 2.1043778304938536, + "grad_norm": 0.20997954905033112, + "learning_rate": 3.107639801268263e-05, + "loss": 2.1355, + "step": 4878 + }, + { + "epoch": 2.104809143843002, + "grad_norm": 0.17843768000602722, + "learning_rate": 3.104873961341712e-05, + "loss": 2.1078, + "step": 4879 + }, + { + "epoch": 2.10524045719215, + "grad_norm": 0.1926320642232895, + "learning_rate": 3.102109031474071e-05, + "loss": 2.4314, + "step": 4880 + }, + { + "epoch": 2.1056717705412984, + "grad_norm": 0.19593141973018646, + "learning_rate": 3.099345012237848e-05, + "loss": 2.156, + "step": 4881 + }, + { + "epoch": 2.106103083890446, + "grad_norm": 0.2088003307580948, + "learning_rate": 3.0965819042053674e-05, + "loss": 2.1124, + "step": 4882 + }, + { + "epoch": 2.1065343972395945, + "grad_norm": 0.19262035191059113, + "learning_rate": 3.093819707948761e-05, + "loss": 2.2852, + "step": 4883 + }, + { + "epoch": 2.1069657105887427, + "grad_norm": 0.22138433158397675, + "learning_rate": 3.0910584240399665e-05, + "loss": 2.2125, + "step": 4884 + }, + { + "epoch": 2.107397023937891, + "grad_norm": 0.8872026801109314, + "learning_rate": 3.088298053050738e-05, + "loss": 2.2015, + "step": 4885 + }, + { + "epoch": 2.1078283372870392, + "grad_norm": 0.1949310153722763, + "learning_rate": 3.085538595552646e-05, + "loss": 2.0707, + "step": 4886 + }, + { + "epoch": 2.108259650636187, + "grad_norm": 0.19012707471847534, + "learning_rate": 3.0827800521170634e-05, + "loss": 2.1818, + "step": 4887 + }, + { + "epoch": 2.1086909639853353, + "grad_norm": 0.20888254046440125, + "learning_rate": 3.080022423315177e-05, + "loss": 2.2013, + "step": 4888 + }, + { + "epoch": 2.1091222773344835, + "grad_norm": 0.23211535811424255, + "learning_rate": 3.077265709717984e-05, + "loss": 1.8655, + "step": 4889 + }, + { + "epoch": 2.109553590683632, + "grad_norm": 0.2039070874452591, + "learning_rate": 3.074509911896292e-05, + "loss": 2.0674, + "step": 4890 + }, + { + "epoch": 2.1099849040327796, + "grad_norm": 0.20388320088386536, + "learning_rate": 3.071755030420719e-05, + "loss": 2.1095, + "step": 4891 + }, + { + "epoch": 2.110416217381928, + "grad_norm": 0.2010328471660614, + "learning_rate": 3.0690010658616926e-05, + "loss": 2.1818, + "step": 4892 + }, + { + "epoch": 2.110847530731076, + "grad_norm": 0.20505084097385406, + "learning_rate": 3.066248018789454e-05, + "loss": 2.0214, + "step": 4893 + }, + { + "epoch": 2.1112788440802244, + "grad_norm": 0.19544564187526703, + "learning_rate": 3.0634958897740486e-05, + "loss": 1.857, + "step": 4894 + }, + { + "epoch": 2.1117101574293726, + "grad_norm": 0.20017707347869873, + "learning_rate": 3.0607446793853326e-05, + "loss": 2.2595, + "step": 4895 + }, + { + "epoch": 2.1121414707785204, + "grad_norm": 0.21018147468566895, + "learning_rate": 3.057994388192981e-05, + "loss": 2.0568, + "step": 4896 + }, + { + "epoch": 2.1125727841276687, + "grad_norm": 0.20233690738677979, + "learning_rate": 3.0552450167664716e-05, + "loss": 2.3687, + "step": 4897 + }, + { + "epoch": 2.113004097476817, + "grad_norm": 0.20755895972251892, + "learning_rate": 3.0524965656750845e-05, + "loss": 2.2019, + "step": 4898 + }, + { + "epoch": 2.113435410825965, + "grad_norm": 0.22112730145454407, + "learning_rate": 3.049749035487919e-05, + "loss": 2.1214, + "step": 4899 + }, + { + "epoch": 2.113866724175113, + "grad_norm": 0.1795358955860138, + "learning_rate": 3.0470024267738848e-05, + "loss": 2.0371, + "step": 4900 + }, + { + "epoch": 2.113866724175113, + "eval_loss": 2.0892462730407715, + "eval_runtime": 195.3029, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 4900 + }, + { + "epoch": 2.1142980375242613, + "grad_norm": 0.18432939052581787, + "learning_rate": 3.044256740101696e-05, + "loss": 2.2024, + "step": 4901 + }, + { + "epoch": 2.1147293508734095, + "grad_norm": 0.195393368601799, + "learning_rate": 3.041511976039876e-05, + "loss": 2.1915, + "step": 4902 + }, + { + "epoch": 2.1151606642225578, + "grad_norm": 0.21330732107162476, + "learning_rate": 3.0387681351567586e-05, + "loss": 2.1721, + "step": 4903 + }, + { + "epoch": 2.115591977571706, + "grad_norm": 0.1951751857995987, + "learning_rate": 3.036025218020486e-05, + "loss": 2.1533, + "step": 4904 + }, + { + "epoch": 2.116023290920854, + "grad_norm": 0.2118057757616043, + "learning_rate": 3.0332832251990092e-05, + "loss": 2.1225, + "step": 4905 + }, + { + "epoch": 2.116454604270002, + "grad_norm": 0.17883312702178955, + "learning_rate": 3.0305421572600876e-05, + "loss": 2.022, + "step": 4906 + }, + { + "epoch": 2.1168859176191503, + "grad_norm": 0.20854587852954865, + "learning_rate": 3.0278020147712895e-05, + "loss": 2.2135, + "step": 4907 + }, + { + "epoch": 2.1173172309682986, + "grad_norm": 0.1910894513130188, + "learning_rate": 3.0250627982999915e-05, + "loss": 2.1853, + "step": 4908 + }, + { + "epoch": 2.1177485443174464, + "grad_norm": 0.20413783192634583, + "learning_rate": 3.022324508413376e-05, + "loss": 2.0861, + "step": 4909 + }, + { + "epoch": 2.1181798576665947, + "grad_norm": 0.2057296335697174, + "learning_rate": 3.019587145678443e-05, + "loss": 2.227, + "step": 4910 + }, + { + "epoch": 2.118611171015743, + "grad_norm": 0.19873298704624176, + "learning_rate": 3.0168507106619856e-05, + "loss": 2.1258, + "step": 4911 + }, + { + "epoch": 2.119042484364891, + "grad_norm": 0.20596614480018616, + "learning_rate": 3.014115203930615e-05, + "loss": 2.2611, + "step": 4912 + }, + { + "epoch": 2.1194737977140394, + "grad_norm": 0.8568402528762817, + "learning_rate": 3.0113806260507466e-05, + "loss": 2.333, + "step": 4913 + }, + { + "epoch": 2.1199051110631872, + "grad_norm": 0.18790197372436523, + "learning_rate": 3.008646977588609e-05, + "loss": 2.1652, + "step": 4914 + }, + { + "epoch": 2.1203364244123355, + "grad_norm": 0.19961890578269958, + "learning_rate": 3.0059142591102328e-05, + "loss": 2.083, + "step": 4915 + }, + { + "epoch": 2.1207677377614838, + "grad_norm": 0.19002991914749146, + "learning_rate": 3.0031824711814517e-05, + "loss": 2.2128, + "step": 4916 + }, + { + "epoch": 2.121199051110632, + "grad_norm": 0.1990480273962021, + "learning_rate": 3.000451614367918e-05, + "loss": 2.1486, + "step": 4917 + }, + { + "epoch": 2.12163036445978, + "grad_norm": 0.1983197182416916, + "learning_rate": 2.9977216892350845e-05, + "loss": 2.1523, + "step": 4918 + }, + { + "epoch": 2.122061677808928, + "grad_norm": 0.20927980542182922, + "learning_rate": 2.9949926963482106e-05, + "loss": 2.2411, + "step": 4919 + }, + { + "epoch": 2.1224929911580763, + "grad_norm": 1.3745379447937012, + "learning_rate": 2.9922646362723658e-05, + "loss": 2.1764, + "step": 4920 + }, + { + "epoch": 2.1229243045072246, + "grad_norm": 0.20675300061702728, + "learning_rate": 2.9895375095724223e-05, + "loss": 2.1202, + "step": 4921 + }, + { + "epoch": 2.123355617856373, + "grad_norm": 0.18410125374794006, + "learning_rate": 2.986811316813064e-05, + "loss": 2.2244, + "step": 4922 + }, + { + "epoch": 2.1237869312055206, + "grad_norm": 0.4287169575691223, + "learning_rate": 2.9840860585587783e-05, + "loss": 2.3137, + "step": 4923 + }, + { + "epoch": 2.124218244554669, + "grad_norm": 0.19607892632484436, + "learning_rate": 2.9813617353738596e-05, + "loss": 2.0012, + "step": 4924 + }, + { + "epoch": 2.124649557903817, + "grad_norm": 0.19588974118232727, + "learning_rate": 2.9786383478224072e-05, + "loss": 2.2361, + "step": 4925 + }, + { + "epoch": 2.124649557903817, + "eval_loss": 2.089700222015381, + "eval_runtime": 195.2735, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 4925 + }, + { + "epoch": 2.1250808712529654, + "grad_norm": 0.1861879676580429, + "learning_rate": 2.975915896468331e-05, + "loss": 2.1908, + "step": 4926 + }, + { + "epoch": 2.1255121846021137, + "grad_norm": 0.1903456449508667, + "learning_rate": 2.9731943818753394e-05, + "loss": 2.1286, + "step": 4927 + }, + { + "epoch": 2.1259434979512615, + "grad_norm": 0.19927555322647095, + "learning_rate": 2.9704738046069584e-05, + "loss": 2.1186, + "step": 4928 + }, + { + "epoch": 2.1263748113004097, + "grad_norm": 0.20145562291145325, + "learning_rate": 2.967754165226513e-05, + "loss": 2.1554, + "step": 4929 + }, + { + "epoch": 2.126806124649558, + "grad_norm": 0.18136072158813477, + "learning_rate": 2.9650354642971286e-05, + "loss": 1.9959, + "step": 4930 + }, + { + "epoch": 2.1272374379987062, + "grad_norm": 0.1880529671907425, + "learning_rate": 2.9623177023817423e-05, + "loss": 2.0683, + "step": 4931 + }, + { + "epoch": 2.127668751347854, + "grad_norm": 0.1958688646554947, + "learning_rate": 2.959600880043102e-05, + "loss": 2.3095, + "step": 4932 + }, + { + "epoch": 2.1281000646970023, + "grad_norm": 0.19369158148765564, + "learning_rate": 2.9568849978437517e-05, + "loss": 2.072, + "step": 4933 + }, + { + "epoch": 2.1285313780461506, + "grad_norm": 0.19699695706367493, + "learning_rate": 2.9541700563460457e-05, + "loss": 2.2225, + "step": 4934 + }, + { + "epoch": 2.128962691395299, + "grad_norm": 0.21904636919498444, + "learning_rate": 2.951456056112142e-05, + "loss": 1.9931, + "step": 4935 + }, + { + "epoch": 2.129394004744447, + "grad_norm": 0.18061836063861847, + "learning_rate": 2.9487429977040034e-05, + "loss": 2.3713, + "step": 4936 + }, + { + "epoch": 2.129825318093595, + "grad_norm": 0.23203131556510925, + "learning_rate": 2.9460308816833988e-05, + "loss": 2.1424, + "step": 4937 + }, + { + "epoch": 2.130256631442743, + "grad_norm": 0.19963587820529938, + "learning_rate": 2.9433197086119022e-05, + "loss": 2.0433, + "step": 4938 + }, + { + "epoch": 2.1306879447918914, + "grad_norm": 0.1829756796360016, + "learning_rate": 2.9406094790508904e-05, + "loss": 2.0641, + "step": 4939 + }, + { + "epoch": 2.1311192581410396, + "grad_norm": 0.21752561628818512, + "learning_rate": 2.937900193561547e-05, + "loss": 2.1851, + "step": 4940 + }, + { + "epoch": 2.1315505714901875, + "grad_norm": 0.1959228664636612, + "learning_rate": 2.9351918527048557e-05, + "loss": 2.3427, + "step": 4941 + }, + { + "epoch": 2.1319818848393357, + "grad_norm": 0.17727304995059967, + "learning_rate": 2.932484457041617e-05, + "loss": 2.0664, + "step": 4942 + }, + { + "epoch": 2.132413198188484, + "grad_norm": 0.18500760197639465, + "learning_rate": 2.9297780071324184e-05, + "loss": 2.2685, + "step": 4943 + }, + { + "epoch": 2.132844511537632, + "grad_norm": 0.19489331543445587, + "learning_rate": 2.9270725035376625e-05, + "loss": 2.0761, + "step": 4944 + }, + { + "epoch": 2.1332758248867805, + "grad_norm": 0.19658870995044708, + "learning_rate": 2.92436794681755e-05, + "loss": 2.1817, + "step": 4945 + }, + { + "epoch": 2.1337071382359283, + "grad_norm": 0.1881716102361679, + "learning_rate": 2.921664337532095e-05, + "loss": 2.1831, + "step": 4946 + }, + { + "epoch": 2.1341384515850765, + "grad_norm": 0.19964499771595, + "learning_rate": 2.918961676241111e-05, + "loss": 2.2812, + "step": 4947 + }, + { + "epoch": 2.134569764934225, + "grad_norm": 0.19984838366508484, + "learning_rate": 2.9162599635042025e-05, + "loss": 2.214, + "step": 4948 + }, + { + "epoch": 2.135001078283373, + "grad_norm": 0.3285719156265259, + "learning_rate": 2.9135591998807985e-05, + "loss": 2.0155, + "step": 4949 + }, + { + "epoch": 2.135432391632521, + "grad_norm": 0.2052600383758545, + "learning_rate": 2.9108593859301185e-05, + "loss": 2.1134, + "step": 4950 + }, + { + "epoch": 2.135432391632521, + "eval_loss": 2.0897464752197266, + "eval_runtime": 195.252, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 4950 + }, + { + "epoch": 2.135863704981669, + "grad_norm": 0.18723013997077942, + "learning_rate": 2.9081605222111893e-05, + "loss": 1.8742, + "step": 4951 + }, + { + "epoch": 2.1362950183308174, + "grad_norm": 0.20071329176425934, + "learning_rate": 2.9054626092828386e-05, + "loss": 2.1228, + "step": 4952 + }, + { + "epoch": 2.1367263316799656, + "grad_norm": 0.19112932682037354, + "learning_rate": 2.9027656477037002e-05, + "loss": 2.1832, + "step": 4953 + }, + { + "epoch": 2.137157645029114, + "grad_norm": 0.1887974590063095, + "learning_rate": 2.900069638032209e-05, + "loss": 2.0068, + "step": 4954 + }, + { + "epoch": 2.1375889583782617, + "grad_norm": 0.2175447791814804, + "learning_rate": 2.8973745808266e-05, + "loss": 2.4054, + "step": 4955 + }, + { + "epoch": 2.13802027172741, + "grad_norm": 0.19409482181072235, + "learning_rate": 2.8946804766449234e-05, + "loss": 2.2914, + "step": 4956 + }, + { + "epoch": 2.138451585076558, + "grad_norm": 0.20012657344341278, + "learning_rate": 2.8919873260450146e-05, + "loss": 2.3076, + "step": 4957 + }, + { + "epoch": 2.1388828984257064, + "grad_norm": 0.19830457866191864, + "learning_rate": 2.8892951295845222e-05, + "loss": 2.336, + "step": 4958 + }, + { + "epoch": 2.1393142117748543, + "grad_norm": 0.1941172331571579, + "learning_rate": 2.8866038878208915e-05, + "loss": 2.2341, + "step": 4959 + }, + { + "epoch": 2.1397455251240025, + "grad_norm": 0.20140814781188965, + "learning_rate": 2.883913601311381e-05, + "loss": 2.249, + "step": 4960 + }, + { + "epoch": 2.1401768384731508, + "grad_norm": 0.19238823652267456, + "learning_rate": 2.881224270613042e-05, + "loss": 2.0708, + "step": 4961 + }, + { + "epoch": 2.140608151822299, + "grad_norm": 0.19633318483829498, + "learning_rate": 2.8785358962827222e-05, + "loss": 2.0966, + "step": 4962 + }, + { + "epoch": 2.1410394651714473, + "grad_norm": 0.2094806581735611, + "learning_rate": 2.8758484788770876e-05, + "loss": 2.0274, + "step": 4963 + }, + { + "epoch": 2.141470778520595, + "grad_norm": 0.197383850812912, + "learning_rate": 2.873162018952595e-05, + "loss": 2.2224, + "step": 4964 + }, + { + "epoch": 2.1419020918697433, + "grad_norm": 0.2031429558992386, + "learning_rate": 2.8704765170655032e-05, + "loss": 2.0326, + "step": 4965 + }, + { + "epoch": 2.1423334052188916, + "grad_norm": 0.18839189410209656, + "learning_rate": 2.867791973771877e-05, + "loss": 2.1396, + "step": 4966 + }, + { + "epoch": 2.14276471856804, + "grad_norm": 0.18205758929252625, + "learning_rate": 2.8651083896275798e-05, + "loss": 2.1926, + "step": 4967 + }, + { + "epoch": 2.1431960319171877, + "grad_norm": 0.20114341378211975, + "learning_rate": 2.8624257651882755e-05, + "loss": 1.9477, + "step": 4968 + }, + { + "epoch": 2.143627345266336, + "grad_norm": 0.21320831775665283, + "learning_rate": 2.8597441010094336e-05, + "loss": 2.3524, + "step": 4969 + }, + { + "epoch": 2.144058658615484, + "grad_norm": 0.17399685084819794, + "learning_rate": 2.85706339764632e-05, + "loss": 2.1856, + "step": 4970 + }, + { + "epoch": 2.1444899719646324, + "grad_norm": 0.20471583306789398, + "learning_rate": 2.8543836556540045e-05, + "loss": 2.1719, + "step": 4971 + }, + { + "epoch": 2.1449212853137807, + "grad_norm": 0.19019953906536102, + "learning_rate": 2.8517048755873564e-05, + "loss": 2.2953, + "step": 4972 + }, + { + "epoch": 2.1453525986629285, + "grad_norm": 0.18899403512477875, + "learning_rate": 2.8490270580010448e-05, + "loss": 2.0092, + "step": 4973 + }, + { + "epoch": 2.1457839120120767, + "grad_norm": 0.18942353129386902, + "learning_rate": 2.846350203449545e-05, + "loss": 2.15, + "step": 4974 + }, + { + "epoch": 2.146215225361225, + "grad_norm": 0.2105664610862732, + "learning_rate": 2.843674312487129e-05, + "loss": 2.2613, + "step": 4975 + }, + { + "epoch": 2.146215225361225, + "eval_loss": 2.0892958641052246, + "eval_runtime": 212.7334, + "eval_samples_per_second": 0.15, + "eval_steps_per_second": 0.15, + "step": 4975 + }, + { + "epoch": 2.1466465387103733, + "grad_norm": 0.19566898047924042, + "learning_rate": 2.8409993856678655e-05, + "loss": 2.0619, + "step": 4976 + }, + { + "epoch": 2.147077852059521, + "grad_norm": 0.18931123614311218, + "learning_rate": 2.8383254235456268e-05, + "loss": 2.1251, + "step": 4977 + }, + { + "epoch": 2.1475091654086693, + "grad_norm": 0.20515230298042297, + "learning_rate": 2.8356524266740908e-05, + "loss": 2.0915, + "step": 4978 + }, + { + "epoch": 2.1479404787578176, + "grad_norm": 0.20855127274990082, + "learning_rate": 2.832980395606728e-05, + "loss": 2.108, + "step": 4979 + }, + { + "epoch": 2.148371792106966, + "grad_norm": 0.18840979039669037, + "learning_rate": 2.8303093308968126e-05, + "loss": 2.2457, + "step": 4980 + }, + { + "epoch": 2.148803105456114, + "grad_norm": 0.20065249502658844, + "learning_rate": 2.8276392330974164e-05, + "loss": 2.1103, + "step": 4981 + }, + { + "epoch": 2.149234418805262, + "grad_norm": 0.19785119593143463, + "learning_rate": 2.824970102761413e-05, + "loss": 2.1526, + "step": 4982 + }, + { + "epoch": 2.14966573215441, + "grad_norm": 0.18779438734054565, + "learning_rate": 2.8223019404414755e-05, + "loss": 1.9152, + "step": 4983 + }, + { + "epoch": 2.1500970455035584, + "grad_norm": 0.20784412324428558, + "learning_rate": 2.8196347466900757e-05, + "loss": 2.136, + "step": 4984 + }, + { + "epoch": 2.1505283588527067, + "grad_norm": 0.19135190546512604, + "learning_rate": 2.8169685220594852e-05, + "loss": 2.1212, + "step": 4985 + }, + { + "epoch": 2.1509596722018545, + "grad_norm": 0.20130488276481628, + "learning_rate": 2.8143032671017748e-05, + "loss": 2.4026, + "step": 4986 + }, + { + "epoch": 2.1513909855510027, + "grad_norm": 0.17856507003307343, + "learning_rate": 2.8116389823688117e-05, + "loss": 2.0381, + "step": 4987 + }, + { + "epoch": 2.151822298900151, + "grad_norm": 0.1963197886943817, + "learning_rate": 2.8089756684122748e-05, + "loss": 2.0304, + "step": 4988 + }, + { + "epoch": 2.1522536122492992, + "grad_norm": 0.22759243845939636, + "learning_rate": 2.8063133257836222e-05, + "loss": 2.085, + "step": 4989 + }, + { + "epoch": 2.1526849255984475, + "grad_norm": 0.1714658886194229, + "learning_rate": 2.8036519550341253e-05, + "loss": 1.9922, + "step": 4990 + }, + { + "epoch": 2.1531162389475953, + "grad_norm": 0.20330412685871124, + "learning_rate": 2.8009915567148466e-05, + "loss": 2.0577, + "step": 4991 + }, + { + "epoch": 2.1535475522967436, + "grad_norm": 0.20327334105968475, + "learning_rate": 2.7983321313766568e-05, + "loss": 2.1033, + "step": 4992 + }, + { + "epoch": 2.153978865645892, + "grad_norm": 0.1952381581068039, + "learning_rate": 2.795673679570218e-05, + "loss": 2.2632, + "step": 4993 + }, + { + "epoch": 2.15441017899504, + "grad_norm": 0.2106030136346817, + "learning_rate": 2.793016201845984e-05, + "loss": 2.3056, + "step": 4994 + }, + { + "epoch": 2.154841492344188, + "grad_norm": 0.2100643515586853, + "learning_rate": 2.7903596987542226e-05, + "loss": 2.1765, + "step": 4995 + }, + { + "epoch": 2.155272805693336, + "grad_norm": 0.18079394102096558, + "learning_rate": 2.78770417084499e-05, + "loss": 2.2167, + "step": 4996 + }, + { + "epoch": 2.1557041190424844, + "grad_norm": 0.19508136808872223, + "learning_rate": 2.7850496186681413e-05, + "loss": 2.1174, + "step": 4997 + }, + { + "epoch": 2.1561354323916326, + "grad_norm": 0.19507291913032532, + "learning_rate": 2.782396042773331e-05, + "loss": 2.1595, + "step": 4998 + }, + { + "epoch": 2.156566745740781, + "grad_norm": 0.20600342750549316, + "learning_rate": 2.779743443710011e-05, + "loss": 2.1292, + "step": 4999 + }, + { + "epoch": 2.1569980590899287, + "grad_norm": 0.1863345503807068, + "learning_rate": 2.77709182202743e-05, + "loss": 2.0938, + "step": 5000 + }, + { + "epoch": 2.1569980590899287, + "eval_loss": 2.0890560150146484, + "eval_runtime": 206.7094, + "eval_samples_per_second": 0.155, + "eval_steps_per_second": 0.155, + "step": 5000 + }, + { + "epoch": 2.157429372439077, + "grad_norm": 0.22767731547355652, + "learning_rate": 2.7744411782746343e-05, + "loss": 2.0735, + "step": 5001 + }, + { + "epoch": 2.157860685788225, + "grad_norm": 0.20078809559345245, + "learning_rate": 2.7717915130004757e-05, + "loss": 2.2704, + "step": 5002 + }, + { + "epoch": 2.1582919991373735, + "grad_norm": 0.18619324266910553, + "learning_rate": 2.7691428267535885e-05, + "loss": 2.1069, + "step": 5003 + }, + { + "epoch": 2.1587233124865213, + "grad_norm": 0.21010684967041016, + "learning_rate": 2.766495120082415e-05, + "loss": 2.1341, + "step": 5004 + }, + { + "epoch": 2.1591546258356695, + "grad_norm": 0.18977265059947968, + "learning_rate": 2.763848393535188e-05, + "loss": 1.9988, + "step": 5005 + }, + { + "epoch": 2.159585939184818, + "grad_norm": 0.22023966908454895, + "learning_rate": 2.761202647659948e-05, + "loss": 2.1059, + "step": 5006 + }, + { + "epoch": 2.160017252533966, + "grad_norm": 0.18467730283737183, + "learning_rate": 2.7585578830045253e-05, + "loss": 2.072, + "step": 5007 + }, + { + "epoch": 2.1604485658831143, + "grad_norm": 0.19137078523635864, + "learning_rate": 2.7559141001165396e-05, + "loss": 2.2661, + "step": 5008 + }, + { + "epoch": 2.160879879232262, + "grad_norm": 0.19556930661201477, + "learning_rate": 2.753271299543422e-05, + "loss": 2.017, + "step": 5009 + }, + { + "epoch": 2.1613111925814104, + "grad_norm": 0.19731999933719635, + "learning_rate": 2.750629481832391e-05, + "loss": 2.1438, + "step": 5010 + }, + { + "epoch": 2.1617425059305586, + "grad_norm": 0.17040055990219116, + "learning_rate": 2.747988647530464e-05, + "loss": 2.0037, + "step": 5011 + }, + { + "epoch": 2.162173819279707, + "grad_norm": 0.20169128477573395, + "learning_rate": 2.7453487971844548e-05, + "loss": 2.1116, + "step": 5012 + }, + { + "epoch": 2.1626051326288547, + "grad_norm": 0.19329191744327545, + "learning_rate": 2.7427099313409733e-05, + "loss": 2.0255, + "step": 5013 + }, + { + "epoch": 2.163036445978003, + "grad_norm": 0.19396258890628815, + "learning_rate": 2.7400720505464253e-05, + "loss": 2.1873, + "step": 5014 + }, + { + "epoch": 2.163467759327151, + "grad_norm": 0.1850919872522354, + "learning_rate": 2.7374351553470124e-05, + "loss": 2.2272, + "step": 5015 + }, + { + "epoch": 2.1638990726762994, + "grad_norm": 0.1951063722372055, + "learning_rate": 2.7347992462887335e-05, + "loss": 2.1948, + "step": 5016 + }, + { + "epoch": 2.1643303860254477, + "grad_norm": 0.19742482900619507, + "learning_rate": 2.7321643239173817e-05, + "loss": 2.3527, + "step": 5017 + }, + { + "epoch": 2.1647616993745955, + "grad_norm": 0.19653549790382385, + "learning_rate": 2.7295303887785475e-05, + "loss": 2.1226, + "step": 5018 + }, + { + "epoch": 2.1651930127237438, + "grad_norm": 0.19391842186450958, + "learning_rate": 2.7268974414176124e-05, + "loss": 2.3314, + "step": 5019 + }, + { + "epoch": 2.165624326072892, + "grad_norm": 0.2106139212846756, + "learning_rate": 2.7242654823797627e-05, + "loss": 2.0381, + "step": 5020 + }, + { + "epoch": 2.1660556394220403, + "grad_norm": 0.19162026047706604, + "learning_rate": 2.7216345122099737e-05, + "loss": 2.1074, + "step": 5021 + }, + { + "epoch": 2.166486952771188, + "grad_norm": 0.18334653973579407, + "learning_rate": 2.7190045314530112e-05, + "loss": 2.1011, + "step": 5022 + }, + { + "epoch": 2.1669182661203363, + "grad_norm": 0.20370136201381683, + "learning_rate": 2.7163755406534425e-05, + "loss": 2.2357, + "step": 5023 + }, + { + "epoch": 2.1673495794694846, + "grad_norm": 0.19202132523059845, + "learning_rate": 2.7137475403556333e-05, + "loss": 2.2211, + "step": 5024 + }, + { + "epoch": 2.167780892818633, + "grad_norm": 0.1901710033416748, + "learning_rate": 2.711120531103738e-05, + "loss": 2.1074, + "step": 5025 + }, + { + "epoch": 2.167780892818633, + "eval_loss": 2.0886192321777344, + "eval_runtime": 207.6955, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 5025 + }, + { + "epoch": 2.168212206167781, + "grad_norm": 0.1857885867357254, + "learning_rate": 2.708494513441706e-05, + "loss": 1.9152, + "step": 5026 + }, + { + "epoch": 2.168643519516929, + "grad_norm": 0.19882744550704956, + "learning_rate": 2.7058694879132852e-05, + "loss": 2.0951, + "step": 5027 + }, + { + "epoch": 2.169074832866077, + "grad_norm": 0.3018729090690613, + "learning_rate": 2.703245455062014e-05, + "loss": 2.1891, + "step": 5028 + }, + { + "epoch": 2.1695061462152254, + "grad_norm": 0.20267686247825623, + "learning_rate": 2.7006224154312274e-05, + "loss": 1.928, + "step": 5029 + }, + { + "epoch": 2.1699374595643737, + "grad_norm": 0.19706034660339355, + "learning_rate": 2.698000369564055e-05, + "loss": 2.3663, + "step": 5030 + }, + { + "epoch": 2.1703687729135215, + "grad_norm": 0.19120123982429504, + "learning_rate": 2.6953793180034207e-05, + "loss": 2.2033, + "step": 5031 + }, + { + "epoch": 2.1708000862626697, + "grad_norm": 0.19281716644763947, + "learning_rate": 2.6927592612920404e-05, + "loss": 2.2885, + "step": 5032 + }, + { + "epoch": 2.171231399611818, + "grad_norm": 0.22012881934642792, + "learning_rate": 2.690140199972423e-05, + "loss": 2.158, + "step": 5033 + }, + { + "epoch": 2.1716627129609662, + "grad_norm": 0.1995842605829239, + "learning_rate": 2.687522134586883e-05, + "loss": 2.039, + "step": 5034 + }, + { + "epoch": 2.1720940263101145, + "grad_norm": 0.18780255317687988, + "learning_rate": 2.6849050656775107e-05, + "loss": 2.103, + "step": 5035 + }, + { + "epoch": 2.1725253396592623, + "grad_norm": 0.20365270972251892, + "learning_rate": 2.6822889937862015e-05, + "loss": 2.2397, + "step": 5036 + }, + { + "epoch": 2.1729566530084106, + "grad_norm": 0.19723571836948395, + "learning_rate": 2.679673919454639e-05, + "loss": 2.0369, + "step": 5037 + }, + { + "epoch": 2.173387966357559, + "grad_norm": 0.20004676282405853, + "learning_rate": 2.677059843224308e-05, + "loss": 2.171, + "step": 5038 + }, + { + "epoch": 2.173819279706707, + "grad_norm": 0.1994907110929489, + "learning_rate": 2.674446765636482e-05, + "loss": 2.2445, + "step": 5039 + }, + { + "epoch": 2.174250593055855, + "grad_norm": 0.1878870576620102, + "learning_rate": 2.6718346872322197e-05, + "loss": 2.123, + "step": 5040 + }, + { + "epoch": 2.174681906405003, + "grad_norm": 0.20285697281360626, + "learning_rate": 2.6692236085523873e-05, + "loss": 2.0787, + "step": 5041 + }, + { + "epoch": 2.1751132197541514, + "grad_norm": 0.2116626501083374, + "learning_rate": 2.666613530137636e-05, + "loss": 2.1716, + "step": 5042 + }, + { + "epoch": 2.1755445331032996, + "grad_norm": 0.19868099689483643, + "learning_rate": 2.6640044525284097e-05, + "loss": 2.1836, + "step": 5043 + }, + { + "epoch": 2.175975846452448, + "grad_norm": 0.18697194755077362, + "learning_rate": 2.661396376264947e-05, + "loss": 2.1, + "step": 5044 + }, + { + "epoch": 2.1764071598015957, + "grad_norm": 0.1909298449754715, + "learning_rate": 2.658789301887279e-05, + "loss": 2.0179, + "step": 5045 + }, + { + "epoch": 2.176838473150744, + "grad_norm": 0.18372386693954468, + "learning_rate": 2.656183229935229e-05, + "loss": 1.861, + "step": 5046 + }, + { + "epoch": 2.1772697864998922, + "grad_norm": 0.20448540151119232, + "learning_rate": 2.653578160948409e-05, + "loss": 2.2202, + "step": 5047 + }, + { + "epoch": 2.1777010998490405, + "grad_norm": 0.19497567415237427, + "learning_rate": 2.6509740954662355e-05, + "loss": 2.1244, + "step": 5048 + }, + { + "epoch": 2.1781324131981883, + "grad_norm": 0.20127350091934204, + "learning_rate": 2.6483710340279013e-05, + "loss": 2.2618, + "step": 5049 + }, + { + "epoch": 2.1785637265473365, + "grad_norm": 0.1878756284713745, + "learning_rate": 2.645768977172401e-05, + "loss": 2.1859, + "step": 5050 + }, + { + "epoch": 2.1785637265473365, + "eval_loss": 2.088935375213623, + "eval_runtime": 207.6666, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 5050 + }, + { + "epoch": 2.178995039896485, + "grad_norm": 0.19722066819667816, + "learning_rate": 2.6431679254385162e-05, + "loss": 2.2203, + "step": 5051 + }, + { + "epoch": 2.179426353245633, + "grad_norm": 0.19405268132686615, + "learning_rate": 2.6405678793648293e-05, + "loss": 2.1512, + "step": 5052 + }, + { + "epoch": 2.1798576665947813, + "grad_norm": 0.20881058275699615, + "learning_rate": 2.637968839489706e-05, + "loss": 2.262, + "step": 5053 + }, + { + "epoch": 2.180288979943929, + "grad_norm": 0.1833294779062271, + "learning_rate": 2.635370806351301e-05, + "loss": 2.1621, + "step": 5054 + }, + { + "epoch": 2.1807202932930774, + "grad_norm": 0.20712466537952423, + "learning_rate": 2.632773780487571e-05, + "loss": 2.1567, + "step": 5055 + }, + { + "epoch": 2.1811516066422256, + "grad_norm": 0.2082417905330658, + "learning_rate": 2.6301777624362564e-05, + "loss": 2.3652, + "step": 5056 + }, + { + "epoch": 2.181582919991374, + "grad_norm": 0.19520720839500427, + "learning_rate": 2.627582752734893e-05, + "loss": 2.1002, + "step": 5057 + }, + { + "epoch": 2.1820142333405217, + "grad_norm": 0.2027909755706787, + "learning_rate": 2.624988751920803e-05, + "loss": 2.0569, + "step": 5058 + }, + { + "epoch": 2.18244554668967, + "grad_norm": 0.18182145059108734, + "learning_rate": 2.622395760531105e-05, + "loss": 1.9879, + "step": 5059 + }, + { + "epoch": 2.182876860038818, + "grad_norm": 0.25678521394729614, + "learning_rate": 2.6198037791027054e-05, + "loss": 2.3149, + "step": 5060 + }, + { + "epoch": 2.1833081733879665, + "grad_norm": 0.19661037623882294, + "learning_rate": 2.617212808172302e-05, + "loss": 2.1367, + "step": 5061 + }, + { + "epoch": 2.1837394867371147, + "grad_norm": 0.19917482137680054, + "learning_rate": 2.614622848276384e-05, + "loss": 2.2148, + "step": 5062 + }, + { + "epoch": 2.1841708000862625, + "grad_norm": 0.18143093585968018, + "learning_rate": 2.6120338999512307e-05, + "loss": 2.1091, + "step": 5063 + }, + { + "epoch": 2.1846021134354108, + "grad_norm": 0.1831212192773819, + "learning_rate": 2.609445963732912e-05, + "loss": 1.9578, + "step": 5064 + }, + { + "epoch": 2.185033426784559, + "grad_norm": 0.2225896567106247, + "learning_rate": 2.6068590401572868e-05, + "loss": 2.2214, + "step": 5065 + }, + { + "epoch": 2.1854647401337073, + "grad_norm": 0.19103094935417175, + "learning_rate": 2.6042731297600125e-05, + "loss": 2.0882, + "step": 5066 + }, + { + "epoch": 2.185896053482855, + "grad_norm": 0.1908239871263504, + "learning_rate": 2.601688233076523e-05, + "loss": 2.089, + "step": 5067 + }, + { + "epoch": 2.1863273668320033, + "grad_norm": 0.18195314705371857, + "learning_rate": 2.599104350642052e-05, + "loss": 2.1344, + "step": 5068 + }, + { + "epoch": 2.1867586801811516, + "grad_norm": 0.20526288449764252, + "learning_rate": 2.5965214829916175e-05, + "loss": 2.174, + "step": 5069 + }, + { + "epoch": 2.1871899935303, + "grad_norm": 0.17393609881401062, + "learning_rate": 2.5939396306600367e-05, + "loss": 2.0965, + "step": 5070 + }, + { + "epoch": 2.187621306879448, + "grad_norm": 0.19621096551418304, + "learning_rate": 2.591358794181909e-05, + "loss": 2.0679, + "step": 5071 + }, + { + "epoch": 2.188052620228596, + "grad_norm": 0.1995396614074707, + "learning_rate": 2.5887789740916188e-05, + "loss": 2.2099, + "step": 5072 + }, + { + "epoch": 2.188483933577744, + "grad_norm": 0.19627083837985992, + "learning_rate": 2.5862001709233524e-05, + "loss": 2.2104, + "step": 5073 + }, + { + "epoch": 2.1889152469268924, + "grad_norm": 0.1899205595254898, + "learning_rate": 2.5836223852110762e-05, + "loss": 1.9811, + "step": 5074 + }, + { + "epoch": 2.1893465602760407, + "grad_norm": 0.20641696453094482, + "learning_rate": 2.581045617488551e-05, + "loss": 2.1019, + "step": 5075 + }, + { + "epoch": 2.1893465602760407, + "eval_loss": 2.0890145301818848, + "eval_runtime": 203.979, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5075 + }, + { + "epoch": 2.1897778736251885, + "grad_norm": 0.19259873032569885, + "learning_rate": 2.5784698682893224e-05, + "loss": 2.1706, + "step": 5076 + }, + { + "epoch": 2.1902091869743368, + "grad_norm": 0.18554507195949554, + "learning_rate": 2.5758951381467293e-05, + "loss": 2.0585, + "step": 5077 + }, + { + "epoch": 2.190640500323485, + "grad_norm": 0.18829064071178436, + "learning_rate": 2.5733214275938965e-05, + "loss": 1.8606, + "step": 5078 + }, + { + "epoch": 2.1910718136726333, + "grad_norm": 0.20728611946105957, + "learning_rate": 2.5707487371637368e-05, + "loss": 2.187, + "step": 5079 + }, + { + "epoch": 2.1915031270217815, + "grad_norm": 0.1915161907672882, + "learning_rate": 2.568177067388961e-05, + "loss": 2.2266, + "step": 5080 + }, + { + "epoch": 2.1919344403709293, + "grad_norm": 0.20191590487957, + "learning_rate": 2.565606418802056e-05, + "loss": 2.0212, + "step": 5081 + }, + { + "epoch": 2.1923657537200776, + "grad_norm": 0.18292774260044098, + "learning_rate": 2.5630367919353024e-05, + "loss": 2.0412, + "step": 5082 + }, + { + "epoch": 2.192797067069226, + "grad_norm": 0.16870975494384766, + "learning_rate": 2.5604681873207682e-05, + "loss": 2.0399, + "step": 5083 + }, + { + "epoch": 2.193228380418374, + "grad_norm": 0.21954478323459625, + "learning_rate": 2.5579006054903168e-05, + "loss": 1.985, + "step": 5084 + }, + { + "epoch": 2.193659693767522, + "grad_norm": 0.19486844539642334, + "learning_rate": 2.5553340469755922e-05, + "loss": 2.1811, + "step": 5085 + }, + { + "epoch": 2.19409100711667, + "grad_norm": 0.2029792219400406, + "learning_rate": 2.5527685123080232e-05, + "loss": 2.189, + "step": 5086 + }, + { + "epoch": 2.1945223204658184, + "grad_norm": 0.19896060228347778, + "learning_rate": 2.550204002018838e-05, + "loss": 2.2319, + "step": 5087 + }, + { + "epoch": 2.1949536338149667, + "grad_norm": 0.1982891857624054, + "learning_rate": 2.5476405166390443e-05, + "loss": 2.1252, + "step": 5088 + }, + { + "epoch": 2.195384947164115, + "grad_norm": 0.20024412870407104, + "learning_rate": 2.54507805669944e-05, + "loss": 2.3929, + "step": 5089 + }, + { + "epoch": 2.1958162605132627, + "grad_norm": 0.1865210235118866, + "learning_rate": 2.542516622730609e-05, + "loss": 2.2408, + "step": 5090 + }, + { + "epoch": 2.196247573862411, + "grad_norm": 0.19232657551765442, + "learning_rate": 2.539956215262926e-05, + "loss": 2.0633, + "step": 5091 + }, + { + "epoch": 2.1966788872115592, + "grad_norm": 0.17881444096565247, + "learning_rate": 2.537396834826551e-05, + "loss": 1.7351, + "step": 5092 + }, + { + "epoch": 2.1971102005607075, + "grad_norm": 0.17570246756076813, + "learning_rate": 2.5348384819514296e-05, + "loss": 2.2295, + "step": 5093 + }, + { + "epoch": 2.1975415139098553, + "grad_norm": 0.19344256818294525, + "learning_rate": 2.5322811571673027e-05, + "loss": 2.1898, + "step": 5094 + }, + { + "epoch": 2.1979728272590036, + "grad_norm": 0.19992877542972565, + "learning_rate": 2.5297248610036863e-05, + "loss": 2.0582, + "step": 5095 + }, + { + "epoch": 2.198404140608152, + "grad_norm": 0.1935880184173584, + "learning_rate": 2.5271695939898902e-05, + "loss": 2.1934, + "step": 5096 + }, + { + "epoch": 2.1988354539573, + "grad_norm": 0.1977873146533966, + "learning_rate": 2.5246153566550107e-05, + "loss": 2.2696, + "step": 5097 + }, + { + "epoch": 2.1992667673064483, + "grad_norm": 0.19314149022102356, + "learning_rate": 2.5220621495279332e-05, + "loss": 2.1523, + "step": 5098 + }, + { + "epoch": 2.199698080655596, + "grad_norm": 0.19658662378787994, + "learning_rate": 2.519509973137328e-05, + "loss": 1.8741, + "step": 5099 + }, + { + "epoch": 2.2001293940047444, + "grad_norm": 0.23600396513938904, + "learning_rate": 2.5169588280116436e-05, + "loss": 2.2851, + "step": 5100 + }, + { + "epoch": 2.2001293940047444, + "eval_loss": 2.0888924598693848, + "eval_runtime": 204.5932, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 5100 + }, + { + "epoch": 2.2005607073538926, + "grad_norm": 0.21375681459903717, + "learning_rate": 2.5144087146791304e-05, + "loss": 2.239, + "step": 5101 + }, + { + "epoch": 2.200992020703041, + "grad_norm": 0.184476837515831, + "learning_rate": 2.5118596336678137e-05, + "loss": 2.1957, + "step": 5102 + }, + { + "epoch": 2.2014233340521887, + "grad_norm": 0.20138037204742432, + "learning_rate": 2.5093115855055094e-05, + "loss": 2.2173, + "step": 5103 + }, + { + "epoch": 2.201854647401337, + "grad_norm": 0.20391976833343506, + "learning_rate": 2.5067645707198182e-05, + "loss": 2.025, + "step": 5104 + }, + { + "epoch": 2.202285960750485, + "grad_norm": 0.18930117785930634, + "learning_rate": 2.5042185898381282e-05, + "loss": 2.1973, + "step": 5105 + }, + { + "epoch": 2.2027172740996335, + "grad_norm": 0.19269056618213654, + "learning_rate": 2.501673643387612e-05, + "loss": 2.0268, + "step": 5106 + }, + { + "epoch": 2.2031485874487817, + "grad_norm": 0.19102050364017487, + "learning_rate": 2.499129731895229e-05, + "loss": 2.0892, + "step": 5107 + }, + { + "epoch": 2.2035799007979295, + "grad_norm": 0.18067455291748047, + "learning_rate": 2.4965868558877235e-05, + "loss": 2.0446, + "step": 5108 + }, + { + "epoch": 2.204011214147078, + "grad_norm": 0.1875220686197281, + "learning_rate": 2.494045015891626e-05, + "loss": 1.9844, + "step": 5109 + }, + { + "epoch": 2.204442527496226, + "grad_norm": 0.18868571519851685, + "learning_rate": 2.4915042124332517e-05, + "loss": 2.0551, + "step": 5110 + }, + { + "epoch": 2.2048738408453743, + "grad_norm": 0.21553297340869904, + "learning_rate": 2.4889644460386995e-05, + "loss": 2.1265, + "step": 5111 + }, + { + "epoch": 2.205305154194522, + "grad_norm": 2.6596360206604004, + "learning_rate": 2.4864257172338642e-05, + "loss": 2.1501, + "step": 5112 + }, + { + "epoch": 2.2057364675436704, + "grad_norm": 0.21748773753643036, + "learning_rate": 2.4838880265444092e-05, + "loss": 1.5991, + "step": 5113 + }, + { + "epoch": 2.2061677808928186, + "grad_norm": 0.1903078258037567, + "learning_rate": 2.4813513744957933e-05, + "loss": 2.1071, + "step": 5114 + }, + { + "epoch": 2.206599094241967, + "grad_norm": 0.19003809988498688, + "learning_rate": 2.4788157616132555e-05, + "loss": 2.0368, + "step": 5115 + }, + { + "epoch": 2.207030407591115, + "grad_norm": 0.22406871616840363, + "learning_rate": 2.476281188421828e-05, + "loss": 1.8828, + "step": 5116 + }, + { + "epoch": 2.207461720940263, + "grad_norm": 0.18773460388183594, + "learning_rate": 2.473747655446321e-05, + "loss": 2.2092, + "step": 5117 + }, + { + "epoch": 2.207893034289411, + "grad_norm": 0.1880549192428589, + "learning_rate": 2.4712151632113238e-05, + "loss": 2.1944, + "step": 5118 + }, + { + "epoch": 2.2083243476385594, + "grad_norm": 0.1941085010766983, + "learning_rate": 2.4686837122412222e-05, + "loss": 2.1148, + "step": 5119 + }, + { + "epoch": 2.2087556609877077, + "grad_norm": 0.18133896589279175, + "learning_rate": 2.466153303060181e-05, + "loss": 1.8703, + "step": 5120 + }, + { + "epoch": 2.2091869743368555, + "grad_norm": 0.19666768610477448, + "learning_rate": 2.4636239361921464e-05, + "loss": 2.1024, + "step": 5121 + }, + { + "epoch": 2.2096182876860038, + "grad_norm": 0.20931586623191833, + "learning_rate": 2.461095612160853e-05, + "loss": 2.0844, + "step": 5122 + }, + { + "epoch": 2.210049601035152, + "grad_norm": 0.20741429924964905, + "learning_rate": 2.4585683314898177e-05, + "loss": 2.087, + "step": 5123 + }, + { + "epoch": 2.2104809143843003, + "grad_norm": 0.18422767519950867, + "learning_rate": 2.4560420947023422e-05, + "loss": 2.2925, + "step": 5124 + }, + { + "epoch": 2.2109122277334485, + "grad_norm": 0.19024157524108887, + "learning_rate": 2.4535169023215076e-05, + "loss": 1.8757, + "step": 5125 + }, + { + "epoch": 2.2109122277334485, + "eval_loss": 2.088559865951538, + "eval_runtime": 204.4158, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5125 + }, + { + "epoch": 2.2113435410825963, + "grad_norm": 0.1930663138628006, + "learning_rate": 2.450992754870191e-05, + "loss": 2.0434, + "step": 5126 + }, + { + "epoch": 2.2117748544317446, + "grad_norm": 0.17984342575073242, + "learning_rate": 2.4484696528710368e-05, + "loss": 2.0912, + "step": 5127 + }, + { + "epoch": 2.212206167780893, + "grad_norm": 0.27934229373931885, + "learning_rate": 2.4459475968464838e-05, + "loss": 2.2206, + "step": 5128 + }, + { + "epoch": 2.212637481130041, + "grad_norm": 0.18810495734214783, + "learning_rate": 2.443426587318748e-05, + "loss": 2.1079, + "step": 5129 + }, + { + "epoch": 2.213068794479189, + "grad_norm": 0.19722598791122437, + "learning_rate": 2.4409066248098383e-05, + "loss": 2.1985, + "step": 5130 + }, + { + "epoch": 2.213500107828337, + "grad_norm": 0.1870979517698288, + "learning_rate": 2.4383877098415395e-05, + "loss": 2.1231, + "step": 5131 + }, + { + "epoch": 2.2139314211774854, + "grad_norm": 0.20809242129325867, + "learning_rate": 2.4358698429354125e-05, + "loss": 1.837, + "step": 5132 + }, + { + "epoch": 2.2143627345266337, + "grad_norm": 0.18958339095115662, + "learning_rate": 2.4333530246128183e-05, + "loss": 2.2733, + "step": 5133 + }, + { + "epoch": 2.214794047875782, + "grad_norm": 0.1951441764831543, + "learning_rate": 2.430837255394888e-05, + "loss": 2.177, + "step": 5134 + }, + { + "epoch": 2.2152253612249297, + "grad_norm": 0.18995796144008636, + "learning_rate": 2.4283225358025397e-05, + "loss": 2.0873, + "step": 5135 + }, + { + "epoch": 2.215656674574078, + "grad_norm": 0.20475205779075623, + "learning_rate": 2.4258088663564724e-05, + "loss": 2.1105, + "step": 5136 + }, + { + "epoch": 2.2160879879232263, + "grad_norm": 0.19895505905151367, + "learning_rate": 2.4232962475771704e-05, + "loss": 2.1126, + "step": 5137 + }, + { + "epoch": 2.2165193012723745, + "grad_norm": 0.19223228096961975, + "learning_rate": 2.4207846799848985e-05, + "loss": 1.8898, + "step": 5138 + }, + { + "epoch": 2.2169506146215223, + "grad_norm": 0.19018810987472534, + "learning_rate": 2.418274164099701e-05, + "loss": 2.0493, + "step": 5139 + }, + { + "epoch": 2.2173819279706706, + "grad_norm": 0.19304291903972626, + "learning_rate": 2.415764700441416e-05, + "loss": 2.143, + "step": 5140 + }, + { + "epoch": 2.217813241319819, + "grad_norm": 0.2021966278553009, + "learning_rate": 2.4132562895296474e-05, + "loss": 2.301, + "step": 5141 + }, + { + "epoch": 2.218244554668967, + "grad_norm": 0.1942233294248581, + "learning_rate": 2.410748931883792e-05, + "loss": 2.1455, + "step": 5142 + }, + { + "epoch": 2.2186758680181153, + "grad_norm": 0.20182685554027557, + "learning_rate": 2.408242628023022e-05, + "loss": 2.3461, + "step": 5143 + }, + { + "epoch": 2.219107181367263, + "grad_norm": 0.1861187219619751, + "learning_rate": 2.4057373784663015e-05, + "loss": 2.2172, + "step": 5144 + }, + { + "epoch": 2.2195384947164114, + "grad_norm": 0.19665570557117462, + "learning_rate": 2.403233183732369e-05, + "loss": 2.1557, + "step": 5145 + }, + { + "epoch": 2.2199698080655597, + "grad_norm": 0.20461268723011017, + "learning_rate": 2.4007300443397384e-05, + "loss": 2.0571, + "step": 5146 + }, + { + "epoch": 2.220401121414708, + "grad_norm": 0.1917303204536438, + "learning_rate": 2.3982279608067194e-05, + "loss": 2.1758, + "step": 5147 + }, + { + "epoch": 2.2208324347638557, + "grad_norm": 0.18870019912719727, + "learning_rate": 2.395726933651393e-05, + "loss": 2.1837, + "step": 5148 + }, + { + "epoch": 2.221263748113004, + "grad_norm": 0.20907212793827057, + "learning_rate": 2.3932269633916235e-05, + "loss": 2.2427, + "step": 5149 + }, + { + "epoch": 2.2216950614621522, + "grad_norm": 0.1932532638311386, + "learning_rate": 2.3907280505450593e-05, + "loss": 2.2975, + "step": 5150 + }, + { + "epoch": 2.2216950614621522, + "eval_loss": 2.088294267654419, + "eval_runtime": 204.373, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5150 + }, + { + "epoch": 2.2221263748113005, + "grad_norm": 0.1814073622226715, + "learning_rate": 2.388230195629125e-05, + "loss": 1.9633, + "step": 5151 + }, + { + "epoch": 2.2225576881604487, + "grad_norm": 0.2006046324968338, + "learning_rate": 2.3857333991610307e-05, + "loss": 2.2333, + "step": 5152 + }, + { + "epoch": 2.2229890015095966, + "grad_norm": 0.20529131591320038, + "learning_rate": 2.383237661657765e-05, + "loss": 2.3329, + "step": 5153 + }, + { + "epoch": 2.223420314858745, + "grad_norm": 0.20727939903736115, + "learning_rate": 2.3807429836360974e-05, + "loss": 2.2073, + "step": 5154 + }, + { + "epoch": 2.223851628207893, + "grad_norm": 0.19187138974666595, + "learning_rate": 2.3782493656125784e-05, + "loss": 2.1682, + "step": 5155 + }, + { + "epoch": 2.2242829415570413, + "grad_norm": 0.1747562140226364, + "learning_rate": 2.375756808103538e-05, + "loss": 2.1047, + "step": 5156 + }, + { + "epoch": 2.224714254906189, + "grad_norm": 0.2382543832063675, + "learning_rate": 2.373265311625087e-05, + "loss": 1.7174, + "step": 5157 + }, + { + "epoch": 2.2251455682553374, + "grad_norm": 0.19224894046783447, + "learning_rate": 2.370774876693122e-05, + "loss": 2.1815, + "step": 5158 + }, + { + "epoch": 2.2255768816044856, + "grad_norm": 0.22981807589530945, + "learning_rate": 2.3682855038233092e-05, + "loss": 2.1429, + "step": 5159 + }, + { + "epoch": 2.226008194953634, + "grad_norm": 0.19596794247627258, + "learning_rate": 2.365797193531102e-05, + "loss": 2.0008, + "step": 5160 + }, + { + "epoch": 2.226439508302782, + "grad_norm": 0.2190484255552292, + "learning_rate": 2.3633099463317305e-05, + "loss": 2.1318, + "step": 5161 + }, + { + "epoch": 2.22687082165193, + "grad_norm": 0.19309651851654053, + "learning_rate": 2.3608237627402105e-05, + "loss": 2.1676, + "step": 5162 + }, + { + "epoch": 2.227302135001078, + "grad_norm": 0.20434337854385376, + "learning_rate": 2.3583386432713337e-05, + "loss": 2.2481, + "step": 5163 + }, + { + "epoch": 2.2277334483502265, + "grad_norm": 0.19345074892044067, + "learning_rate": 2.355854588439665e-05, + "loss": 2.238, + "step": 5164 + }, + { + "epoch": 2.2281647616993747, + "grad_norm": 0.18643070757389069, + "learning_rate": 2.353371598759561e-05, + "loss": 2.1621, + "step": 5165 + }, + { + "epoch": 2.2285960750485225, + "grad_norm": 0.182948037981987, + "learning_rate": 2.3508896747451505e-05, + "loss": 2.2342, + "step": 5166 + }, + { + "epoch": 2.229027388397671, + "grad_norm": 0.2127479761838913, + "learning_rate": 2.3484088169103428e-05, + "loss": 2.2865, + "step": 5167 + }, + { + "epoch": 2.229458701746819, + "grad_norm": 0.1867491900920868, + "learning_rate": 2.3459290257688256e-05, + "loss": 2.1871, + "step": 5168 + }, + { + "epoch": 2.2298900150959673, + "grad_norm": 0.20324353873729706, + "learning_rate": 2.343450301834068e-05, + "loss": 2.2684, + "step": 5169 + }, + { + "epoch": 2.2303213284451155, + "grad_norm": 0.21033866703510284, + "learning_rate": 2.340972645619317e-05, + "loss": 2.2214, + "step": 5170 + }, + { + "epoch": 2.2307526417942634, + "grad_norm": 0.21309709548950195, + "learning_rate": 2.3384960576375947e-05, + "loss": 2.328, + "step": 5171 + }, + { + "epoch": 2.2311839551434116, + "grad_norm": 0.1905982345342636, + "learning_rate": 2.3360205384017143e-05, + "loss": 2.0782, + "step": 5172 + }, + { + "epoch": 2.23161526849256, + "grad_norm": 0.19004592299461365, + "learning_rate": 2.3335460884242506e-05, + "loss": 2.2175, + "step": 5173 + }, + { + "epoch": 2.232046581841708, + "grad_norm": 0.2126663327217102, + "learning_rate": 2.3310727082175695e-05, + "loss": 2.0913, + "step": 5174 + }, + { + "epoch": 2.232477895190856, + "grad_norm": 0.17693957686424255, + "learning_rate": 2.3286003982938074e-05, + "loss": 2.0777, + "step": 5175 + }, + { + "epoch": 2.232477895190856, + "eval_loss": 2.088388442993164, + "eval_runtime": 207.049, + "eval_samples_per_second": 0.155, + "eval_steps_per_second": 0.155, + "step": 5175 + }, + { + "epoch": 2.232909208540004, + "grad_norm": 0.18512453138828278, + "learning_rate": 2.326129159164889e-05, + "loss": 2.118, + "step": 5176 + }, + { + "epoch": 2.2333405218891524, + "grad_norm": 0.1941794455051422, + "learning_rate": 2.3236589913425118e-05, + "loss": 2.0438, + "step": 5177 + }, + { + "epoch": 2.2337718352383007, + "grad_norm": 0.20930613577365875, + "learning_rate": 2.3211898953381422e-05, + "loss": 2.0802, + "step": 5178 + }, + { + "epoch": 2.234203148587449, + "grad_norm": 0.18975000083446503, + "learning_rate": 2.3187218716630422e-05, + "loss": 2.1545, + "step": 5179 + }, + { + "epoch": 2.2346344619365968, + "grad_norm": 0.19006623327732086, + "learning_rate": 2.31625492082824e-05, + "loss": 2.131, + "step": 5180 + }, + { + "epoch": 2.235065775285745, + "grad_norm": 0.2047644406557083, + "learning_rate": 2.3137890433445448e-05, + "loss": 2.2678, + "step": 5181 + }, + { + "epoch": 2.2354970886348933, + "grad_norm": 0.49673357605934143, + "learning_rate": 2.3113242397225437e-05, + "loss": 2.162, + "step": 5182 + }, + { + "epoch": 2.2359284019840415, + "grad_norm": 0.18441151082515717, + "learning_rate": 2.3088605104726017e-05, + "loss": 2.0678, + "step": 5183 + }, + { + "epoch": 2.2363597153331893, + "grad_norm": 0.19082053005695343, + "learning_rate": 2.3063978561048594e-05, + "loss": 1.9763, + "step": 5184 + }, + { + "epoch": 2.2367910286823376, + "grad_norm": 0.190688356757164, + "learning_rate": 2.3039362771292374e-05, + "loss": 2.1109, + "step": 5185 + }, + { + "epoch": 2.237222342031486, + "grad_norm": 0.21213744580745697, + "learning_rate": 2.301475774055433e-05, + "loss": 2.3109, + "step": 5186 + }, + { + "epoch": 2.237653655380634, + "grad_norm": 0.19175684452056885, + "learning_rate": 2.29901634739292e-05, + "loss": 2.1973, + "step": 5187 + }, + { + "epoch": 2.2380849687297824, + "grad_norm": 0.20844250917434692, + "learning_rate": 2.2965579976509484e-05, + "loss": 2.0611, + "step": 5188 + }, + { + "epoch": 2.23851628207893, + "grad_norm": 0.19638632237911224, + "learning_rate": 2.2941007253385453e-05, + "loss": 2.1502, + "step": 5189 + }, + { + "epoch": 2.2389475954280784, + "grad_norm": 0.2016148567199707, + "learning_rate": 2.291644530964522e-05, + "loss": 2.3361, + "step": 5190 + }, + { + "epoch": 2.2393789087772267, + "grad_norm": 0.20059362053871155, + "learning_rate": 2.289189415037454e-05, + "loss": 2.2438, + "step": 5191 + }, + { + "epoch": 2.239810222126375, + "grad_norm": 0.2173454612493515, + "learning_rate": 2.2867353780657e-05, + "loss": 2.2661, + "step": 5192 + }, + { + "epoch": 2.2402415354755227, + "grad_norm": 0.19429714977741241, + "learning_rate": 2.2842824205574002e-05, + "loss": 2.1804, + "step": 5193 + }, + { + "epoch": 2.240672848824671, + "grad_norm": 0.19609315693378448, + "learning_rate": 2.2818305430204624e-05, + "loss": 2.1481, + "step": 5194 + }, + { + "epoch": 2.2411041621738192, + "grad_norm": 0.2272697538137436, + "learning_rate": 2.2793797459625783e-05, + "loss": 2.2735, + "step": 5195 + }, + { + "epoch": 2.2415354755229675, + "grad_norm": 0.18201479315757751, + "learning_rate": 2.2769300298912044e-05, + "loss": 2.1451, + "step": 5196 + }, + { + "epoch": 2.2419667888721158, + "grad_norm": 0.19466306269168854, + "learning_rate": 2.2744813953135894e-05, + "loss": 2.1083, + "step": 5197 + }, + { + "epoch": 2.2423981022212636, + "grad_norm": 0.19931024312973022, + "learning_rate": 2.2720338427367465e-05, + "loss": 2.1513, + "step": 5198 + }, + { + "epoch": 2.242829415570412, + "grad_norm": 0.2232944369316101, + "learning_rate": 2.269587372667468e-05, + "loss": 2.1877, + "step": 5199 + }, + { + "epoch": 2.24326072891956, + "grad_norm": 0.1860596090555191, + "learning_rate": 2.2671419856123233e-05, + "loss": 2.2027, + "step": 5200 + }, + { + "epoch": 2.24326072891956, + "eval_loss": 2.0881733894348145, + "eval_runtime": 203.8998, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5200 + }, + { + "epoch": 2.2436920422687083, + "grad_norm": 0.2147083878517151, + "learning_rate": 2.2646976820776565e-05, + "loss": 2.3301, + "step": 5201 + }, + { + "epoch": 2.244123355617856, + "grad_norm": 0.20780740678310394, + "learning_rate": 2.2622544625695858e-05, + "loss": 2.2091, + "step": 5202 + }, + { + "epoch": 2.2445546689670044, + "grad_norm": 0.18065021932125092, + "learning_rate": 2.2598123275940067e-05, + "loss": 2.0219, + "step": 5203 + }, + { + "epoch": 2.2449859823161527, + "grad_norm": 0.19713814556598663, + "learning_rate": 2.257371277656595e-05, + "loss": 1.9993, + "step": 5204 + }, + { + "epoch": 2.245417295665301, + "grad_norm": 0.21324028074741364, + "learning_rate": 2.25493131326279e-05, + "loss": 2.0939, + "step": 5205 + }, + { + "epoch": 2.245848609014449, + "grad_norm": 0.20643126964569092, + "learning_rate": 2.252492434917816e-05, + "loss": 2.1377, + "step": 5206 + }, + { + "epoch": 2.246279922363597, + "grad_norm": 0.2044958770275116, + "learning_rate": 2.250054643126666e-05, + "loss": 2.2487, + "step": 5207 + }, + { + "epoch": 2.2467112357127452, + "grad_norm": 0.19290834665298462, + "learning_rate": 2.247617938394118e-05, + "loss": 2.0619, + "step": 5208 + }, + { + "epoch": 2.2471425490618935, + "grad_norm": 0.1928108036518097, + "learning_rate": 2.2451823212247165e-05, + "loss": 2.1128, + "step": 5209 + }, + { + "epoch": 2.2475738624110417, + "grad_norm": 0.20691651105880737, + "learning_rate": 2.2427477921227774e-05, + "loss": 2.2613, + "step": 5210 + }, + { + "epoch": 2.2480051757601895, + "grad_norm": 0.20217694342136383, + "learning_rate": 2.2403143515924017e-05, + "loss": 2.1959, + "step": 5211 + }, + { + "epoch": 2.248436489109338, + "grad_norm": 0.19633910059928894, + "learning_rate": 2.2378820001374587e-05, + "loss": 2.0647, + "step": 5212 + }, + { + "epoch": 2.248867802458486, + "grad_norm": 0.1810738444328308, + "learning_rate": 2.2354507382615933e-05, + "loss": 2.0933, + "step": 5213 + }, + { + "epoch": 2.2492991158076343, + "grad_norm": 0.21027354896068573, + "learning_rate": 2.2330205664682246e-05, + "loss": 2.051, + "step": 5214 + }, + { + "epoch": 2.2497304291567826, + "grad_norm": 0.1850845068693161, + "learning_rate": 2.230591485260546e-05, + "loss": 2.1125, + "step": 5215 + }, + { + "epoch": 2.2501617425059304, + "grad_norm": 0.22473980486392975, + "learning_rate": 2.228163495141526e-05, + "loss": 2.1118, + "step": 5216 + }, + { + "epoch": 2.2505930558550786, + "grad_norm": 0.22738492488861084, + "learning_rate": 2.2257365966139043e-05, + "loss": 1.7314, + "step": 5217 + }, + { + "epoch": 2.251024369204227, + "grad_norm": 0.19673076272010803, + "learning_rate": 2.2233107901802028e-05, + "loss": 2.1071, + "step": 5218 + }, + { + "epoch": 2.251455682553375, + "grad_norm": 0.20075039565563202, + "learning_rate": 2.220886076342705e-05, + "loss": 1.973, + "step": 5219 + }, + { + "epoch": 2.251886995902523, + "grad_norm": 0.17662428319454193, + "learning_rate": 2.218462455603475e-05, + "loss": 2.1947, + "step": 5220 + }, + { + "epoch": 2.252318309251671, + "grad_norm": 0.19919851422309875, + "learning_rate": 2.2160399284643514e-05, + "loss": 2.1702, + "step": 5221 + }, + { + "epoch": 2.2527496226008195, + "grad_norm": 0.17526623606681824, + "learning_rate": 2.2136184954269456e-05, + "loss": 2.0352, + "step": 5222 + }, + { + "epoch": 2.2531809359499677, + "grad_norm": 0.20351293683052063, + "learning_rate": 2.211198156992645e-05, + "loss": 2.384, + "step": 5223 + }, + { + "epoch": 2.253612249299116, + "grad_norm": 0.19407595694065094, + "learning_rate": 2.2087789136625973e-05, + "loss": 2.0531, + "step": 5224 + }, + { + "epoch": 2.2540435626482638, + "grad_norm": 0.19197069108486176, + "learning_rate": 2.206360765937743e-05, + "loss": 2.1488, + "step": 5225 + }, + { + "epoch": 2.2540435626482638, + "eval_loss": 2.0879340171813965, + "eval_runtime": 203.9602, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5225 + }, + { + "epoch": 2.254474875997412, + "grad_norm": 0.19059327244758606, + "learning_rate": 2.2039437143187812e-05, + "loss": 2.1152, + "step": 5226 + }, + { + "epoch": 2.2549061893465603, + "grad_norm": 0.19949564337730408, + "learning_rate": 2.2015277593061908e-05, + "loss": 2.1029, + "step": 5227 + }, + { + "epoch": 2.2553375026957085, + "grad_norm": 0.190524160861969, + "learning_rate": 2.1991129014002203e-05, + "loss": 2.0127, + "step": 5228 + }, + { + "epoch": 2.2557688160448564, + "grad_norm": 0.18100754916667938, + "learning_rate": 2.1966991411008938e-05, + "loss": 1.8489, + "step": 5229 + }, + { + "epoch": 2.2562001293940046, + "grad_norm": 0.21011635661125183, + "learning_rate": 2.1942864789080055e-05, + "loss": 2.23, + "step": 5230 + }, + { + "epoch": 2.256631442743153, + "grad_norm": 0.18916666507720947, + "learning_rate": 2.1918749153211243e-05, + "loss": 2.0753, + "step": 5231 + }, + { + "epoch": 2.257062756092301, + "grad_norm": 0.18899627029895782, + "learning_rate": 2.1894644508395906e-05, + "loss": 2.1395, + "step": 5232 + }, + { + "epoch": 2.2574940694414494, + "grad_norm": 0.20524995028972626, + "learning_rate": 2.1870550859625176e-05, + "loss": 2.226, + "step": 5233 + }, + { + "epoch": 2.257925382790597, + "grad_norm": 0.20154394209384918, + "learning_rate": 2.1846468211887898e-05, + "loss": 2.0271, + "step": 5234 + }, + { + "epoch": 2.2583566961397454, + "grad_norm": 0.20304295420646667, + "learning_rate": 2.1822396570170642e-05, + "loss": 2.0985, + "step": 5235 + }, + { + "epoch": 2.2587880094888937, + "grad_norm": 0.18911685049533844, + "learning_rate": 2.179833593945775e-05, + "loss": 2.1121, + "step": 5236 + }, + { + "epoch": 2.259219322838042, + "grad_norm": 0.20598070323467255, + "learning_rate": 2.1774286324731193e-05, + "loss": 1.992, + "step": 5237 + }, + { + "epoch": 2.2596506361871898, + "grad_norm": 0.19132232666015625, + "learning_rate": 2.175024773097069e-05, + "loss": 2.0416, + "step": 5238 + }, + { + "epoch": 2.260081949536338, + "grad_norm": 0.18360275030136108, + "learning_rate": 2.172622016315374e-05, + "loss": 2.1088, + "step": 5239 + }, + { + "epoch": 2.2605132628854863, + "grad_norm": 0.23842813074588776, + "learning_rate": 2.170220362625551e-05, + "loss": 2.1172, + "step": 5240 + }, + { + "epoch": 2.2609445762346345, + "grad_norm": 0.22701770067214966, + "learning_rate": 2.1678198125248895e-05, + "loss": 2.3083, + "step": 5241 + }, + { + "epoch": 2.2613758895837828, + "grad_norm": 0.18772007524967194, + "learning_rate": 2.165420366510442e-05, + "loss": 2.1169, + "step": 5242 + }, + { + "epoch": 2.2618072029329306, + "grad_norm": 0.9131419062614441, + "learning_rate": 2.1630220250790487e-05, + "loss": 2.2628, + "step": 5243 + }, + { + "epoch": 2.262238516282079, + "grad_norm": 0.213560551404953, + "learning_rate": 2.160624788727309e-05, + "loss": 2.1191, + "step": 5244 + }, + { + "epoch": 2.262669829631227, + "grad_norm": 0.1981581449508667, + "learning_rate": 2.1582286579515975e-05, + "loss": 2.2597, + "step": 5245 + }, + { + "epoch": 2.2631011429803753, + "grad_norm": 0.19422860443592072, + "learning_rate": 2.1558336332480593e-05, + "loss": 2.2362, + "step": 5246 + }, + { + "epoch": 2.263532456329523, + "grad_norm": 0.3238939046859741, + "learning_rate": 2.1534397151126094e-05, + "loss": 2.2472, + "step": 5247 + }, + { + "epoch": 2.2639637696786714, + "grad_norm": 0.19412267208099365, + "learning_rate": 2.1510469040409355e-05, + "loss": 2.1139, + "step": 5248 + }, + { + "epoch": 2.2643950830278197, + "grad_norm": 0.1949174553155899, + "learning_rate": 2.1486552005284934e-05, + "loss": 2.297, + "step": 5249 + }, + { + "epoch": 2.264826396376968, + "grad_norm": 0.20700618624687195, + "learning_rate": 2.146264605070518e-05, + "loss": 2.2371, + "step": 5250 + }, + { + "epoch": 2.264826396376968, + "eval_loss": 2.087846279144287, + "eval_runtime": 204.2358, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5250 + }, + { + "epoch": 2.265257709726116, + "grad_norm": 0.19957131147384644, + "learning_rate": 2.1438751181620013e-05, + "loss": 2.1992, + "step": 5251 + }, + { + "epoch": 2.265689023075264, + "grad_norm": 0.20005294680595398, + "learning_rate": 2.1414867402977118e-05, + "loss": 2.3806, + "step": 5252 + }, + { + "epoch": 2.2661203364244122, + "grad_norm": 0.19896568357944489, + "learning_rate": 2.1390994719721957e-05, + "loss": 2.1262, + "step": 5253 + }, + { + "epoch": 2.2665516497735605, + "grad_norm": 0.1914672702550888, + "learning_rate": 2.1367133136797594e-05, + "loss": 1.9649, + "step": 5254 + }, + { + "epoch": 2.2669829631227087, + "grad_norm": 0.20051509141921997, + "learning_rate": 2.1343282659144855e-05, + "loss": 2.0603, + "step": 5255 + }, + { + "epoch": 2.2674142764718566, + "grad_norm": 0.2132277637720108, + "learning_rate": 2.1319443291702178e-05, + "loss": 2.3511, + "step": 5256 + }, + { + "epoch": 2.267845589821005, + "grad_norm": 0.1911388635635376, + "learning_rate": 2.129561503940583e-05, + "loss": 2.0434, + "step": 5257 + }, + { + "epoch": 2.268276903170153, + "grad_norm": 0.19434180855751038, + "learning_rate": 2.12717979071897e-05, + "loss": 2.23, + "step": 5258 + }, + { + "epoch": 2.2687082165193013, + "grad_norm": 0.19305378198623657, + "learning_rate": 2.124799189998536e-05, + "loss": 2.3032, + "step": 5259 + }, + { + "epoch": 2.2691395298684496, + "grad_norm": 0.23718416690826416, + "learning_rate": 2.1224197022722124e-05, + "loss": 2.1086, + "step": 5260 + }, + { + "epoch": 2.269570843217598, + "grad_norm": 0.18516016006469727, + "learning_rate": 2.120041328032698e-05, + "loss": 2.1842, + "step": 5261 + }, + { + "epoch": 2.2700021565667456, + "grad_norm": 0.18666745722293854, + "learning_rate": 2.1176640677724603e-05, + "loss": 2.1297, + "step": 5262 + }, + { + "epoch": 2.270433469915894, + "grad_norm": 0.27829509973526, + "learning_rate": 2.1152879219837357e-05, + "loss": 2.0394, + "step": 5263 + }, + { + "epoch": 2.270864783265042, + "grad_norm": 0.22595706582069397, + "learning_rate": 2.1129128911585376e-05, + "loss": 2.1602, + "step": 5264 + }, + { + "epoch": 2.27129609661419, + "grad_norm": 0.20288696885108948, + "learning_rate": 2.1105389757886344e-05, + "loss": 2.1171, + "step": 5265 + }, + { + "epoch": 2.271727409963338, + "grad_norm": 0.2027527093887329, + "learning_rate": 2.1081661763655756e-05, + "loss": 2.1885, + "step": 5266 + }, + { + "epoch": 2.2721587233124865, + "grad_norm": 0.1916697472333908, + "learning_rate": 2.1057944933806717e-05, + "loss": 2.0902, + "step": 5267 + }, + { + "epoch": 2.2725900366616347, + "grad_norm": 0.2065373808145523, + "learning_rate": 2.1034239273250094e-05, + "loss": 2.231, + "step": 5268 + }, + { + "epoch": 2.273021350010783, + "grad_norm": 0.436259388923645, + "learning_rate": 2.101054478689442e-05, + "loss": 2.1565, + "step": 5269 + }, + { + "epoch": 2.2734526633599312, + "grad_norm": 0.21986541152000427, + "learning_rate": 2.098686147964582e-05, + "loss": 2.0916, + "step": 5270 + }, + { + "epoch": 2.273883976709079, + "grad_norm": 0.2018526792526245, + "learning_rate": 2.096318935640825e-05, + "loss": 2.1801, + "step": 5271 + }, + { + "epoch": 2.2743152900582273, + "grad_norm": 0.19900456070899963, + "learning_rate": 2.0939528422083258e-05, + "loss": 2.2055, + "step": 5272 + }, + { + "epoch": 2.2747466034073756, + "grad_norm": 2.7183778285980225, + "learning_rate": 2.0915878681570107e-05, + "loss": 2.0523, + "step": 5273 + }, + { + "epoch": 2.2751779167565234, + "grad_norm": 0.18772058188915253, + "learning_rate": 2.0892240139765724e-05, + "loss": 2.1065, + "step": 5274 + }, + { + "epoch": 2.2756092301056716, + "grad_norm": 0.2923305630683899, + "learning_rate": 2.0868612801564735e-05, + "loss": 2.213, + "step": 5275 + }, + { + "epoch": 2.2756092301056716, + "eval_loss": 2.088007926940918, + "eval_runtime": 203.9558, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5275 + }, + { + "epoch": 2.27604054345482, + "grad_norm": 0.18926513195037842, + "learning_rate": 2.0844996671859436e-05, + "loss": 2.1041, + "step": 5276 + }, + { + "epoch": 2.276471856803968, + "grad_norm": 0.1900864690542221, + "learning_rate": 2.08213917555398e-05, + "loss": 2.1807, + "step": 5277 + }, + { + "epoch": 2.2769031701531164, + "grad_norm": 0.21080556511878967, + "learning_rate": 2.07977980574935e-05, + "loss": 2.1454, + "step": 5278 + }, + { + "epoch": 2.2773344835022646, + "grad_norm": 0.20365285873413086, + "learning_rate": 2.0774215582605852e-05, + "loss": 2.0932, + "step": 5279 + }, + { + "epoch": 2.2777657968514124, + "grad_norm": 0.2221464365720749, + "learning_rate": 2.0750644335759867e-05, + "loss": 2.1837, + "step": 5280 + }, + { + "epoch": 2.2781971102005607, + "grad_norm": 0.17998287081718445, + "learning_rate": 2.0727084321836206e-05, + "loss": 2.2303, + "step": 5281 + }, + { + "epoch": 2.278628423549709, + "grad_norm": 0.18822330236434937, + "learning_rate": 2.070353554571331e-05, + "loss": 2.0179, + "step": 5282 + }, + { + "epoch": 2.279059736898857, + "grad_norm": 0.21984271705150604, + "learning_rate": 2.067999801226713e-05, + "loss": 2.2341, + "step": 5283 + }, + { + "epoch": 2.279491050248005, + "grad_norm": 0.20923283696174622, + "learning_rate": 2.0656471726371378e-05, + "loss": 2.2797, + "step": 5284 + }, + { + "epoch": 2.2799223635971533, + "grad_norm": 0.19424287974834442, + "learning_rate": 2.0632956692897457e-05, + "loss": 2.2956, + "step": 5285 + }, + { + "epoch": 2.2803536769463015, + "grad_norm": 0.19724376499652863, + "learning_rate": 2.0609452916714402e-05, + "loss": 2.3226, + "step": 5286 + }, + { + "epoch": 2.28078499029545, + "grad_norm": 0.2070743590593338, + "learning_rate": 2.0585960402688956e-05, + "loss": 2.0051, + "step": 5287 + }, + { + "epoch": 2.281216303644598, + "grad_norm": 0.21037553250789642, + "learning_rate": 2.0562479155685422e-05, + "loss": 1.9465, + "step": 5288 + }, + { + "epoch": 2.281647616993746, + "grad_norm": 0.19721102714538574, + "learning_rate": 2.053900918056592e-05, + "loss": 2.4951, + "step": 5289 + }, + { + "epoch": 2.282078930342894, + "grad_norm": 0.20401695370674133, + "learning_rate": 2.0515550482190145e-05, + "loss": 2.3082, + "step": 5290 + }, + { + "epoch": 2.2825102436920424, + "grad_norm": 0.197083979845047, + "learning_rate": 2.0492103065415465e-05, + "loss": 2.41, + "step": 5291 + }, + { + "epoch": 2.2829415570411906, + "grad_norm": 0.1975667029619217, + "learning_rate": 2.0468666935096943e-05, + "loss": 2.0927, + "step": 5292 + }, + { + "epoch": 2.2833728703903384, + "grad_norm": 0.18257242441177368, + "learning_rate": 2.0445242096087272e-05, + "loss": 1.9528, + "step": 5293 + }, + { + "epoch": 2.2838041837394867, + "grad_norm": 0.20901605486869812, + "learning_rate": 2.0421828553236833e-05, + "loss": 1.9898, + "step": 5294 + }, + { + "epoch": 2.284235497088635, + "grad_norm": 0.19688163697719574, + "learning_rate": 2.0398426311393613e-05, + "loss": 2.0778, + "step": 5295 + }, + { + "epoch": 2.284666810437783, + "grad_norm": 0.21484370529651642, + "learning_rate": 2.0375035375403393e-05, + "loss": 2.1694, + "step": 5296 + }, + { + "epoch": 2.2850981237869314, + "grad_norm": 0.2258901596069336, + "learning_rate": 2.035165575010944e-05, + "loss": 2.1767, + "step": 5297 + }, + { + "epoch": 2.2855294371360793, + "grad_norm": 0.20959137380123138, + "learning_rate": 2.0328287440352772e-05, + "loss": 2.2924, + "step": 5298 + }, + { + "epoch": 2.2859607504852275, + "grad_norm": 0.1976427137851715, + "learning_rate": 2.030493045097209e-05, + "loss": 2.2265, + "step": 5299 + }, + { + "epoch": 2.2863920638343758, + "grad_norm": 0.1976931393146515, + "learning_rate": 2.0281584786803686e-05, + "loss": 2.1814, + "step": 5300 + }, + { + "epoch": 2.2863920638343758, + "eval_loss": 2.087761640548706, + "eval_runtime": 203.8094, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5300 + }, + { + "epoch": 2.286823377183524, + "grad_norm": 0.19544154405593872, + "learning_rate": 2.0258250452681572e-05, + "loss": 2.2168, + "step": 5301 + }, + { + "epoch": 2.287254690532672, + "grad_norm": 0.2057940512895584, + "learning_rate": 2.02349274534373e-05, + "loss": 2.2899, + "step": 5302 + }, + { + "epoch": 2.28768600388182, + "grad_norm": 0.1918468326330185, + "learning_rate": 2.0211615793900223e-05, + "loss": 2.1694, + "step": 5303 + }, + { + "epoch": 2.2881173172309683, + "grad_norm": 0.19360606372356415, + "learning_rate": 2.018831547889725e-05, + "loss": 2.2603, + "step": 5304 + }, + { + "epoch": 2.2885486305801166, + "grad_norm": 0.19668936729431152, + "learning_rate": 2.016502651325297e-05, + "loss": 2.1752, + "step": 5305 + }, + { + "epoch": 2.288979943929265, + "grad_norm": 0.19086293876171112, + "learning_rate": 2.0141748901789608e-05, + "loss": 1.9608, + "step": 5306 + }, + { + "epoch": 2.2894112572784127, + "grad_norm": 0.19300664961338043, + "learning_rate": 2.0118482649327063e-05, + "loss": 2.2325, + "step": 5307 + }, + { + "epoch": 2.289842570627561, + "grad_norm": 0.17769969999790192, + "learning_rate": 2.0095227760682862e-05, + "loss": 2.0467, + "step": 5308 + }, + { + "epoch": 2.290273883976709, + "grad_norm": 0.2028777003288269, + "learning_rate": 2.0071984240672174e-05, + "loss": 2.0266, + "step": 5309 + }, + { + "epoch": 2.2907051973258574, + "grad_norm": 0.1918572634458542, + "learning_rate": 2.0048752094107833e-05, + "loss": 2.0798, + "step": 5310 + }, + { + "epoch": 2.2911365106750052, + "grad_norm": 0.20298618078231812, + "learning_rate": 2.0025531325800315e-05, + "loss": 2.2868, + "step": 5311 + }, + { + "epoch": 2.2915678240241535, + "grad_norm": 0.20322071015834808, + "learning_rate": 2.0002321940557718e-05, + "loss": 2.1486, + "step": 5312 + }, + { + "epoch": 2.2919991373733017, + "grad_norm": 0.2008322775363922, + "learning_rate": 1.997912394318579e-05, + "loss": 2.2418, + "step": 5313 + }, + { + "epoch": 2.29243045072245, + "grad_norm": 0.21370355784893036, + "learning_rate": 1.9955937338487994e-05, + "loss": 2.282, + "step": 5314 + }, + { + "epoch": 2.2928617640715983, + "grad_norm": 0.18719002604484558, + "learning_rate": 1.9932762131265295e-05, + "loss": 2.1381, + "step": 5315 + }, + { + "epoch": 2.293293077420746, + "grad_norm": 0.19345025718212128, + "learning_rate": 1.990959832631638e-05, + "loss": 2.295, + "step": 5316 + }, + { + "epoch": 2.2937243907698943, + "grad_norm": 0.21962137520313263, + "learning_rate": 1.988644592843762e-05, + "loss": 2.0284, + "step": 5317 + }, + { + "epoch": 2.2941557041190426, + "grad_norm": 0.18697109818458557, + "learning_rate": 1.9863304942422932e-05, + "loss": 2.0799, + "step": 5318 + }, + { + "epoch": 2.294587017468191, + "grad_norm": 0.18803496658802032, + "learning_rate": 1.9840175373063947e-05, + "loss": 2.1393, + "step": 5319 + }, + { + "epoch": 2.2950183308173386, + "grad_norm": 0.1989171802997589, + "learning_rate": 1.981705722514983e-05, + "loss": 2.0179, + "step": 5320 + }, + { + "epoch": 2.295449644166487, + "grad_norm": 0.19127577543258667, + "learning_rate": 1.9793950503467504e-05, + "loss": 2.3421, + "step": 5321 + }, + { + "epoch": 2.295880957515635, + "grad_norm": 0.19992578029632568, + "learning_rate": 1.9770855212801444e-05, + "loss": 2.2596, + "step": 5322 + }, + { + "epoch": 2.2963122708647834, + "grad_norm": 0.807466447353363, + "learning_rate": 1.9747771357933794e-05, + "loss": 2.0612, + "step": 5323 + }, + { + "epoch": 2.2967435842139317, + "grad_norm": 0.18776492774486542, + "learning_rate": 1.972469894364431e-05, + "loss": 2.1945, + "step": 5324 + }, + { + "epoch": 2.2971748975630795, + "grad_norm": 0.19602049887180328, + "learning_rate": 1.9701637974710393e-05, + "loss": 2.2251, + "step": 5325 + }, + { + "epoch": 2.2971748975630795, + "eval_loss": 2.0878446102142334, + "eval_runtime": 203.056, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 5325 + }, + { + "epoch": 2.2976062109122277, + "grad_norm": 0.19274592399597168, + "learning_rate": 1.967858845590706e-05, + "loss": 2.0118, + "step": 5326 + }, + { + "epoch": 2.298037524261376, + "grad_norm": 0.1943180114030838, + "learning_rate": 1.965555039200695e-05, + "loss": 2.1096, + "step": 5327 + }, + { + "epoch": 2.2984688376105242, + "grad_norm": 0.18350040912628174, + "learning_rate": 1.9632523787780423e-05, + "loss": 2.0139, + "step": 5328 + }, + { + "epoch": 2.298900150959672, + "grad_norm": 0.1943226009607315, + "learning_rate": 1.9609508647995303e-05, + "loss": 2.1024, + "step": 5329 + }, + { + "epoch": 2.2993314643088203, + "grad_norm": 0.2782226800918579, + "learning_rate": 1.9586504977417136e-05, + "loss": 2.1859, + "step": 5330 + }, + { + "epoch": 2.2997627776579685, + "grad_norm": 0.23133975267410278, + "learning_rate": 1.9563512780809126e-05, + "loss": 2.2521, + "step": 5331 + }, + { + "epoch": 2.300194091007117, + "grad_norm": 0.19578418135643005, + "learning_rate": 1.9540532062932042e-05, + "loss": 2.154, + "step": 5332 + }, + { + "epoch": 2.300625404356265, + "grad_norm": 0.1878523975610733, + "learning_rate": 1.9517562828544316e-05, + "loss": 2.2197, + "step": 5333 + }, + { + "epoch": 2.301056717705413, + "grad_norm": 0.21165266633033752, + "learning_rate": 1.9494605082401894e-05, + "loss": 2.1544, + "step": 5334 + }, + { + "epoch": 2.301488031054561, + "grad_norm": 0.19851690530776978, + "learning_rate": 1.9471658829258517e-05, + "loss": 1.9271, + "step": 5335 + }, + { + "epoch": 2.3019193444037094, + "grad_norm": 0.21829640865325928, + "learning_rate": 1.9448724073865425e-05, + "loss": 2.1683, + "step": 5336 + }, + { + "epoch": 2.3023506577528576, + "grad_norm": 0.19689638912677765, + "learning_rate": 1.9425800820971508e-05, + "loss": 2.2477, + "step": 5337 + }, + { + "epoch": 2.3027819711020054, + "grad_norm": 0.19567584991455078, + "learning_rate": 1.9402889075323293e-05, + "loss": 1.9999, + "step": 5338 + }, + { + "epoch": 2.3032132844511537, + "grad_norm": 0.18084855377674103, + "learning_rate": 1.937998884166488e-05, + "loss": 1.9266, + "step": 5339 + }, + { + "epoch": 2.303644597800302, + "grad_norm": 0.20621658861637115, + "learning_rate": 1.9357100124738037e-05, + "loss": 2.0463, + "step": 5340 + }, + { + "epoch": 2.30407591114945, + "grad_norm": 0.237623929977417, + "learning_rate": 1.9334222929282088e-05, + "loss": 1.9552, + "step": 5341 + }, + { + "epoch": 2.3045072244985985, + "grad_norm": 0.19974792003631592, + "learning_rate": 1.9311357260034075e-05, + "loss": 2.1163, + "step": 5342 + }, + { + "epoch": 2.3049385378477463, + "grad_norm": 0.20718558132648468, + "learning_rate": 1.9288503121728523e-05, + "loss": 2.1902, + "step": 5343 + }, + { + "epoch": 2.3053698511968945, + "grad_norm": 0.21341772377490997, + "learning_rate": 1.9265660519097624e-05, + "loss": 2.2041, + "step": 5344 + }, + { + "epoch": 2.305801164546043, + "grad_norm": 0.20624461770057678, + "learning_rate": 1.9242829456871246e-05, + "loss": 2.0728, + "step": 5345 + }, + { + "epoch": 2.306232477895191, + "grad_norm": 0.21290695667266846, + "learning_rate": 1.9220009939776774e-05, + "loss": 2.233, + "step": 5346 + }, + { + "epoch": 2.306663791244339, + "grad_norm": 0.2021273970603943, + "learning_rate": 1.9197201972539265e-05, + "loss": 2.2376, + "step": 5347 + }, + { + "epoch": 2.307095104593487, + "grad_norm": 0.19745802879333496, + "learning_rate": 1.9174405559881288e-05, + "loss": 2.2321, + "step": 5348 + }, + { + "epoch": 2.3075264179426354, + "grad_norm": 0.2031654715538025, + "learning_rate": 1.9151620706523167e-05, + "loss": 2.0205, + "step": 5349 + }, + { + "epoch": 2.3079577312917836, + "grad_norm": 0.20771507918834686, + "learning_rate": 1.9128847417182718e-05, + "loss": 2.0798, + "step": 5350 + }, + { + "epoch": 2.3079577312917836, + "eval_loss": 2.0877935886383057, + "eval_runtime": 203.6267, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5350 + }, + { + "epoch": 2.308389044640932, + "grad_norm": 0.189523383975029, + "learning_rate": 1.910608569657542e-05, + "loss": 2.2223, + "step": 5351 + }, + { + "epoch": 2.3088203579900797, + "grad_norm": 0.21872511506080627, + "learning_rate": 1.9083335549414318e-05, + "loss": 2.1221, + "step": 5352 + }, + { + "epoch": 2.309251671339228, + "grad_norm": 0.20272767543792725, + "learning_rate": 1.9060596980410086e-05, + "loss": 2.2688, + "step": 5353 + }, + { + "epoch": 2.309682984688376, + "grad_norm": 0.20737117528915405, + "learning_rate": 1.903786999427099e-05, + "loss": 2.1833, + "step": 5354 + }, + { + "epoch": 2.3101142980375244, + "grad_norm": 0.2046089470386505, + "learning_rate": 1.90151545957029e-05, + "loss": 2.2038, + "step": 5355 + }, + { + "epoch": 2.3105456113866722, + "grad_norm": 0.21469274163246155, + "learning_rate": 1.8992450789409294e-05, + "loss": 1.8162, + "step": 5356 + }, + { + "epoch": 2.3109769247358205, + "grad_norm": 0.19832271337509155, + "learning_rate": 1.8969758580091243e-05, + "loss": 2.0864, + "step": 5357 + }, + { + "epoch": 2.3114082380849688, + "grad_norm": 0.19137664139270782, + "learning_rate": 1.8947077972447414e-05, + "loss": 1.9514, + "step": 5358 + }, + { + "epoch": 2.311839551434117, + "grad_norm": 0.1882704496383667, + "learning_rate": 1.892440897117405e-05, + "loss": 2.0432, + "step": 5359 + }, + { + "epoch": 2.3122708647832653, + "grad_norm": 0.18820488452911377, + "learning_rate": 1.8901751580965084e-05, + "loss": 2.1626, + "step": 5360 + }, + { + "epoch": 2.312702178132413, + "grad_norm": 0.21664069592952728, + "learning_rate": 1.887910580651191e-05, + "loss": 2.1695, + "step": 5361 + }, + { + "epoch": 2.3131334914815613, + "grad_norm": 0.19514897465705872, + "learning_rate": 1.8856471652503597e-05, + "loss": 2.0783, + "step": 5362 + }, + { + "epoch": 2.3135648048307096, + "grad_norm": 0.22643904387950897, + "learning_rate": 1.8833849123626813e-05, + "loss": 2.2025, + "step": 5363 + }, + { + "epoch": 2.313996118179858, + "grad_norm": 0.17537686228752136, + "learning_rate": 1.8811238224565796e-05, + "loss": 2.1062, + "step": 5364 + }, + { + "epoch": 2.3144274315290057, + "grad_norm": 0.19371572136878967, + "learning_rate": 1.8788638960002398e-05, + "loss": 2.0133, + "step": 5365 + }, + { + "epoch": 2.314858744878154, + "grad_norm": 0.18847690522670746, + "learning_rate": 1.876605133461597e-05, + "loss": 1.9666, + "step": 5366 + }, + { + "epoch": 2.315290058227302, + "grad_norm": 0.1891636997461319, + "learning_rate": 1.8743475353083596e-05, + "loss": 2.1242, + "step": 5367 + }, + { + "epoch": 2.3157213715764504, + "grad_norm": 0.18955299258232117, + "learning_rate": 1.8720911020079877e-05, + "loss": 2.1174, + "step": 5368 + }, + { + "epoch": 2.3161526849255987, + "grad_norm": 0.18408434092998505, + "learning_rate": 1.8698358340276976e-05, + "loss": 1.9551, + "step": 5369 + }, + { + "epoch": 2.3165839982747465, + "grad_norm": 0.20017340779304504, + "learning_rate": 1.8675817318344687e-05, + "loss": 2.141, + "step": 5370 + }, + { + "epoch": 2.3170153116238947, + "grad_norm": 0.18992261588573456, + "learning_rate": 1.865328795895037e-05, + "loss": 2.0916, + "step": 5371 + }, + { + "epoch": 2.317446624973043, + "grad_norm": 0.2128012776374817, + "learning_rate": 1.863077026675899e-05, + "loss": 2.2291, + "step": 5372 + }, + { + "epoch": 2.3178779383221912, + "grad_norm": 0.20145027339458466, + "learning_rate": 1.8608264246433036e-05, + "loss": 2.3127, + "step": 5373 + }, + { + "epoch": 2.318309251671339, + "grad_norm": 0.18959259986877441, + "learning_rate": 1.8585769902632714e-05, + "loss": 2.2089, + "step": 5374 + }, + { + "epoch": 2.3187405650204873, + "grad_norm": 0.19251097738742828, + "learning_rate": 1.8563287240015635e-05, + "loss": 2.143, + "step": 5375 + }, + { + "epoch": 2.3187405650204873, + "eval_loss": 2.08791184425354, + "eval_runtime": 203.5372, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5375 + }, + { + "epoch": 2.3191718783696356, + "grad_norm": 0.1923358142375946, + "learning_rate": 1.85408162632371e-05, + "loss": 1.9816, + "step": 5376 + }, + { + "epoch": 2.319603191718784, + "grad_norm": 0.17821118235588074, + "learning_rate": 1.8518356976950012e-05, + "loss": 1.9172, + "step": 5377 + }, + { + "epoch": 2.320034505067932, + "grad_norm": 0.22167186439037323, + "learning_rate": 1.8495909385804776e-05, + "loss": 2.3065, + "step": 5378 + }, + { + "epoch": 2.32046581841708, + "grad_norm": 0.18814527988433838, + "learning_rate": 1.847347349444944e-05, + "loss": 2.1164, + "step": 5379 + }, + { + "epoch": 2.320897131766228, + "grad_norm": 0.1985539346933365, + "learning_rate": 1.845104930752954e-05, + "loss": 2.2795, + "step": 5380 + }, + { + "epoch": 2.3213284451153764, + "grad_norm": 0.20663782954216003, + "learning_rate": 1.842863682968831e-05, + "loss": 2.1005, + "step": 5381 + }, + { + "epoch": 2.3217597584645246, + "grad_norm": 0.2148127406835556, + "learning_rate": 1.8406236065566488e-05, + "loss": 2.0653, + "step": 5382 + }, + { + "epoch": 2.3221910718136725, + "grad_norm": 0.19326218962669373, + "learning_rate": 1.838384701980238e-05, + "loss": 2.132, + "step": 5383 + }, + { + "epoch": 2.3226223851628207, + "grad_norm": 0.19003017246723175, + "learning_rate": 1.836146969703189e-05, + "loss": 1.9125, + "step": 5384 + }, + { + "epoch": 2.323053698511969, + "grad_norm": 0.17955321073532104, + "learning_rate": 1.8339104101888488e-05, + "loss": 2.1892, + "step": 5385 + }, + { + "epoch": 2.3234850118611172, + "grad_norm": 0.18654711544513702, + "learning_rate": 1.8316750239003218e-05, + "loss": 2.2338, + "step": 5386 + }, + { + "epoch": 2.3239163252102655, + "grad_norm": 0.2069181203842163, + "learning_rate": 1.829440811300466e-05, + "loss": 2.2528, + "step": 5387 + }, + { + "epoch": 2.3243476385594133, + "grad_norm": 0.22007881104946136, + "learning_rate": 1.8272077728519077e-05, + "loss": 2.3733, + "step": 5388 + }, + { + "epoch": 2.3247789519085615, + "grad_norm": 0.2045450210571289, + "learning_rate": 1.8249759090170134e-05, + "loss": 2.2287, + "step": 5389 + }, + { + "epoch": 2.32521026525771, + "grad_norm": 0.19702892005443573, + "learning_rate": 1.8227452202579168e-05, + "loss": 2.1169, + "step": 5390 + }, + { + "epoch": 2.325641578606858, + "grad_norm": 0.17602592706680298, + "learning_rate": 1.82051570703651e-05, + "loss": 2.1293, + "step": 5391 + }, + { + "epoch": 2.326072891956006, + "grad_norm": 0.20260587334632874, + "learning_rate": 1.818287369814435e-05, + "loss": 2.2672, + "step": 5392 + }, + { + "epoch": 2.326504205305154, + "grad_norm": 0.20817983150482178, + "learning_rate": 1.8160602090530972e-05, + "loss": 2.1941, + "step": 5393 + }, + { + "epoch": 2.3269355186543024, + "grad_norm": 0.18919779360294342, + "learning_rate": 1.8138342252136467e-05, + "loss": 2.1362, + "step": 5394 + }, + { + "epoch": 2.3273668320034506, + "grad_norm": 0.19633108377456665, + "learning_rate": 1.811609418757005e-05, + "loss": 2.2055, + "step": 5395 + }, + { + "epoch": 2.327798145352599, + "grad_norm": 0.215801402926445, + "learning_rate": 1.8093857901438404e-05, + "loss": 2.1827, + "step": 5396 + }, + { + "epoch": 2.3282294587017467, + "grad_norm": 0.19851261377334595, + "learning_rate": 1.8071633398345804e-05, + "loss": 1.9047, + "step": 5397 + }, + { + "epoch": 2.328660772050895, + "grad_norm": 0.20020222663879395, + "learning_rate": 1.8049420682894054e-05, + "loss": 2.3035, + "step": 5398 + }, + { + "epoch": 2.329092085400043, + "grad_norm": 0.21065831184387207, + "learning_rate": 1.802721975968256e-05, + "loss": 2.1154, + "step": 5399 + }, + { + "epoch": 2.3295233987491915, + "grad_norm": 0.20882493257522583, + "learning_rate": 1.8005030633308252e-05, + "loss": 2.1965, + "step": 5400 + }, + { + "epoch": 2.3295233987491915, + "eval_loss": 2.0880227088928223, + "eval_runtime": 203.6302, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5400 + }, + { + "epoch": 2.3299547120983393, + "grad_norm": 0.1875244826078415, + "learning_rate": 1.798285330836564e-05, + "loss": 2.3193, + "step": 5401 + }, + { + "epoch": 2.3303860254474875, + "grad_norm": 0.19934353232383728, + "learning_rate": 1.7960687789446782e-05, + "loss": 2.0483, + "step": 5402 + }, + { + "epoch": 2.3308173387966358, + "grad_norm": 0.20099350810050964, + "learning_rate": 1.7938534081141288e-05, + "loss": 1.9604, + "step": 5403 + }, + { + "epoch": 2.331248652145784, + "grad_norm": 0.195001482963562, + "learning_rate": 1.791639218803633e-05, + "loss": 2.0237, + "step": 5404 + }, + { + "epoch": 2.3316799654949323, + "grad_norm": 0.19568021595478058, + "learning_rate": 1.789426211471659e-05, + "loss": 2.1974, + "step": 5405 + }, + { + "epoch": 2.33211127884408, + "grad_norm": 0.1914471983909607, + "learning_rate": 1.7872143865764436e-05, + "loss": 2.2391, + "step": 5406 + }, + { + "epoch": 2.3325425921932283, + "grad_norm": 0.19289901852607727, + "learning_rate": 1.7850037445759608e-05, + "loss": 2.1496, + "step": 5407 + }, + { + "epoch": 2.3329739055423766, + "grad_norm": 0.18522243201732635, + "learning_rate": 1.7827942859279482e-05, + "loss": 2.1157, + "step": 5408 + }, + { + "epoch": 2.333405218891525, + "grad_norm": 0.18957099318504333, + "learning_rate": 1.780586011089904e-05, + "loss": 1.9777, + "step": 5409 + }, + { + "epoch": 2.3338365322406727, + "grad_norm": 0.19008250534534454, + "learning_rate": 1.7783789205190717e-05, + "loss": 2.056, + "step": 5410 + }, + { + "epoch": 2.334267845589821, + "grad_norm": 0.19285881519317627, + "learning_rate": 1.776173014672457e-05, + "loss": 2.3684, + "step": 5411 + }, + { + "epoch": 2.334699158938969, + "grad_norm": 0.20712214708328247, + "learning_rate": 1.77396829400681e-05, + "loss": 2.3285, + "step": 5412 + }, + { + "epoch": 2.3351304722881174, + "grad_norm": 0.20356346666812897, + "learning_rate": 1.7717647589786486e-05, + "loss": 1.9902, + "step": 5413 + }, + { + "epoch": 2.3355617856372657, + "grad_norm": 0.19335150718688965, + "learning_rate": 1.7695624100442372e-05, + "loss": 2.13, + "step": 5414 + }, + { + "epoch": 2.3359930989864135, + "grad_norm": 0.2026914805173874, + "learning_rate": 1.7673612476595962e-05, + "loss": 2.1992, + "step": 5415 + }, + { + "epoch": 2.3364244123355618, + "grad_norm": 0.20388589799404144, + "learning_rate": 1.7651612722804986e-05, + "loss": 2.1595, + "step": 5416 + }, + { + "epoch": 2.33685572568471, + "grad_norm": 0.1827160269021988, + "learning_rate": 1.7629624843624758e-05, + "loss": 2.1954, + "step": 5417 + }, + { + "epoch": 2.3372870390338583, + "grad_norm": 0.22113530337810516, + "learning_rate": 1.7607648843608097e-05, + "loss": 2.1259, + "step": 5418 + }, + { + "epoch": 2.337718352383006, + "grad_norm": 0.2118062674999237, + "learning_rate": 1.7585684727305348e-05, + "loss": 2.0906, + "step": 5419 + }, + { + "epoch": 2.3381496657321543, + "grad_norm": 0.18365009129047394, + "learning_rate": 1.7563732499264492e-05, + "loss": 2.0491, + "step": 5420 + }, + { + "epoch": 2.3385809790813026, + "grad_norm": 0.2063271403312683, + "learning_rate": 1.754179216403092e-05, + "loss": 2.1538, + "step": 5421 + }, + { + "epoch": 2.339012292430451, + "grad_norm": 0.20388810336589813, + "learning_rate": 1.7519863726147607e-05, + "loss": 2.0918, + "step": 5422 + }, + { + "epoch": 2.339443605779599, + "grad_norm": 0.21293078362941742, + "learning_rate": 1.7497947190155118e-05, + "loss": 2.2654, + "step": 5423 + }, + { + "epoch": 2.339874919128747, + "grad_norm": 0.19700607657432556, + "learning_rate": 1.7476042560591505e-05, + "loss": 2.1123, + "step": 5424 + }, + { + "epoch": 2.340306232477895, + "grad_norm": 0.19425487518310547, + "learning_rate": 1.7454149841992366e-05, + "loss": 2.2007, + "step": 5425 + }, + { + "epoch": 2.340306232477895, + "eval_loss": 2.087836265563965, + "eval_runtime": 200.5705, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 5425 + }, + { + "epoch": 2.3407375458270434, + "grad_norm": 0.2275095134973526, + "learning_rate": 1.7432269038890773e-05, + "loss": 2.0679, + "step": 5426 + }, + { + "epoch": 2.3411688591761917, + "grad_norm": 0.23057441413402557, + "learning_rate": 1.7410400155817445e-05, + "loss": 2.1185, + "step": 5427 + }, + { + "epoch": 2.3416001725253395, + "grad_norm": 0.17289194464683533, + "learning_rate": 1.738854319730055e-05, + "loss": 1.9679, + "step": 5428 + }, + { + "epoch": 2.3420314858744877, + "grad_norm": 0.18307892978191376, + "learning_rate": 1.736669816786582e-05, + "loss": 1.9135, + "step": 5429 + }, + { + "epoch": 2.342462799223636, + "grad_norm": 0.19872839748859406, + "learning_rate": 1.73448650720365e-05, + "loss": 2.0873, + "step": 5430 + }, + { + "epoch": 2.3428941125727842, + "grad_norm": 0.19778376817703247, + "learning_rate": 1.732304391433337e-05, + "loss": 2.2195, + "step": 5431 + }, + { + "epoch": 2.3433254259219325, + "grad_norm": 0.2171076387166977, + "learning_rate": 1.7301234699274747e-05, + "loss": 2.2742, + "step": 5432 + }, + { + "epoch": 2.3437567392710803, + "grad_norm": 0.18240311741828918, + "learning_rate": 1.727943743137646e-05, + "loss": 2.073, + "step": 5433 + }, + { + "epoch": 2.3441880526202286, + "grad_norm": 0.19008196890354156, + "learning_rate": 1.725765211515188e-05, + "loss": 2.0548, + "step": 5434 + }, + { + "epoch": 2.344619365969377, + "grad_norm": 18.04897117614746, + "learning_rate": 1.7235878755111883e-05, + "loss": 2.2369, + "step": 5435 + }, + { + "epoch": 2.345050679318525, + "grad_norm": 0.18793705105781555, + "learning_rate": 1.7214117355764868e-05, + "loss": 1.9845, + "step": 5436 + }, + { + "epoch": 2.345481992667673, + "grad_norm": 0.1938720941543579, + "learning_rate": 1.7192367921616805e-05, + "loss": 2.145, + "step": 5437 + }, + { + "epoch": 2.345913306016821, + "grad_norm": 0.21334202587604523, + "learning_rate": 1.7170630457171166e-05, + "loss": 2.3754, + "step": 5438 + }, + { + "epoch": 2.3463446193659694, + "grad_norm": 0.1999683380126953, + "learning_rate": 1.714890496692888e-05, + "loss": 2.2846, + "step": 5439 + }, + { + "epoch": 2.3467759327151176, + "grad_norm": 0.2468402087688446, + "learning_rate": 1.712719145538845e-05, + "loss": 2.1985, + "step": 5440 + }, + { + "epoch": 2.347207246064266, + "grad_norm": 0.19367870688438416, + "learning_rate": 1.710548992704593e-05, + "loss": 2.2734, + "step": 5441 + }, + { + "epoch": 2.3476385594134137, + "grad_norm": 0.1968536078929901, + "learning_rate": 1.7083800386394845e-05, + "loss": 1.9915, + "step": 5442 + }, + { + "epoch": 2.348069872762562, + "grad_norm": 0.20344178378582, + "learning_rate": 1.7062122837926263e-05, + "loss": 2.057, + "step": 5443 + }, + { + "epoch": 2.34850118611171, + "grad_norm": 0.19281743466854095, + "learning_rate": 1.7040457286128732e-05, + "loss": 2.0647, + "step": 5444 + }, + { + "epoch": 2.3489324994608585, + "grad_norm": 0.1950780153274536, + "learning_rate": 1.7018803735488356e-05, + "loss": 2.2951, + "step": 5445 + }, + { + "epoch": 2.3493638128100063, + "grad_norm": 0.2115524411201477, + "learning_rate": 1.6997162190488745e-05, + "loss": 2.0425, + "step": 5446 + }, + { + "epoch": 2.3497951261591545, + "grad_norm": 0.18657691776752472, + "learning_rate": 1.6975532655611014e-05, + "loss": 2.0386, + "step": 5447 + }, + { + "epoch": 2.350226439508303, + "grad_norm": 0.19606566429138184, + "learning_rate": 1.695391513533378e-05, + "loss": 2.0809, + "step": 5448 + }, + { + "epoch": 2.350657752857451, + "grad_norm": 0.18347160518169403, + "learning_rate": 1.6932309634133203e-05, + "loss": 2.3827, + "step": 5449 + }, + { + "epoch": 2.3510890662065993, + "grad_norm": 0.19450697302818298, + "learning_rate": 1.6910716156482934e-05, + "loss": 2.3068, + "step": 5450 + }, + { + "epoch": 2.3510890662065993, + "eval_loss": 2.088000774383545, + "eval_runtime": 201.0248, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 5450 + }, + { + "epoch": 2.351520379555747, + "grad_norm": 0.19642473757266998, + "learning_rate": 1.6889134706854118e-05, + "loss": 2.1511, + "step": 5451 + }, + { + "epoch": 2.3519516929048954, + "grad_norm": 0.19885718822479248, + "learning_rate": 1.686756528971549e-05, + "loss": 2.2092, + "step": 5452 + }, + { + "epoch": 2.3523830062540436, + "grad_norm": 0.20507009327411652, + "learning_rate": 1.6846007909533178e-05, + "loss": 2.1743, + "step": 5453 + }, + { + "epoch": 2.352814319603192, + "grad_norm": 0.20062431693077087, + "learning_rate": 1.6824462570770865e-05, + "loss": 2.1195, + "step": 5454 + }, + { + "epoch": 2.3532456329523397, + "grad_norm": 0.2013835459947586, + "learning_rate": 1.68029292778898e-05, + "loss": 2.0451, + "step": 5455 + }, + { + "epoch": 2.353676946301488, + "grad_norm": 0.18703527748584747, + "learning_rate": 1.6781408035348657e-05, + "loss": 2.0735, + "step": 5456 + }, + { + "epoch": 2.354108259650636, + "grad_norm": 0.19562098383903503, + "learning_rate": 1.6759898847603668e-05, + "loss": 2.13, + "step": 5457 + }, + { + "epoch": 2.3545395729997844, + "grad_norm": 0.2325151264667511, + "learning_rate": 1.6738401719108477e-05, + "loss": 2.1173, + "step": 5458 + }, + { + "epoch": 2.3549708863489327, + "grad_norm": 0.19891884922981262, + "learning_rate": 1.6716916654314374e-05, + "loss": 2.1029, + "step": 5459 + }, + { + "epoch": 2.3554021996980805, + "grad_norm": 0.18463841080665588, + "learning_rate": 1.669544365767006e-05, + "loss": 2.0109, + "step": 5460 + }, + { + "epoch": 2.3558335130472288, + "grad_norm": 0.20649510622024536, + "learning_rate": 1.6673982733621735e-05, + "loss": 2.1988, + "step": 5461 + }, + { + "epoch": 2.356264826396377, + "grad_norm": 0.185276597738266, + "learning_rate": 1.6652533886613127e-05, + "loss": 2.1077, + "step": 5462 + }, + { + "epoch": 2.3566961397455253, + "grad_norm": 0.19789348542690277, + "learning_rate": 1.6631097121085463e-05, + "loss": 2.0926, + "step": 5463 + }, + { + "epoch": 2.357127453094673, + "grad_norm": 0.19963330030441284, + "learning_rate": 1.6609672441477446e-05, + "loss": 1.9511, + "step": 5464 + }, + { + "epoch": 2.3575587664438213, + "grad_norm": 0.2180710732936859, + "learning_rate": 1.658825985222527e-05, + "loss": 2.1496, + "step": 5465 + }, + { + "epoch": 2.3579900797929696, + "grad_norm": 0.19674089550971985, + "learning_rate": 1.6566859357762722e-05, + "loss": 2.0358, + "step": 5466 + }, + { + "epoch": 2.358421393142118, + "grad_norm": 0.19395115971565247, + "learning_rate": 1.6545470962520933e-05, + "loss": 2.1949, + "step": 5467 + }, + { + "epoch": 2.358852706491266, + "grad_norm": 0.19646000862121582, + "learning_rate": 1.6524094670928608e-05, + "loss": 2.1848, + "step": 5468 + }, + { + "epoch": 2.359284019840414, + "grad_norm": 0.17927643656730652, + "learning_rate": 1.6502730487411986e-05, + "loss": 2.1109, + "step": 5469 + }, + { + "epoch": 2.359715333189562, + "grad_norm": 0.19817306101322174, + "learning_rate": 1.6481378416394733e-05, + "loss": 2.2828, + "step": 5470 + }, + { + "epoch": 2.3601466465387104, + "grad_norm": 0.20632949471473694, + "learning_rate": 1.646003846229805e-05, + "loss": 2.2486, + "step": 5471 + }, + { + "epoch": 2.3605779598878587, + "grad_norm": 0.1893763691186905, + "learning_rate": 1.6438710629540534e-05, + "loss": 2.1597, + "step": 5472 + }, + { + "epoch": 2.3610092732370065, + "grad_norm": 0.20030063390731812, + "learning_rate": 1.6417394922538423e-05, + "loss": 1.9663, + "step": 5473 + }, + { + "epoch": 2.3614405865861547, + "grad_norm": 0.18441982567310333, + "learning_rate": 1.6396091345705346e-05, + "loss": 2.0616, + "step": 5474 + }, + { + "epoch": 2.361871899935303, + "grad_norm": 0.2071865200996399, + "learning_rate": 1.6374799903452438e-05, + "loss": 2.1528, + "step": 5475 + }, + { + "epoch": 2.361871899935303, + "eval_loss": 2.0878982543945312, + "eval_runtime": 204.4375, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5475 + }, + { + "epoch": 2.3623032132844513, + "grad_norm": 0.21171632409095764, + "learning_rate": 1.6353520600188315e-05, + "loss": 2.2473, + "step": 5476 + }, + { + "epoch": 2.3627345266335995, + "grad_norm": 0.21455693244934082, + "learning_rate": 1.633225344031911e-05, + "loss": 2.3628, + "step": 5477 + }, + { + "epoch": 2.3631658399827473, + "grad_norm": 0.195206418633461, + "learning_rate": 1.6310998428248404e-05, + "loss": 2.0825, + "step": 5478 + }, + { + "epoch": 2.3635971533318956, + "grad_norm": 0.18683667480945587, + "learning_rate": 1.6289755568377286e-05, + "loss": 1.8842, + "step": 5479 + }, + { + "epoch": 2.364028466681044, + "grad_norm": 0.2660259008407593, + "learning_rate": 1.626852486510432e-05, + "loss": 2.1353, + "step": 5480 + }, + { + "epoch": 2.364459780030192, + "grad_norm": 0.20735685527324677, + "learning_rate": 1.624730632282556e-05, + "loss": 2.182, + "step": 5481 + }, + { + "epoch": 2.36489109337934, + "grad_norm": 0.20878545939922333, + "learning_rate": 1.622609994593451e-05, + "loss": 2.1041, + "step": 5482 + }, + { + "epoch": 2.365322406728488, + "grad_norm": 0.19899587333202362, + "learning_rate": 1.6204905738822233e-05, + "loss": 2.0607, + "step": 5483 + }, + { + "epoch": 2.3657537200776364, + "grad_norm": 0.19157008826732635, + "learning_rate": 1.618372370587721e-05, + "loss": 2.1185, + "step": 5484 + }, + { + "epoch": 2.3661850334267847, + "grad_norm": 0.19703416526317596, + "learning_rate": 1.616255385148538e-05, + "loss": 2.1078, + "step": 5485 + }, + { + "epoch": 2.366616346775933, + "grad_norm": 0.1869412213563919, + "learning_rate": 1.6141396180030192e-05, + "loss": 2.1147, + "step": 5486 + }, + { + "epoch": 2.3670476601250807, + "grad_norm": 0.2072449028491974, + "learning_rate": 1.612025069589261e-05, + "loss": 2.1778, + "step": 5487 + }, + { + "epoch": 2.367478973474229, + "grad_norm": 0.2161872535943985, + "learning_rate": 1.609911740345102e-05, + "loss": 2.189, + "step": 5488 + }, + { + "epoch": 2.3679102868233772, + "grad_norm": 0.19158300757408142, + "learning_rate": 1.6077996307081302e-05, + "loss": 2.1283, + "step": 5489 + }, + { + "epoch": 2.3683416001725255, + "grad_norm": 0.19557763636112213, + "learning_rate": 1.605688741115681e-05, + "loss": 2.0445, + "step": 5490 + }, + { + "epoch": 2.3687729135216733, + "grad_norm": 0.19908276200294495, + "learning_rate": 1.603579072004837e-05, + "loss": 2.1983, + "step": 5491 + }, + { + "epoch": 2.3692042268708216, + "grad_norm": 0.21621151268482208, + "learning_rate": 1.6014706238124276e-05, + "loss": 2.148, + "step": 5492 + }, + { + "epoch": 2.36963554021997, + "grad_norm": 0.18872621655464172, + "learning_rate": 1.599363396975031e-05, + "loss": 2.0592, + "step": 5493 + }, + { + "epoch": 2.370066853569118, + "grad_norm": 0.20846761763095856, + "learning_rate": 1.597257391928972e-05, + "loss": 2.1044, + "step": 5494 + }, + { + "epoch": 2.3704981669182663, + "grad_norm": 0.18615426123142242, + "learning_rate": 1.595152609110321e-05, + "loss": 1.8943, + "step": 5495 + }, + { + "epoch": 2.370929480267414, + "grad_norm": 0.18413949012756348, + "learning_rate": 1.5930490489548962e-05, + "loss": 2.2685, + "step": 5496 + }, + { + "epoch": 2.3713607936165624, + "grad_norm": 0.21032410860061646, + "learning_rate": 1.5909467118982617e-05, + "loss": 2.0688, + "step": 5497 + }, + { + "epoch": 2.3717921069657106, + "grad_norm": 0.19261309504508972, + "learning_rate": 1.5888455983757345e-05, + "loss": 2.1473, + "step": 5498 + }, + { + "epoch": 2.372223420314859, + "grad_norm": 0.19529931247234344, + "learning_rate": 1.5867457088223675e-05, + "loss": 2.1682, + "step": 5499 + }, + { + "epoch": 2.3726547336640067, + "grad_norm": 0.20554889738559723, + "learning_rate": 1.5846470436729655e-05, + "loss": 2.2578, + "step": 5500 + }, + { + "epoch": 2.3726547336640067, + "eval_loss": 2.087747573852539, + "eval_runtime": 204.2922, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5500 + }, + { + "epoch": 2.373086047013155, + "grad_norm": 0.21087957918643951, + "learning_rate": 1.582549603362085e-05, + "loss": 2.0516, + "step": 5501 + }, + { + "epoch": 2.373517360362303, + "grad_norm": 0.21421092748641968, + "learning_rate": 1.580453388324021e-05, + "loss": 2.0853, + "step": 5502 + }, + { + "epoch": 2.3739486737114515, + "grad_norm": 0.20026853680610657, + "learning_rate": 1.5783583989928194e-05, + "loss": 2.1586, + "step": 5503 + }, + { + "epoch": 2.3743799870605997, + "grad_norm": 0.2020842432975769, + "learning_rate": 1.5762646358022653e-05, + "loss": 2.2867, + "step": 5504 + }, + { + "epoch": 2.3748113004097475, + "grad_norm": 0.20774178206920624, + "learning_rate": 1.574172099185902e-05, + "loss": 2.273, + "step": 5505 + }, + { + "epoch": 2.375242613758896, + "grad_norm": 0.20067383348941803, + "learning_rate": 1.572080789577009e-05, + "loss": 2.1054, + "step": 5506 + }, + { + "epoch": 2.375673927108044, + "grad_norm": 0.17484228312969208, + "learning_rate": 1.5699907074086155e-05, + "loss": 2.2669, + "step": 5507 + }, + { + "epoch": 2.3761052404571923, + "grad_norm": 0.1788690686225891, + "learning_rate": 1.567901853113495e-05, + "loss": 1.8956, + "step": 5508 + }, + { + "epoch": 2.37653655380634, + "grad_norm": 0.23119118809700012, + "learning_rate": 1.565814227124168e-05, + "loss": 2.1177, + "step": 5509 + }, + { + "epoch": 2.3769678671554884, + "grad_norm": 0.1931457668542862, + "learning_rate": 1.5637278298729008e-05, + "loss": 2.0814, + "step": 5510 + }, + { + "epoch": 2.3773991805046366, + "grad_norm": 0.20546475052833557, + "learning_rate": 1.5616426617917026e-05, + "loss": 2.2013, + "step": 5511 + }, + { + "epoch": 2.377830493853785, + "grad_norm": 0.19993987679481506, + "learning_rate": 1.5595587233123357e-05, + "loss": 2.2371, + "step": 5512 + }, + { + "epoch": 2.378261807202933, + "grad_norm": 0.16741925477981567, + "learning_rate": 1.5574760148662965e-05, + "loss": 2.0317, + "step": 5513 + }, + { + "epoch": 2.378693120552081, + "grad_norm": 0.19004949927330017, + "learning_rate": 1.5553945368848334e-05, + "loss": 2.3056, + "step": 5514 + }, + { + "epoch": 2.379124433901229, + "grad_norm": 0.20242708921432495, + "learning_rate": 1.5533142897989413e-05, + "loss": 2.2824, + "step": 5515 + }, + { + "epoch": 2.3795557472503774, + "grad_norm": 0.20056506991386414, + "learning_rate": 1.551235274039359e-05, + "loss": 2.1467, + "step": 5516 + }, + { + "epoch": 2.3799870605995257, + "grad_norm": 0.24377302825450897, + "learning_rate": 1.5491574900365687e-05, + "loss": 2.2118, + "step": 5517 + }, + { + "epoch": 2.3804183739486735, + "grad_norm": 0.18310534954071045, + "learning_rate": 1.547080938220794e-05, + "loss": 1.7325, + "step": 5518 + }, + { + "epoch": 2.3808496872978218, + "grad_norm": 0.17511674761772156, + "learning_rate": 1.5450056190220123e-05, + "loss": 1.9583, + "step": 5519 + }, + { + "epoch": 2.38128100064697, + "grad_norm": 0.19512279331684113, + "learning_rate": 1.5429315328699408e-05, + "loss": 2.0862, + "step": 5520 + }, + { + "epoch": 2.3817123139961183, + "grad_norm": 0.20123524963855743, + "learning_rate": 1.54085868019404e-05, + "loss": 1.8113, + "step": 5521 + }, + { + "epoch": 2.3821436273452665, + "grad_norm": 0.1939847469329834, + "learning_rate": 1.5387870614235178e-05, + "loss": 2.1032, + "step": 5522 + }, + { + "epoch": 2.3825749406944143, + "grad_norm": 0.20574967563152313, + "learning_rate": 1.5367166769873255e-05, + "loss": 2.0414, + "step": 5523 + }, + { + "epoch": 2.3830062540435626, + "grad_norm": 0.20114852488040924, + "learning_rate": 1.534647527314159e-05, + "loss": 2.2412, + "step": 5524 + }, + { + "epoch": 2.383437567392711, + "grad_norm": 0.1988348662853241, + "learning_rate": 1.5325796128324574e-05, + "loss": 2.0585, + "step": 5525 + }, + { + "epoch": 2.383437567392711, + "eval_loss": 2.0880467891693115, + "eval_runtime": 203.5213, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5525 + }, + { + "epoch": 2.383868880741859, + "grad_norm": 0.20394539833068848, + "learning_rate": 1.5305129339704057e-05, + "loss": 2.1355, + "step": 5526 + }, + { + "epoch": 2.384300194091007, + "grad_norm": 0.20129942893981934, + "learning_rate": 1.528447491155932e-05, + "loss": 2.1748, + "step": 5527 + }, + { + "epoch": 2.384731507440155, + "grad_norm": 0.2347383201122284, + "learning_rate": 1.526383284816707e-05, + "loss": 1.9031, + "step": 5528 + }, + { + "epoch": 2.3851628207893034, + "grad_norm": 0.1985783576965332, + "learning_rate": 1.5243203153801512e-05, + "loss": 1.986, + "step": 5529 + }, + { + "epoch": 2.3855941341384517, + "grad_norm": 0.20156444609165192, + "learning_rate": 1.5222585832734258e-05, + "loss": 2.1152, + "step": 5530 + }, + { + "epoch": 2.3860254474876, + "grad_norm": 0.2207607924938202, + "learning_rate": 1.5201980889234292e-05, + "loss": 2.1955, + "step": 5531 + }, + { + "epoch": 2.3864567608367477, + "grad_norm": 0.1918589174747467, + "learning_rate": 1.5181388327568109e-05, + "loss": 2.2195, + "step": 5532 + }, + { + "epoch": 2.386888074185896, + "grad_norm": 0.20215705037117004, + "learning_rate": 1.5160808151999657e-05, + "loss": 2.0159, + "step": 5533 + }, + { + "epoch": 2.3873193875350442, + "grad_norm": 0.19671401381492615, + "learning_rate": 1.5140240366790268e-05, + "loss": 2.1159, + "step": 5534 + }, + { + "epoch": 2.3877507008841925, + "grad_norm": 0.18804560601711273, + "learning_rate": 1.511968497619873e-05, + "loss": 2.1538, + "step": 5535 + }, + { + "epoch": 2.3881820142333403, + "grad_norm": 0.19151148200035095, + "learning_rate": 1.509914198448125e-05, + "loss": 2.1489, + "step": 5536 + }, + { + "epoch": 2.3886133275824886, + "grad_norm": 0.19355329871177673, + "learning_rate": 1.5078611395891488e-05, + "loss": 2.0815, + "step": 5537 + }, + { + "epoch": 2.389044640931637, + "grad_norm": 0.19582180678844452, + "learning_rate": 1.5058093214680533e-05, + "loss": 1.8665, + "step": 5538 + }, + { + "epoch": 2.389475954280785, + "grad_norm": 0.20649458467960358, + "learning_rate": 1.503758744509688e-05, + "loss": 2.1585, + "step": 5539 + }, + { + "epoch": 2.3899072676299333, + "grad_norm": 0.21218176186084747, + "learning_rate": 1.5017094091386497e-05, + "loss": 2.1526, + "step": 5540 + }, + { + "epoch": 2.390338580979081, + "grad_norm": 0.19842740893363953, + "learning_rate": 1.4996613157792733e-05, + "loss": 2.0773, + "step": 5541 + }, + { + "epoch": 2.3907698943282294, + "grad_norm": 0.1913827359676361, + "learning_rate": 1.4976144648556405e-05, + "loss": 2.2597, + "step": 5542 + }, + { + "epoch": 2.3912012076773776, + "grad_norm": 0.19794151186943054, + "learning_rate": 1.4955688567915718e-05, + "loss": 2.1475, + "step": 5543 + }, + { + "epoch": 2.391632521026526, + "grad_norm": 0.2209303081035614, + "learning_rate": 1.493524492010639e-05, + "loss": 2.0839, + "step": 5544 + }, + { + "epoch": 2.3920638343756737, + "grad_norm": 0.21582293510437012, + "learning_rate": 1.4914813709361432e-05, + "loss": 2.1666, + "step": 5545 + }, + { + "epoch": 2.392495147724822, + "grad_norm": 0.191584050655365, + "learning_rate": 1.4894394939911358e-05, + "loss": 2.0405, + "step": 5546 + }, + { + "epoch": 2.3929264610739702, + "grad_norm": 0.19593264162540436, + "learning_rate": 1.4873988615984139e-05, + "loss": 2.1676, + "step": 5547 + }, + { + "epoch": 2.3933577744231185, + "grad_norm": 0.19111260771751404, + "learning_rate": 1.4853594741805102e-05, + "loss": 2.2233, + "step": 5548 + }, + { + "epoch": 2.3937890877722667, + "grad_norm": 0.2119603306055069, + "learning_rate": 1.4833213321597046e-05, + "loss": 2.1493, + "step": 5549 + }, + { + "epoch": 2.3942204011214145, + "grad_norm": 0.2001449316740036, + "learning_rate": 1.4812844359580101e-05, + "loss": 2.0279, + "step": 5550 + }, + { + "epoch": 2.3942204011214145, + "eval_loss": 2.087592601776123, + "eval_runtime": 203.9108, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5550 + }, + { + "epoch": 2.394651714470563, + "grad_norm": 0.2072679102420807, + "learning_rate": 1.4792487859971944e-05, + "loss": 2.2792, + "step": 5551 + }, + { + "epoch": 2.395083027819711, + "grad_norm": 0.18107175827026367, + "learning_rate": 1.4772143826987595e-05, + "loss": 1.9677, + "step": 5552 + }, + { + "epoch": 2.3955143411688593, + "grad_norm": 0.19043400883674622, + "learning_rate": 1.475181226483951e-05, + "loss": 2.1645, + "step": 5553 + }, + { + "epoch": 2.395945654518007, + "grad_norm": 0.18325147032737732, + "learning_rate": 1.4731493177737564e-05, + "loss": 2.2081, + "step": 5554 + }, + { + "epoch": 2.3963769678671554, + "grad_norm": 0.26620277762413025, + "learning_rate": 1.471118656988903e-05, + "loss": 2.0327, + "step": 5555 + }, + { + "epoch": 2.3968082812163036, + "grad_norm": 0.19313527643680573, + "learning_rate": 1.469089244549863e-05, + "loss": 2.1078, + "step": 5556 + }, + { + "epoch": 2.397239594565452, + "grad_norm": 0.20311668515205383, + "learning_rate": 1.467061080876848e-05, + "loss": 2.1917, + "step": 5557 + }, + { + "epoch": 2.3976709079146, + "grad_norm": 0.21538524329662323, + "learning_rate": 1.4650341663898107e-05, + "loss": 2.2499, + "step": 5558 + }, + { + "epoch": 2.398102221263748, + "grad_norm": 0.1952304244041443, + "learning_rate": 1.4630085015084467e-05, + "loss": 2.0227, + "step": 5559 + }, + { + "epoch": 2.398533534612896, + "grad_norm": 0.21069559454917908, + "learning_rate": 1.4609840866521901e-05, + "loss": 2.2244, + "step": 5560 + }, + { + "epoch": 2.3989648479620445, + "grad_norm": 0.2014789879322052, + "learning_rate": 1.4589609222402219e-05, + "loss": 2.1381, + "step": 5561 + }, + { + "epoch": 2.3993961613111927, + "grad_norm": 0.21519064903259277, + "learning_rate": 1.4569390086914605e-05, + "loss": 2.3563, + "step": 5562 + }, + { + "epoch": 2.3998274746603405, + "grad_norm": 0.1834069937467575, + "learning_rate": 1.4549183464245609e-05, + "loss": 1.9828, + "step": 5563 + }, + { + "epoch": 2.4002587880094888, + "grad_norm": 0.2046196162700653, + "learning_rate": 1.4528989358579237e-05, + "loss": 2.2115, + "step": 5564 + }, + { + "epoch": 2.400690101358637, + "grad_norm": 0.1972675323486328, + "learning_rate": 1.4508807774096939e-05, + "loss": 2.0984, + "step": 5565 + }, + { + "epoch": 2.4011214147077853, + "grad_norm": 0.19684778153896332, + "learning_rate": 1.4488638714977513e-05, + "loss": 2.2921, + "step": 5566 + }, + { + "epoch": 2.4015527280569335, + "grad_norm": 0.18879768252372742, + "learning_rate": 1.4468482185397184e-05, + "loss": 2.1441, + "step": 5567 + }, + { + "epoch": 2.4019840414060813, + "grad_norm": 0.22304823994636536, + "learning_rate": 1.4448338189529574e-05, + "loss": 2.0291, + "step": 5568 + }, + { + "epoch": 2.4024153547552296, + "grad_norm": 0.21063990890979767, + "learning_rate": 1.4428206731545727e-05, + "loss": 2.1963, + "step": 5569 + }, + { + "epoch": 2.402846668104378, + "grad_norm": 0.22029930353164673, + "learning_rate": 1.4408087815614077e-05, + "loss": 1.9112, + "step": 5570 + }, + { + "epoch": 2.403277981453526, + "grad_norm": 0.2134680449962616, + "learning_rate": 1.4387981445900468e-05, + "loss": 2.0876, + "step": 5571 + }, + { + "epoch": 2.403709294802674, + "grad_norm": 0.20396561920642853, + "learning_rate": 1.4367887626568141e-05, + "loss": 2.2803, + "step": 5572 + }, + { + "epoch": 2.404140608151822, + "grad_norm": 0.2179805487394333, + "learning_rate": 1.434780636177774e-05, + "loss": 2.1741, + "step": 5573 + }, + { + "epoch": 2.4045719215009704, + "grad_norm": 0.18951210379600525, + "learning_rate": 1.43277376556873e-05, + "loss": 2.1052, + "step": 5574 + }, + { + "epoch": 2.4050032348501187, + "grad_norm": 0.2312512844800949, + "learning_rate": 1.430768151245229e-05, + "loss": 2.1362, + "step": 5575 + }, + { + "epoch": 2.4050032348501187, + "eval_loss": 2.087423324584961, + "eval_runtime": 197.2855, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 5575 + }, + { + "epoch": 2.405434548199267, + "grad_norm": 0.1995340883731842, + "learning_rate": 1.4287637936225565e-05, + "loss": 2.1059, + "step": 5576 + }, + { + "epoch": 2.4058658615484148, + "grad_norm": 0.19406473636627197, + "learning_rate": 1.426760693115733e-05, + "loss": 2.1551, + "step": 5577 + }, + { + "epoch": 2.406297174897563, + "grad_norm": 0.22575275599956512, + "learning_rate": 1.4247588501395215e-05, + "loss": 2.0501, + "step": 5578 + }, + { + "epoch": 2.4067284882467113, + "grad_norm": 0.21095532178878784, + "learning_rate": 1.4227582651084299e-05, + "loss": 2.275, + "step": 5579 + }, + { + "epoch": 2.4071598015958595, + "grad_norm": 0.19748379290103912, + "learning_rate": 1.4207589384366991e-05, + "loss": 2.2486, + "step": 5580 + }, + { + "epoch": 2.4075911149450073, + "grad_norm": 0.237355038523674, + "learning_rate": 1.4187608705383114e-05, + "loss": 2.0254, + "step": 5581 + }, + { + "epoch": 2.4080224282941556, + "grad_norm": 0.19868335127830505, + "learning_rate": 1.4167640618269891e-05, + "loss": 2.1226, + "step": 5582 + }, + { + "epoch": 2.408453741643304, + "grad_norm": 0.1926330029964447, + "learning_rate": 1.4147685127161923e-05, + "loss": 2.0953, + "step": 5583 + }, + { + "epoch": 2.408885054992452, + "grad_norm": 0.20621483027935028, + "learning_rate": 1.4127742236191223e-05, + "loss": 2.2536, + "step": 5584 + }, + { + "epoch": 2.4093163683416003, + "grad_norm": 0.21812713146209717, + "learning_rate": 1.4107811949487185e-05, + "loss": 2.2392, + "step": 5585 + }, + { + "epoch": 2.409747681690748, + "grad_norm": 0.20326219499111176, + "learning_rate": 1.4087894271176586e-05, + "loss": 2.1634, + "step": 5586 + }, + { + "epoch": 2.4101789950398964, + "grad_norm": 0.19478273391723633, + "learning_rate": 1.4067989205383592e-05, + "loss": 2.1454, + "step": 5587 + }, + { + "epoch": 2.4106103083890447, + "grad_norm": 0.19656997919082642, + "learning_rate": 1.4048096756229766e-05, + "loss": 2.1551, + "step": 5588 + }, + { + "epoch": 2.411041621738193, + "grad_norm": 0.1959657073020935, + "learning_rate": 1.4028216927834084e-05, + "loss": 2.1672, + "step": 5589 + }, + { + "epoch": 2.4114729350873407, + "grad_norm": 0.1876540184020996, + "learning_rate": 1.4008349724312876e-05, + "loss": 1.9919, + "step": 5590 + }, + { + "epoch": 2.411904248436489, + "grad_norm": 0.1929643601179123, + "learning_rate": 1.3988495149779838e-05, + "loss": 2.2164, + "step": 5591 + }, + { + "epoch": 2.4123355617856372, + "grad_norm": 0.19551007449626923, + "learning_rate": 1.3968653208346067e-05, + "loss": 2.1125, + "step": 5592 + }, + { + "epoch": 2.4127668751347855, + "grad_norm": 0.2112320214509964, + "learning_rate": 1.3948823904120102e-05, + "loss": 2.1652, + "step": 5593 + }, + { + "epoch": 2.4131981884839337, + "grad_norm": 0.22940433025360107, + "learning_rate": 1.3929007241207783e-05, + "loss": 2.4004, + "step": 5594 + }, + { + "epoch": 2.4136295018330816, + "grad_norm": 0.2000400274991989, + "learning_rate": 1.3909203223712401e-05, + "loss": 2.2189, + "step": 5595 + }, + { + "epoch": 2.41406081518223, + "grad_norm": 0.18464751541614532, + "learning_rate": 1.3889411855734528e-05, + "loss": 2.1399, + "step": 5596 + }, + { + "epoch": 2.414492128531378, + "grad_norm": 0.20219942927360535, + "learning_rate": 1.3869633141372244e-05, + "loss": 2.0755, + "step": 5597 + }, + { + "epoch": 2.4149234418805263, + "grad_norm": 0.1929488629102707, + "learning_rate": 1.384986708472093e-05, + "loss": 2.342, + "step": 5598 + }, + { + "epoch": 2.415354755229674, + "grad_norm": 0.21046137809753418, + "learning_rate": 1.3830113689873357e-05, + "loss": 2.1805, + "step": 5599 + }, + { + "epoch": 2.4157860685788224, + "grad_norm": 0.18207894265651703, + "learning_rate": 1.38103729609197e-05, + "loss": 2.1979, + "step": 5600 + }, + { + "epoch": 2.4157860685788224, + "eval_loss": 2.0876383781433105, + "eval_runtime": 194.9901, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 5600 + }, + { + "epoch": 2.4162173819279706, + "grad_norm": 0.18475405871868134, + "learning_rate": 1.3790644901947464e-05, + "loss": 2.2139, + "step": 5601 + }, + { + "epoch": 2.416648695277119, + "grad_norm": 0.1960173100233078, + "learning_rate": 1.3770929517041584e-05, + "loss": 2.1189, + "step": 5602 + }, + { + "epoch": 2.417080008626267, + "grad_norm": 0.1998952180147171, + "learning_rate": 1.3751226810284336e-05, + "loss": 2.0716, + "step": 5603 + }, + { + "epoch": 2.417511321975415, + "grad_norm": 0.1847987174987793, + "learning_rate": 1.3731536785755388e-05, + "loss": 2.0077, + "step": 5604 + }, + { + "epoch": 2.417942635324563, + "grad_norm": 0.19390426576137543, + "learning_rate": 1.3711859447531761e-05, + "loss": 2.2725, + "step": 5605 + }, + { + "epoch": 2.4183739486737115, + "grad_norm": 0.18967781960964203, + "learning_rate": 1.369219479968785e-05, + "loss": 2.0619, + "step": 5606 + }, + { + "epoch": 2.4188052620228597, + "grad_norm": 0.19128675758838654, + "learning_rate": 1.3672542846295486e-05, + "loss": 2.194, + "step": 5607 + }, + { + "epoch": 2.4192365753720075, + "grad_norm": 0.18843933939933777, + "learning_rate": 1.36529035914238e-05, + "loss": 1.9821, + "step": 5608 + }, + { + "epoch": 2.419667888721156, + "grad_norm": 0.19080249965190887, + "learning_rate": 1.3633277039139296e-05, + "loss": 2.0477, + "step": 5609 + }, + { + "epoch": 2.420099202070304, + "grad_norm": 0.2069847136735916, + "learning_rate": 1.3613663193505848e-05, + "loss": 2.2744, + "step": 5610 + }, + { + "epoch": 2.4205305154194523, + "grad_norm": 0.19362397491931915, + "learning_rate": 1.3594062058584773e-05, + "loss": 2.1301, + "step": 5611 + }, + { + "epoch": 2.4209618287686006, + "grad_norm": 0.19271761178970337, + "learning_rate": 1.3574473638434668e-05, + "loss": 2.2115, + "step": 5612 + }, + { + "epoch": 2.4213931421177484, + "grad_norm": 0.19851896166801453, + "learning_rate": 1.3554897937111531e-05, + "loss": 2.1409, + "step": 5613 + }, + { + "epoch": 2.4218244554668966, + "grad_norm": 0.19920331239700317, + "learning_rate": 1.3535334958668734e-05, + "loss": 2.1274, + "step": 5614 + }, + { + "epoch": 2.422255768816045, + "grad_norm": 0.19172550737857819, + "learning_rate": 1.3515784707157004e-05, + "loss": 2.1971, + "step": 5615 + }, + { + "epoch": 2.422687082165193, + "grad_norm": 0.1874234974384308, + "learning_rate": 1.3496247186624421e-05, + "loss": 2.076, + "step": 5616 + }, + { + "epoch": 2.423118395514341, + "grad_norm": 0.1872202306985855, + "learning_rate": 1.3476722401116455e-05, + "loss": 2.0465, + "step": 5617 + }, + { + "epoch": 2.423549708863489, + "grad_norm": 0.20311930775642395, + "learning_rate": 1.3457210354675924e-05, + "loss": 2.2531, + "step": 5618 + }, + { + "epoch": 2.4239810222126374, + "grad_norm": 0.18927481770515442, + "learning_rate": 1.343771105134301e-05, + "loss": 2.1877, + "step": 5619 + }, + { + "epoch": 2.4244123355617857, + "grad_norm": 0.20810562372207642, + "learning_rate": 1.341822449515524e-05, + "loss": 2.1992, + "step": 5620 + }, + { + "epoch": 2.424843648910934, + "grad_norm": 0.23007936775684357, + "learning_rate": 1.3398750690147544e-05, + "loss": 1.6996, + "step": 5621 + }, + { + "epoch": 2.4252749622600818, + "grad_norm": 0.20400209724903107, + "learning_rate": 1.3379289640352203e-05, + "loss": 1.9667, + "step": 5622 + }, + { + "epoch": 2.42570627560923, + "grad_norm": 0.194600448012352, + "learning_rate": 1.33598413497988e-05, + "loss": 2.0099, + "step": 5623 + }, + { + "epoch": 2.4261375889583783, + "grad_norm": 0.20893391966819763, + "learning_rate": 1.3340405822514305e-05, + "loss": 2.2339, + "step": 5624 + }, + { + "epoch": 2.4265689023075265, + "grad_norm": 0.1931038498878479, + "learning_rate": 1.3320983062523109e-05, + "loss": 2.1979, + "step": 5625 + }, + { + "epoch": 2.4265689023075265, + "eval_loss": 2.0875062942504883, + "eval_runtime": 194.7639, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 5625 + }, + { + "epoch": 2.4270002156566743, + "grad_norm": 0.216264545917511, + "learning_rate": 1.3301573073846874e-05, + "loss": 2.0019, + "step": 5626 + }, + { + "epoch": 2.4274315290058226, + "grad_norm": 0.21702730655670166, + "learning_rate": 1.3282175860504657e-05, + "loss": 1.9417, + "step": 5627 + }, + { + "epoch": 2.427862842354971, + "grad_norm": 0.19851179420948029, + "learning_rate": 1.3262791426512868e-05, + "loss": 2.2475, + "step": 5628 + }, + { + "epoch": 2.428294155704119, + "grad_norm": 0.19763562083244324, + "learning_rate": 1.3243419775885252e-05, + "loss": 2.1318, + "step": 5629 + }, + { + "epoch": 2.4287254690532674, + "grad_norm": 0.20032916963100433, + "learning_rate": 1.3224060912632938e-05, + "loss": 2.3122, + "step": 5630 + }, + { + "epoch": 2.429156782402415, + "grad_norm": 0.20838148891925812, + "learning_rate": 1.3204714840764377e-05, + "loss": 2.0032, + "step": 5631 + }, + { + "epoch": 2.4295880957515634, + "grad_norm": 0.1888265162706375, + "learning_rate": 1.3185381564285396e-05, + "loss": 2.052, + "step": 5632 + }, + { + "epoch": 2.4300194091007117, + "grad_norm": 0.18382272124290466, + "learning_rate": 1.3166061087199153e-05, + "loss": 2.1631, + "step": 5633 + }, + { + "epoch": 2.43045072244986, + "grad_norm": 0.19664019346237183, + "learning_rate": 1.3146753413506147e-05, + "loss": 2.1096, + "step": 5634 + }, + { + "epoch": 2.4308820357990077, + "grad_norm": 0.21699444949626923, + "learning_rate": 1.3127458547204281e-05, + "loss": 2.0513, + "step": 5635 + }, + { + "epoch": 2.431313349148156, + "grad_norm": 0.22892074286937714, + "learning_rate": 1.3108176492288766e-05, + "loss": 1.7689, + "step": 5636 + }, + { + "epoch": 2.4317446624973043, + "grad_norm": 0.19827568531036377, + "learning_rate": 1.3088907252752126e-05, + "loss": 2.2414, + "step": 5637 + }, + { + "epoch": 2.4321759758464525, + "grad_norm": 0.19449590146541595, + "learning_rate": 1.3069650832584269e-05, + "loss": 2.0536, + "step": 5638 + }, + { + "epoch": 2.4326072891956008, + "grad_norm": 0.2000982165336609, + "learning_rate": 1.3050407235772489e-05, + "loss": 2.1, + "step": 5639 + }, + { + "epoch": 2.4330386025447486, + "grad_norm": 0.26171600818634033, + "learning_rate": 1.3031176466301357e-05, + "loss": 2.1844, + "step": 5640 + }, + { + "epoch": 2.433469915893897, + "grad_norm": 0.19738814234733582, + "learning_rate": 1.3011958528152839e-05, + "loss": 2.2074, + "step": 5641 + }, + { + "epoch": 2.433901229243045, + "grad_norm": 0.19321668148040771, + "learning_rate": 1.2992753425306158e-05, + "loss": 2.2208, + "step": 5642 + }, + { + "epoch": 2.4343325425921933, + "grad_norm": 0.2076299637556076, + "learning_rate": 1.2973561161738e-05, + "loss": 2.1897, + "step": 5643 + }, + { + "epoch": 2.434763855941341, + "grad_norm": 0.20294933021068573, + "learning_rate": 1.2954381741422312e-05, + "loss": 2.2548, + "step": 5644 + }, + { + "epoch": 2.4351951692904894, + "grad_norm": 0.1980472356081009, + "learning_rate": 1.2935215168330409e-05, + "loss": 2.2052, + "step": 5645 + }, + { + "epoch": 2.4356264826396377, + "grad_norm": 0.20628444850444794, + "learning_rate": 1.2916061446430925e-05, + "loss": 2.2023, + "step": 5646 + }, + { + "epoch": 2.436057795988786, + "grad_norm": 0.1949494630098343, + "learning_rate": 1.2896920579689862e-05, + "loss": 2.0623, + "step": 5647 + }, + { + "epoch": 2.436489109337934, + "grad_norm": 0.2031056135892868, + "learning_rate": 1.2877792572070542e-05, + "loss": 2.1009, + "step": 5648 + }, + { + "epoch": 2.436920422687082, + "grad_norm": 0.20041726529598236, + "learning_rate": 1.285867742753362e-05, + "loss": 2.0863, + "step": 5649 + }, + { + "epoch": 2.4373517360362302, + "grad_norm": 0.19893936812877655, + "learning_rate": 1.2839575150037116e-05, + "loss": 2.0612, + "step": 5650 + }, + { + "epoch": 2.4373517360362302, + "eval_loss": 2.0876355171203613, + "eval_runtime": 195.172, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 5650 + }, + { + "epoch": 2.4377830493853785, + "grad_norm": 0.20607835054397583, + "learning_rate": 1.2820485743536339e-05, + "loss": 2.1069, + "step": 5651 + }, + { + "epoch": 2.4382143627345267, + "grad_norm": 0.19930028915405273, + "learning_rate": 1.280140921198396e-05, + "loss": 2.1847, + "step": 5652 + }, + { + "epoch": 2.4386456760836746, + "grad_norm": 0.17793206870555878, + "learning_rate": 1.2782345559330012e-05, + "loss": 2.171, + "step": 5653 + }, + { + "epoch": 2.439076989432823, + "grad_norm": 0.17830494046211243, + "learning_rate": 1.2763294789521838e-05, + "loss": 2.0727, + "step": 5654 + }, + { + "epoch": 2.439508302781971, + "grad_norm": 0.20324087142944336, + "learning_rate": 1.2744256906504066e-05, + "loss": 2.311, + "step": 5655 + }, + { + "epoch": 2.4399396161311193, + "grad_norm": 0.2175140231847763, + "learning_rate": 1.27252319142187e-05, + "loss": 2.1746, + "step": 5656 + }, + { + "epoch": 2.4403709294802676, + "grad_norm": 0.2338789403438568, + "learning_rate": 1.2706219816605116e-05, + "loss": 2.2366, + "step": 5657 + }, + { + "epoch": 2.4408022428294154, + "grad_norm": 0.19201385974884033, + "learning_rate": 1.2687220617599944e-05, + "loss": 1.9177, + "step": 5658 + }, + { + "epoch": 2.4412335561785636, + "grad_norm": 0.22386674582958221, + "learning_rate": 1.2668234321137192e-05, + "loss": 2.2145, + "step": 5659 + }, + { + "epoch": 2.441664869527712, + "grad_norm": 0.2086491733789444, + "learning_rate": 1.2649260931148169e-05, + "loss": 2.1575, + "step": 5660 + }, + { + "epoch": 2.44209618287686, + "grad_norm": 0.18864378333091736, + "learning_rate": 1.2630300451561521e-05, + "loss": 2.0536, + "step": 5661 + }, + { + "epoch": 2.442527496226008, + "grad_norm": 0.21808907389640808, + "learning_rate": 1.2611352886303236e-05, + "loss": 2.2387, + "step": 5662 + }, + { + "epoch": 2.442958809575156, + "grad_norm": 0.2047359198331833, + "learning_rate": 1.2592418239296604e-05, + "loss": 2.1705, + "step": 5663 + }, + { + "epoch": 2.4433901229243045, + "grad_norm": 0.18208691477775574, + "learning_rate": 1.2573496514462254e-05, + "loss": 1.9052, + "step": 5664 + }, + { + "epoch": 2.4438214362734527, + "grad_norm": 0.1993609517812729, + "learning_rate": 1.2554587715718125e-05, + "loss": 2.3345, + "step": 5665 + }, + { + "epoch": 2.444252749622601, + "grad_norm": 0.20080235600471497, + "learning_rate": 1.2535691846979497e-05, + "loss": 2.0826, + "step": 5666 + }, + { + "epoch": 2.444684062971749, + "grad_norm": 0.20008035004138947, + "learning_rate": 1.2516808912158982e-05, + "loss": 2.1286, + "step": 5667 + }, + { + "epoch": 2.445115376320897, + "grad_norm": 0.2622080445289612, + "learning_rate": 1.2497938915166509e-05, + "loss": 2.1557, + "step": 5668 + }, + { + "epoch": 2.4455466896700453, + "grad_norm": 0.1986854374408722, + "learning_rate": 1.2479081859909268e-05, + "loss": 2.2945, + "step": 5669 + }, + { + "epoch": 2.4459780030191935, + "grad_norm": 0.32015666365623474, + "learning_rate": 1.246023775029183e-05, + "loss": 2.0321, + "step": 5670 + }, + { + "epoch": 2.4464093163683414, + "grad_norm": 0.20417790114879608, + "learning_rate": 1.2441406590216112e-05, + "loss": 2.1611, + "step": 5671 + }, + { + "epoch": 2.4468406297174896, + "grad_norm": 0.1966090053319931, + "learning_rate": 1.2422588383581283e-05, + "loss": 1.9855, + "step": 5672 + }, + { + "epoch": 2.447271943066638, + "grad_norm": 0.20060087740421295, + "learning_rate": 1.2403783134283874e-05, + "loss": 2.2288, + "step": 5673 + }, + { + "epoch": 2.447703256415786, + "grad_norm": 0.1950266808271408, + "learning_rate": 1.2384990846217693e-05, + "loss": 2.2091, + "step": 5674 + }, + { + "epoch": 2.4481345697649344, + "grad_norm": 0.18725168704986572, + "learning_rate": 1.2366211523273914e-05, + "loss": 2.0898, + "step": 5675 + }, + { + "epoch": 2.4481345697649344, + "eval_loss": 2.08742094039917, + "eval_runtime": 194.5775, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 5675 + }, + { + "epoch": 2.448565883114082, + "grad_norm": 0.19744376838207245, + "learning_rate": 1.2347445169340991e-05, + "loss": 2.3429, + "step": 5676 + }, + { + "epoch": 2.4489971964632304, + "grad_norm": 0.19756796956062317, + "learning_rate": 1.23286917883047e-05, + "loss": 2.0573, + "step": 5677 + }, + { + "epoch": 2.4494285098123787, + "grad_norm": 0.1958743929862976, + "learning_rate": 1.2309951384048137e-05, + "loss": 2.1837, + "step": 5678 + }, + { + "epoch": 2.449859823161527, + "grad_norm": 0.191923588514328, + "learning_rate": 1.2291223960451702e-05, + "loss": 2.2266, + "step": 5679 + }, + { + "epoch": 2.4502911365106748, + "grad_norm": 0.20301900804042816, + "learning_rate": 1.2272509521393104e-05, + "loss": 1.9793, + "step": 5680 + }, + { + "epoch": 2.450722449859823, + "grad_norm": 0.20013652741909027, + "learning_rate": 1.2253808070747417e-05, + "loss": 2.1545, + "step": 5681 + }, + { + "epoch": 2.4511537632089713, + "grad_norm": 0.20575332641601562, + "learning_rate": 1.2235119612386938e-05, + "loss": 2.2977, + "step": 5682 + }, + { + "epoch": 2.4515850765581195, + "grad_norm": 0.18771253526210785, + "learning_rate": 1.2216444150181326e-05, + "loss": 1.9519, + "step": 5683 + }, + { + "epoch": 2.452016389907268, + "grad_norm": 0.20944516360759735, + "learning_rate": 1.2197781687997524e-05, + "loss": 2.1642, + "step": 5684 + }, + { + "epoch": 2.452447703256416, + "grad_norm": 0.18974487483501434, + "learning_rate": 1.217913222969983e-05, + "loss": 2.1065, + "step": 5685 + }, + { + "epoch": 2.452879016605564, + "grad_norm": 0.19545859098434448, + "learning_rate": 1.2160495779149834e-05, + "loss": 2.1656, + "step": 5686 + }, + { + "epoch": 2.453310329954712, + "grad_norm": 0.19517193734645844, + "learning_rate": 1.2141872340206363e-05, + "loss": 2.0571, + "step": 5687 + }, + { + "epoch": 2.4537416433038604, + "grad_norm": 0.19417819380760193, + "learning_rate": 1.2123261916725606e-05, + "loss": 2.0085, + "step": 5688 + }, + { + "epoch": 2.454172956653008, + "grad_norm": 0.18139328062534332, + "learning_rate": 1.2104664512561101e-05, + "loss": 1.9799, + "step": 5689 + }, + { + "epoch": 2.4546042700021564, + "grad_norm": 0.19426609575748444, + "learning_rate": 1.2086080131563625e-05, + "loss": 2.116, + "step": 5690 + }, + { + "epoch": 2.4550355833513047, + "grad_norm": 0.182246133685112, + "learning_rate": 1.2067508777581265e-05, + "loss": 2.2792, + "step": 5691 + }, + { + "epoch": 2.455466896700453, + "grad_norm": 0.19076038897037506, + "learning_rate": 1.2048950454459435e-05, + "loss": 2.0988, + "step": 5692 + }, + { + "epoch": 2.455898210049601, + "grad_norm": 0.1936006098985672, + "learning_rate": 1.2030405166040832e-05, + "loss": 1.9843, + "step": 5693 + }, + { + "epoch": 2.4563295233987494, + "grad_norm": 0.19579117000102997, + "learning_rate": 1.2011872916165465e-05, + "loss": 2.1017, + "step": 5694 + }, + { + "epoch": 2.4567608367478972, + "grad_norm": 0.20319832861423492, + "learning_rate": 1.1993353708670641e-05, + "loss": 2.1175, + "step": 5695 + }, + { + "epoch": 2.4571921500970455, + "grad_norm": 0.20473158359527588, + "learning_rate": 1.1974847547390959e-05, + "loss": 2.0309, + "step": 5696 + }, + { + "epoch": 2.4576234634461938, + "grad_norm": 0.20871326327323914, + "learning_rate": 1.1956354436158319e-05, + "loss": 2.1344, + "step": 5697 + }, + { + "epoch": 2.4580547767953416, + "grad_norm": 0.18608273565769196, + "learning_rate": 1.193787437880191e-05, + "loss": 2.1024, + "step": 5698 + }, + { + "epoch": 2.45848609014449, + "grad_norm": 0.19599929451942444, + "learning_rate": 1.1919407379148268e-05, + "loss": 2.111, + "step": 5699 + }, + { + "epoch": 2.458917403493638, + "grad_norm": 0.1737334430217743, + "learning_rate": 1.190095344102118e-05, + "loss": 2.0196, + "step": 5700 + }, + { + "epoch": 2.458917403493638, + "eval_loss": 2.087368965148926, + "eval_runtime": 194.2931, + "eval_samples_per_second": 0.165, + "eval_steps_per_second": 0.165, + "step": 5700 + }, + { + "epoch": 2.4593487168427863, + "grad_norm": 0.16876773536205292, + "learning_rate": 1.1882512568241703e-05, + "loss": 1.9346, + "step": 5701 + }, + { + "epoch": 2.4597800301919346, + "grad_norm": 0.19252726435661316, + "learning_rate": 1.1864084764628226e-05, + "loss": 2.1173, + "step": 5702 + }, + { + "epoch": 2.460211343541083, + "grad_norm": 0.20373648405075073, + "learning_rate": 1.1845670033996454e-05, + "loss": 2.195, + "step": 5703 + }, + { + "epoch": 2.4606426568902307, + "grad_norm": 0.2033834308385849, + "learning_rate": 1.1827268380159354e-05, + "loss": 2.0791, + "step": 5704 + }, + { + "epoch": 2.461073970239379, + "grad_norm": 0.1925143599510193, + "learning_rate": 1.1808879806927171e-05, + "loss": 1.907, + "step": 5705 + }, + { + "epoch": 2.461505283588527, + "grad_norm": 0.18729040026664734, + "learning_rate": 1.1790504318107475e-05, + "loss": 2.0964, + "step": 5706 + }, + { + "epoch": 2.4619365969376754, + "grad_norm": 0.20501752197742462, + "learning_rate": 1.1772141917505112e-05, + "loss": 2.1624, + "step": 5707 + }, + { + "epoch": 2.4623679102868232, + "grad_norm": 0.27427372336387634, + "learning_rate": 1.1753792608922202e-05, + "loss": 2.1858, + "step": 5708 + }, + { + "epoch": 2.4627992236359715, + "grad_norm": 0.1857091635465622, + "learning_rate": 1.1735456396158188e-05, + "loss": 2.1267, + "step": 5709 + }, + { + "epoch": 2.4632305369851197, + "grad_norm": 0.1786196380853653, + "learning_rate": 1.1717133283009783e-05, + "loss": 1.9905, + "step": 5710 + }, + { + "epoch": 2.463661850334268, + "grad_norm": 2.388453722000122, + "learning_rate": 1.1698823273270969e-05, + "loss": 2.2965, + "step": 5711 + }, + { + "epoch": 2.4640931636834162, + "grad_norm": 0.26211005449295044, + "learning_rate": 1.1680526370733037e-05, + "loss": 2.1418, + "step": 5712 + }, + { + "epoch": 2.464524477032564, + "grad_norm": 0.21328003704547882, + "learning_rate": 1.1662242579184583e-05, + "loss": 2.1827, + "step": 5713 + }, + { + "epoch": 2.4649557903817123, + "grad_norm": 0.19279971718788147, + "learning_rate": 1.1643971902411478e-05, + "loss": 2.079, + "step": 5714 + }, + { + "epoch": 2.4653871037308606, + "grad_norm": 0.19928252696990967, + "learning_rate": 1.1625714344196813e-05, + "loss": 1.957, + "step": 5715 + }, + { + "epoch": 2.465818417080009, + "grad_norm": 0.1971615105867386, + "learning_rate": 1.160746990832103e-05, + "loss": 2.0119, + "step": 5716 + }, + { + "epoch": 2.4662497304291566, + "grad_norm": 0.21007204055786133, + "learning_rate": 1.1589238598561873e-05, + "loss": 1.7142, + "step": 5717 + }, + { + "epoch": 2.466681043778305, + "grad_norm": 0.1840398609638214, + "learning_rate": 1.1571020418694308e-05, + "loss": 1.8995, + "step": 5718 + }, + { + "epoch": 2.467112357127453, + "grad_norm": 0.19897425174713135, + "learning_rate": 1.1552815372490606e-05, + "loss": 2.1401, + "step": 5719 + }, + { + "epoch": 2.4675436704766014, + "grad_norm": 0.18006531894207, + "learning_rate": 1.1534623463720338e-05, + "loss": 1.9013, + "step": 5720 + }, + { + "epoch": 2.4679749838257496, + "grad_norm": 0.21226273477077484, + "learning_rate": 1.1516444696150324e-05, + "loss": 1.9415, + "step": 5721 + }, + { + "epoch": 2.4684062971748975, + "grad_norm": 0.21781636774539948, + "learning_rate": 1.1498279073544683e-05, + "loss": 2.1623, + "step": 5722 + }, + { + "epoch": 2.4688376105240457, + "grad_norm": 0.19267801940441132, + "learning_rate": 1.1480126599664801e-05, + "loss": 2.1478, + "step": 5723 + }, + { + "epoch": 2.469268923873194, + "grad_norm": 0.1948312222957611, + "learning_rate": 1.1461987278269347e-05, + "loss": 2.1864, + "step": 5724 + }, + { + "epoch": 2.469700237222342, + "grad_norm": 0.20160384476184845, + "learning_rate": 1.1443861113114264e-05, + "loss": 2.0665, + "step": 5725 + }, + { + "epoch": 2.469700237222342, + "eval_loss": 2.0871713161468506, + "eval_runtime": 194.8679, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 5725 + }, + { + "epoch": 2.47013155057149, + "grad_norm": 0.20347172021865845, + "learning_rate": 1.142574810795276e-05, + "loss": 2.4612, + "step": 5726 + }, + { + "epoch": 2.4705628639206383, + "grad_norm": 0.2064550369977951, + "learning_rate": 1.1407648266535382e-05, + "loss": 2.165, + "step": 5727 + }, + { + "epoch": 2.4709941772697865, + "grad_norm": 0.20214228332042694, + "learning_rate": 1.1389561592609852e-05, + "loss": 1.9555, + "step": 5728 + }, + { + "epoch": 2.471425490618935, + "grad_norm": 0.2005394995212555, + "learning_rate": 1.1371488089921224e-05, + "loss": 2.1927, + "step": 5729 + }, + { + "epoch": 2.471856803968083, + "grad_norm": 0.20218129456043243, + "learning_rate": 1.1353427762211794e-05, + "loss": 2.0424, + "step": 5730 + }, + { + "epoch": 2.472288117317231, + "grad_norm": 0.19005270302295685, + "learning_rate": 1.1335380613221198e-05, + "loss": 2.1325, + "step": 5731 + }, + { + "epoch": 2.472719430666379, + "grad_norm": 0.19264422357082367, + "learning_rate": 1.1317346646686279e-05, + "loss": 2.1046, + "step": 5732 + }, + { + "epoch": 2.4731507440155274, + "grad_norm": 0.2074297070503235, + "learning_rate": 1.129932586634115e-05, + "loss": 2.0958, + "step": 5733 + }, + { + "epoch": 2.4735820573646756, + "grad_norm": 0.18325988948345184, + "learning_rate": 1.1281318275917192e-05, + "loss": 1.9128, + "step": 5734 + }, + { + "epoch": 2.4740133707138234, + "grad_norm": 0.21392813324928284, + "learning_rate": 1.1263323879143123e-05, + "loss": 1.9925, + "step": 5735 + }, + { + "epoch": 2.4744446840629717, + "grad_norm": 0.18119731545448303, + "learning_rate": 1.1245342679744866e-05, + "loss": 2.2641, + "step": 5736 + }, + { + "epoch": 2.47487599741212, + "grad_norm": 0.2063588947057724, + "learning_rate": 1.1227374681445603e-05, + "loss": 2.1409, + "step": 5737 + }, + { + "epoch": 2.475307310761268, + "grad_norm": 0.20988424122333527, + "learning_rate": 1.1209419887965828e-05, + "loss": 2.1957, + "step": 5738 + }, + { + "epoch": 2.4757386241104165, + "grad_norm": 0.1982715129852295, + "learning_rate": 1.119147830302326e-05, + "loss": 2.2277, + "step": 5739 + }, + { + "epoch": 2.4761699374595643, + "grad_norm": 0.19500423967838287, + "learning_rate": 1.117354993033291e-05, + "loss": 2.0229, + "step": 5740 + }, + { + "epoch": 2.4766012508087125, + "grad_norm": 0.21447238326072693, + "learning_rate": 1.1155634773607039e-05, + "loss": 2.1768, + "step": 5741 + }, + { + "epoch": 2.4770325641578608, + "grad_norm": 0.19308245182037354, + "learning_rate": 1.1137732836555183e-05, + "loss": 2.0162, + "step": 5742 + }, + { + "epoch": 2.477463877507009, + "grad_norm": 0.21386805176734924, + "learning_rate": 1.1119844122884125e-05, + "loss": 1.9618, + "step": 5743 + }, + { + "epoch": 2.477895190856157, + "grad_norm": 0.17448431253433228, + "learning_rate": 1.1101968636297912e-05, + "loss": 1.9715, + "step": 5744 + }, + { + "epoch": 2.478326504205305, + "grad_norm": 0.21260018646717072, + "learning_rate": 1.1084106380497882e-05, + "loss": 2.2935, + "step": 5745 + }, + { + "epoch": 2.4787578175544533, + "grad_norm": 0.2187669724225998, + "learning_rate": 1.1066257359182609e-05, + "loss": 1.9428, + "step": 5746 + }, + { + "epoch": 2.4791891309036016, + "grad_norm": 0.19976875185966492, + "learning_rate": 1.1048421576047914e-05, + "loss": 1.994, + "step": 5747 + }, + { + "epoch": 2.47962044425275, + "grad_norm": 0.21492761373519897, + "learning_rate": 1.1030599034786867e-05, + "loss": 2.0978, + "step": 5748 + }, + { + "epoch": 2.4800517576018977, + "grad_norm": 0.23321105539798737, + "learning_rate": 1.1012789739089864e-05, + "loss": 2.105, + "step": 5749 + }, + { + "epoch": 2.480483070951046, + "grad_norm": 0.2062074840068817, + "learning_rate": 1.0994993692644508e-05, + "loss": 2.1688, + "step": 5750 + }, + { + "epoch": 2.480483070951046, + "eval_loss": 2.0872256755828857, + "eval_runtime": 194.9368, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 5750 + }, + { + "epoch": 2.480914384300194, + "grad_norm": 0.2005544900894165, + "learning_rate": 1.097721089913565e-05, + "loss": 2.1628, + "step": 5751 + }, + { + "epoch": 2.4813456976493424, + "grad_norm": 0.21592171490192413, + "learning_rate": 1.0959441362245411e-05, + "loss": 2.1939, + "step": 5752 + }, + { + "epoch": 2.4817770109984902, + "grad_norm": 0.19873322546482086, + "learning_rate": 1.0941685085653182e-05, + "loss": 2.1372, + "step": 5753 + }, + { + "epoch": 2.4822083243476385, + "grad_norm": 0.20747192203998566, + "learning_rate": 1.0923942073035579e-05, + "loss": 2.5073, + "step": 5754 + }, + { + "epoch": 2.4826396376967867, + "grad_norm": 0.19865496456623077, + "learning_rate": 1.09062123280665e-05, + "loss": 2.0926, + "step": 5755 + }, + { + "epoch": 2.483070951045935, + "grad_norm": 0.1956738829612732, + "learning_rate": 1.0888495854417071e-05, + "loss": 2.0335, + "step": 5756 + }, + { + "epoch": 2.4835022643950833, + "grad_norm": 0.2046099156141281, + "learning_rate": 1.0870792655755683e-05, + "loss": 2.4057, + "step": 5757 + }, + { + "epoch": 2.483933577744231, + "grad_norm": 0.1916872262954712, + "learning_rate": 1.0853102735747973e-05, + "loss": 2.1253, + "step": 5758 + }, + { + "epoch": 2.4843648910933793, + "grad_norm": 0.2286677360534668, + "learning_rate": 1.0835426098056851e-05, + "loss": 2.2194, + "step": 5759 + }, + { + "epoch": 2.4847962044425276, + "grad_norm": 0.23386746644973755, + "learning_rate": 1.0817762746342473e-05, + "loss": 2.3312, + "step": 5760 + }, + { + "epoch": 2.485227517791676, + "grad_norm": 0.1910087913274765, + "learning_rate": 1.080011268426218e-05, + "loss": 2.1249, + "step": 5761 + }, + { + "epoch": 2.4856588311408236, + "grad_norm": 0.18502314388751984, + "learning_rate": 1.0782475915470615e-05, + "loss": 2.1537, + "step": 5762 + }, + { + "epoch": 2.486090144489972, + "grad_norm": 0.19793559610843658, + "learning_rate": 1.0764852443619697e-05, + "loss": 2.0208, + "step": 5763 + }, + { + "epoch": 2.48652145783912, + "grad_norm": 0.21486476063728333, + "learning_rate": 1.0747242272358553e-05, + "loss": 2.2666, + "step": 5764 + }, + { + "epoch": 2.4869527711882684, + "grad_norm": 0.211890310049057, + "learning_rate": 1.0729645405333545e-05, + "loss": 1.9514, + "step": 5765 + }, + { + "epoch": 2.4873840845374167, + "grad_norm": 0.19332849979400635, + "learning_rate": 1.071206184618831e-05, + "loss": 2.2484, + "step": 5766 + }, + { + "epoch": 2.4878153978865645, + "grad_norm": 0.20185452699661255, + "learning_rate": 1.0694491598563712e-05, + "loss": 2.0449, + "step": 5767 + }, + { + "epoch": 2.4882467112357127, + "grad_norm": 0.1992376446723938, + "learning_rate": 1.0676934666097856e-05, + "loss": 2.0498, + "step": 5768 + }, + { + "epoch": 2.488678024584861, + "grad_norm": 0.21259760856628418, + "learning_rate": 1.0659391052426109e-05, + "loss": 2.1275, + "step": 5769 + }, + { + "epoch": 2.4891093379340092, + "grad_norm": 0.20208117365837097, + "learning_rate": 1.0641860761181057e-05, + "loss": 2.0495, + "step": 5770 + }, + { + "epoch": 2.489540651283157, + "grad_norm": 0.20526215434074402, + "learning_rate": 1.0624343795992546e-05, + "loss": 2.1591, + "step": 5771 + }, + { + "epoch": 2.4899719646323053, + "grad_norm": 0.20042036473751068, + "learning_rate": 1.0606840160487633e-05, + "loss": 2.0136, + "step": 5772 + }, + { + "epoch": 2.4904032779814536, + "grad_norm": 0.20826835930347443, + "learning_rate": 1.0589349858290689e-05, + "loss": 2.3665, + "step": 5773 + }, + { + "epoch": 2.490834591330602, + "grad_norm": 0.20930518209934235, + "learning_rate": 1.0571872893023228e-05, + "loss": 2.1048, + "step": 5774 + }, + { + "epoch": 2.49126590467975, + "grad_norm": 0.21557918190956116, + "learning_rate": 1.055440926830405e-05, + "loss": 2.1528, + "step": 5775 + }, + { + "epoch": 2.49126590467975, + "eval_loss": 2.087198495864868, + "eval_runtime": 208.443, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 5775 + }, + { + "epoch": 2.491697218028898, + "grad_norm": 0.19226939976215363, + "learning_rate": 1.0536958987749188e-05, + "loss": 2.0221, + "step": 5776 + }, + { + "epoch": 2.492128531378046, + "grad_norm": 0.1970117837190628, + "learning_rate": 1.0519522054971944e-05, + "loss": 2.0414, + "step": 5777 + }, + { + "epoch": 2.4925598447271944, + "grad_norm": 0.18709838390350342, + "learning_rate": 1.0502098473582809e-05, + "loss": 2.1164, + "step": 5778 + }, + { + "epoch": 2.4929911580763426, + "grad_norm": 0.18943005800247192, + "learning_rate": 1.0484688247189502e-05, + "loss": 2.104, + "step": 5779 + }, + { + "epoch": 2.4934224714254904, + "grad_norm": 0.18927139043807983, + "learning_rate": 1.0467291379397027e-05, + "loss": 2.1364, + "step": 5780 + }, + { + "epoch": 2.4938537847746387, + "grad_norm": 0.19848081469535828, + "learning_rate": 1.0449907873807598e-05, + "loss": 2.0313, + "step": 5781 + }, + { + "epoch": 2.494285098123787, + "grad_norm": 0.17562636733055115, + "learning_rate": 1.043253773402065e-05, + "loss": 1.9745, + "step": 5782 + }, + { + "epoch": 2.494716411472935, + "grad_norm": 0.21291068196296692, + "learning_rate": 1.0415180963632855e-05, + "loss": 2.1383, + "step": 5783 + }, + { + "epoch": 2.4951477248220835, + "grad_norm": 0.2445984035730362, + "learning_rate": 1.039783756623814e-05, + "loss": 2.106, + "step": 5784 + }, + { + "epoch": 2.4955790381712313, + "grad_norm": 0.18510930240154266, + "learning_rate": 1.0380507545427624e-05, + "loss": 2.2387, + "step": 5785 + }, + { + "epoch": 2.4960103515203795, + "grad_norm": 0.19964301586151123, + "learning_rate": 1.0363190904789695e-05, + "loss": 2.045, + "step": 5786 + }, + { + "epoch": 2.496441664869528, + "grad_norm": 0.19906377792358398, + "learning_rate": 1.0345887647909943e-05, + "loss": 2.2741, + "step": 5787 + }, + { + "epoch": 2.496872978218676, + "grad_norm": 0.20326967537403107, + "learning_rate": 1.0328597778371201e-05, + "loss": 2.1149, + "step": 5788 + }, + { + "epoch": 2.497304291567824, + "grad_norm": 0.20548272132873535, + "learning_rate": 1.0311321299753513e-05, + "loss": 1.996, + "step": 5789 + }, + { + "epoch": 2.497735604916972, + "grad_norm": 0.23548679053783417, + "learning_rate": 1.0294058215634172e-05, + "loss": 1.7426, + "step": 5790 + }, + { + "epoch": 2.4981669182661204, + "grad_norm": 0.19880113005638123, + "learning_rate": 1.0276808529587696e-05, + "loss": 2.1825, + "step": 5791 + }, + { + "epoch": 2.4985982316152686, + "grad_norm": 0.18782949447631836, + "learning_rate": 1.0259572245185832e-05, + "loss": 2.3359, + "step": 5792 + }, + { + "epoch": 2.499029544964417, + "grad_norm": 0.20282316207885742, + "learning_rate": 1.0242349365997515e-05, + "loss": 2.2454, + "step": 5793 + }, + { + "epoch": 2.4994608583135647, + "grad_norm": 0.19014997780323029, + "learning_rate": 1.022513989558891e-05, + "loss": 1.987, + "step": 5794 + }, + { + "epoch": 2.499892171662713, + "grad_norm": 0.21561653912067413, + "learning_rate": 1.0207943837523483e-05, + "loss": 2.307, + "step": 5795 + }, + { + "epoch": 2.500323485011861, + "grad_norm": 0.21344530582427979, + "learning_rate": 1.0190761195361835e-05, + "loss": 2.0094, + "step": 5796 + }, + { + "epoch": 2.500754798361009, + "grad_norm": 0.2258821278810501, + "learning_rate": 1.017359197266182e-05, + "loss": 2.1, + "step": 5797 + }, + { + "epoch": 2.5011861117101573, + "grad_norm": 0.18700018525123596, + "learning_rate": 1.0156436172978522e-05, + "loss": 2.1812, + "step": 5798 + }, + { + "epoch": 2.5016174250593055, + "grad_norm": 0.18775956332683563, + "learning_rate": 1.0139293799864229e-05, + "loss": 1.9226, + "step": 5799 + }, + { + "epoch": 2.5020487384084538, + "grad_norm": 0.2064802497625351, + "learning_rate": 1.0122164856868461e-05, + "loss": 2.0509, + "step": 5800 + }, + { + "epoch": 2.5020487384084538, + "eval_loss": 2.087212562561035, + "eval_runtime": 202.1354, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 5800 + }, + { + "epoch": 2.502480051757602, + "grad_norm": 0.1930222511291504, + "learning_rate": 1.010504934753795e-05, + "loss": 2.1301, + "step": 5801 + }, + { + "epoch": 2.5029113651067503, + "grad_norm": 0.19863606989383698, + "learning_rate": 1.008794727541665e-05, + "loss": 2.1475, + "step": 5802 + }, + { + "epoch": 2.503342678455898, + "grad_norm": 0.20644447207450867, + "learning_rate": 1.0070858644045737e-05, + "loss": 2.1981, + "step": 5803 + }, + { + "epoch": 2.5037739918050463, + "grad_norm": 0.20835015177726746, + "learning_rate": 1.0053783456963574e-05, + "loss": 2.2185, + "step": 5804 + }, + { + "epoch": 2.5042053051541946, + "grad_norm": 0.19597108662128448, + "learning_rate": 1.0036721717705823e-05, + "loss": 2.0874, + "step": 5805 + }, + { + "epoch": 2.5046366185033424, + "grad_norm": 0.1970120519399643, + "learning_rate": 1.001967342980525e-05, + "loss": 2.2048, + "step": 5806 + }, + { + "epoch": 2.5050679318524907, + "grad_norm": 0.22588270902633667, + "learning_rate": 1.0002638596791898e-05, + "loss": 2.1817, + "step": 5807 + }, + { + "epoch": 2.505499245201639, + "grad_norm": 0.21643772721290588, + "learning_rate": 9.985617222193013e-06, + "loss": 2.2974, + "step": 5808 + }, + { + "epoch": 2.505930558550787, + "grad_norm": 0.20602551102638245, + "learning_rate": 9.96860930953309e-06, + "loss": 2.1928, + "step": 5809 + }, + { + "epoch": 2.5063618718999354, + "grad_norm": 0.20020756125450134, + "learning_rate": 9.951614862333789e-06, + "loss": 2.0934, + "step": 5810 + }, + { + "epoch": 2.5067931852490837, + "grad_norm": 0.1873340904712677, + "learning_rate": 9.934633884113954e-06, + "loss": 2.037, + "step": 5811 + }, + { + "epoch": 2.5072244985982315, + "grad_norm": 0.19135379791259766, + "learning_rate": 9.917666378389738e-06, + "loss": 2.1021, + "step": 5812 + }, + { + "epoch": 2.5076558119473797, + "grad_norm": 0.18421195447444916, + "learning_rate": 9.900712348674422e-06, + "loss": 1.9513, + "step": 5813 + }, + { + "epoch": 2.508087125296528, + "grad_norm": 0.20990701019763947, + "learning_rate": 9.883771798478533e-06, + "loss": 2.3694, + "step": 5814 + }, + { + "epoch": 2.508518438645676, + "grad_norm": 0.18025539815425873, + "learning_rate": 9.866844731309787e-06, + "loss": 2.0232, + "step": 5815 + }, + { + "epoch": 2.508949751994824, + "grad_norm": 0.20541740953922272, + "learning_rate": 9.849931150673133e-06, + "loss": 2.1529, + "step": 5816 + }, + { + "epoch": 2.5093810653439723, + "grad_norm": 0.19457145035266876, + "learning_rate": 9.833031060070693e-06, + "loss": 2.011, + "step": 5817 + }, + { + "epoch": 2.5098123786931206, + "grad_norm": 0.20473068952560425, + "learning_rate": 9.81614446300181e-06, + "loss": 2.179, + "step": 5818 + }, + { + "epoch": 2.510243692042269, + "grad_norm": 0.2168092429637909, + "learning_rate": 9.799271362963096e-06, + "loss": 2.2023, + "step": 5819 + }, + { + "epoch": 2.510675005391417, + "grad_norm": 0.19788049161434174, + "learning_rate": 9.782411763448247e-06, + "loss": 2.1279, + "step": 5820 + }, + { + "epoch": 2.511106318740565, + "grad_norm": 0.1992284208536148, + "learning_rate": 9.765565667948247e-06, + "loss": 2.1279, + "step": 5821 + }, + { + "epoch": 2.511537632089713, + "grad_norm": 0.18312112987041473, + "learning_rate": 9.748733079951254e-06, + "loss": 1.8849, + "step": 5822 + }, + { + "epoch": 2.5119689454388614, + "grad_norm": 0.17599177360534668, + "learning_rate": 9.731914002942664e-06, + "loss": 2.0223, + "step": 5823 + }, + { + "epoch": 2.5124002587880097, + "grad_norm": 0.20038534700870514, + "learning_rate": 9.715108440405067e-06, + "loss": 2.1895, + "step": 5824 + }, + { + "epoch": 2.5128315721371575, + "grad_norm": 0.19765359163284302, + "learning_rate": 9.69831639581817e-06, + "loss": 2.1974, + "step": 5825 + }, + { + "epoch": 2.5128315721371575, + "eval_loss": 2.087245225906372, + "eval_runtime": 202.6707, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 5825 + }, + { + "epoch": 2.5132628854863057, + "grad_norm": 4.632514476776123, + "learning_rate": 9.681537872659009e-06, + "loss": 2.2227, + "step": 5826 + }, + { + "epoch": 2.513694198835454, + "grad_norm": 0.20792263746261597, + "learning_rate": 9.664772874401738e-06, + "loss": 2.1268, + "step": 5827 + }, + { + "epoch": 2.5141255121846022, + "grad_norm": 0.18867172300815582, + "learning_rate": 9.648021404517747e-06, + "loss": 1.9849, + "step": 5828 + }, + { + "epoch": 2.5145568255337505, + "grad_norm": 0.19542069733142853, + "learning_rate": 9.631283466475595e-06, + "loss": 2.1466, + "step": 5829 + }, + { + "epoch": 2.5149881388828983, + "grad_norm": 0.20394869148731232, + "learning_rate": 9.61455906374106e-06, + "loss": 2.0995, + "step": 5830 + }, + { + "epoch": 2.5154194522320465, + "grad_norm": 0.1910642683506012, + "learning_rate": 9.597848199777111e-06, + "loss": 2.2057, + "step": 5831 + }, + { + "epoch": 2.515850765581195, + "grad_norm": 0.19179588556289673, + "learning_rate": 9.581150878043925e-06, + "loss": 2.0844, + "step": 5832 + }, + { + "epoch": 2.516282078930343, + "grad_norm": 0.20327067375183105, + "learning_rate": 9.56446710199886e-06, + "loss": 2.2818, + "step": 5833 + }, + { + "epoch": 2.516713392279491, + "grad_norm": 0.18520274758338928, + "learning_rate": 9.54779687509647e-06, + "loss": 1.811, + "step": 5834 + }, + { + "epoch": 2.517144705628639, + "grad_norm": 0.18724271655082703, + "learning_rate": 9.531140200788508e-06, + "loss": 1.8034, + "step": 5835 + }, + { + "epoch": 2.5175760189777874, + "grad_norm": 0.19156400859355927, + "learning_rate": 9.514497082523903e-06, + "loss": 2.2527, + "step": 5836 + }, + { + "epoch": 2.5180073323269356, + "grad_norm": 0.20256634056568146, + "learning_rate": 9.497867523748837e-06, + "loss": 2.0685, + "step": 5837 + }, + { + "epoch": 2.518438645676084, + "grad_norm": 0.1926616132259369, + "learning_rate": 9.48125152790663e-06, + "loss": 2.0477, + "step": 5838 + }, + { + "epoch": 2.5188699590252317, + "grad_norm": 0.19700340926647186, + "learning_rate": 9.464649098437781e-06, + "loss": 2.3059, + "step": 5839 + }, + { + "epoch": 2.51930127237438, + "grad_norm": 0.209873229265213, + "learning_rate": 9.448060238780008e-06, + "loss": 2.318, + "step": 5840 + }, + { + "epoch": 2.519732585723528, + "grad_norm": 0.1966349333524704, + "learning_rate": 9.431484952368237e-06, + "loss": 2.0184, + "step": 5841 + }, + { + "epoch": 2.5201638990726765, + "grad_norm": 0.23189082741737366, + "learning_rate": 9.414923242634556e-06, + "loss": 2.1466, + "step": 5842 + }, + { + "epoch": 2.5205952124218243, + "grad_norm": 0.18889522552490234, + "learning_rate": 9.398375113008239e-06, + "loss": 1.9915, + "step": 5843 + }, + { + "epoch": 2.5210265257709725, + "grad_norm": 0.2166232466697693, + "learning_rate": 9.38184056691576e-06, + "loss": 2.1035, + "step": 5844 + }, + { + "epoch": 2.521457839120121, + "grad_norm": 0.2198508232831955, + "learning_rate": 9.365319607780792e-06, + "loss": 2.3479, + "step": 5845 + }, + { + "epoch": 2.521889152469269, + "grad_norm": 0.2350691556930542, + "learning_rate": 9.348812239024153e-06, + "loss": 2.1984, + "step": 5846 + }, + { + "epoch": 2.5223204658184173, + "grad_norm": 0.19254909455776215, + "learning_rate": 9.332318464063895e-06, + "loss": 2.0115, + "step": 5847 + }, + { + "epoch": 2.522751779167565, + "grad_norm": 0.18990060687065125, + "learning_rate": 9.315838286315236e-06, + "loss": 2.1486, + "step": 5848 + }, + { + "epoch": 2.5231830925167134, + "grad_norm": 0.19294965267181396, + "learning_rate": 9.299371709190561e-06, + "loss": 2.0948, + "step": 5849 + }, + { + "epoch": 2.5236144058658616, + "grad_norm": 0.23233257234096527, + "learning_rate": 9.282918736099446e-06, + "loss": 2.141, + "step": 5850 + }, + { + "epoch": 2.5236144058658616, + "eval_loss": 2.0872998237609863, + "eval_runtime": 202.8781, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 5850 + }, + { + "epoch": 2.52404571921501, + "grad_norm": 0.19171443581581116, + "learning_rate": 9.266479370448712e-06, + "loss": 2.1621, + "step": 5851 + }, + { + "epoch": 2.5244770325641577, + "grad_norm": 17.882905960083008, + "learning_rate": 9.250053615642263e-06, + "loss": 1.9856, + "step": 5852 + }, + { + "epoch": 2.524908345913306, + "grad_norm": 0.20517498254776, + "learning_rate": 9.233641475081239e-06, + "loss": 2.2057, + "step": 5853 + }, + { + "epoch": 2.525339659262454, + "grad_norm": 0.2134210765361786, + "learning_rate": 9.217242952163934e-06, + "loss": 2.1409, + "step": 5854 + }, + { + "epoch": 2.5257709726116024, + "grad_norm": 0.19968686997890472, + "learning_rate": 9.200858050285883e-06, + "loss": 2.1377, + "step": 5855 + }, + { + "epoch": 2.5262022859607507, + "grad_norm": 0.1992197185754776, + "learning_rate": 9.184486772839758e-06, + "loss": 2.04, + "step": 5856 + }, + { + "epoch": 2.5266335993098985, + "grad_norm": 0.2095516175031662, + "learning_rate": 9.168129123215363e-06, + "loss": 2.1054, + "step": 5857 + }, + { + "epoch": 2.5270649126590468, + "grad_norm": 0.18523921072483063, + "learning_rate": 9.15178510479977e-06, + "loss": 2.0172, + "step": 5858 + }, + { + "epoch": 2.527496226008195, + "grad_norm": 0.20448660850524902, + "learning_rate": 9.13545472097718e-06, + "loss": 2.0324, + "step": 5859 + }, + { + "epoch": 2.5279275393573433, + "grad_norm": 0.19547265768051147, + "learning_rate": 9.119137975128967e-06, + "loss": 2.2357, + "step": 5860 + }, + { + "epoch": 2.528358852706491, + "grad_norm": 0.20918558537960052, + "learning_rate": 9.102834870633703e-06, + "loss": 2.2514, + "step": 5861 + }, + { + "epoch": 2.5287901660556393, + "grad_norm": 0.2057720124721527, + "learning_rate": 9.086545410867107e-06, + "loss": 2.1962, + "step": 5862 + }, + { + "epoch": 2.5292214794047876, + "grad_norm": 0.19434405863285065, + "learning_rate": 9.070269599202112e-06, + "loss": 2.141, + "step": 5863 + }, + { + "epoch": 2.529652792753936, + "grad_norm": 0.20333506166934967, + "learning_rate": 9.05400743900877e-06, + "loss": 2.2222, + "step": 5864 + }, + { + "epoch": 2.530084106103084, + "grad_norm": 0.19405965507030487, + "learning_rate": 9.037758933654396e-06, + "loss": 2.1248, + "step": 5865 + }, + { + "epoch": 2.530515419452232, + "grad_norm": 0.2062096744775772, + "learning_rate": 9.021524086503374e-06, + "loss": 2.1761, + "step": 5866 + }, + { + "epoch": 2.53094673280138, + "grad_norm": 0.1919679492712021, + "learning_rate": 9.005302900917313e-06, + "loss": 2.0342, + "step": 5867 + }, + { + "epoch": 2.5313780461505284, + "grad_norm": 0.20065893232822418, + "learning_rate": 8.98909538025497e-06, + "loss": 2.2201, + "step": 5868 + }, + { + "epoch": 2.5318093594996767, + "grad_norm": 0.2016979157924652, + "learning_rate": 8.972901527872337e-06, + "loss": 2.0871, + "step": 5869 + }, + { + "epoch": 2.5322406728488245, + "grad_norm": 0.2010689228773117, + "learning_rate": 8.956721347122519e-06, + "loss": 2.2679, + "step": 5870 + }, + { + "epoch": 2.5326719861979727, + "grad_norm": 0.18522782623767853, + "learning_rate": 8.94055484135575e-06, + "loss": 2.1558, + "step": 5871 + }, + { + "epoch": 2.533103299547121, + "grad_norm": 0.21023201942443848, + "learning_rate": 8.92440201391952e-06, + "loss": 2.0731, + "step": 5872 + }, + { + "epoch": 2.5335346128962692, + "grad_norm": 0.21523456275463104, + "learning_rate": 8.908262868158452e-06, + "loss": 1.936, + "step": 5873 + }, + { + "epoch": 2.5339659262454175, + "grad_norm": 0.1987065076828003, + "learning_rate": 8.892137407414329e-06, + "loss": 2.1175, + "step": 5874 + }, + { + "epoch": 2.5343972395945653, + "grad_norm": 0.19248445332050323, + "learning_rate": 8.876025635026096e-06, + "loss": 2.0221, + "step": 5875 + }, + { + "epoch": 2.5343972395945653, + "eval_loss": 2.087226390838623, + "eval_runtime": 200.8487, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 5875 + }, + { + "epoch": 2.5348285529437136, + "grad_norm": 0.20776793360710144, + "learning_rate": 8.859927554329877e-06, + "loss": 1.9483, + "step": 5876 + }, + { + "epoch": 2.535259866292862, + "grad_norm": 0.19198954105377197, + "learning_rate": 8.843843168658945e-06, + "loss": 2.2548, + "step": 5877 + }, + { + "epoch": 2.53569117964201, + "grad_norm": 0.1943345069885254, + "learning_rate": 8.827772481343764e-06, + "loss": 2.2774, + "step": 5878 + }, + { + "epoch": 2.536122492991158, + "grad_norm": 0.21345116198062897, + "learning_rate": 8.811715495711938e-06, + "loss": 2.2552, + "step": 5879 + }, + { + "epoch": 2.536553806340306, + "grad_norm": 0.1998181790113449, + "learning_rate": 8.795672215088231e-06, + "loss": 2.098, + "step": 5880 + }, + { + "epoch": 2.5369851196894544, + "grad_norm": 0.1961486041545868, + "learning_rate": 8.779642642794604e-06, + "loss": 2.0678, + "step": 5881 + }, + { + "epoch": 2.5374164330386026, + "grad_norm": 0.2047472447156906, + "learning_rate": 8.763626782150118e-06, + "loss": 2.1216, + "step": 5882 + }, + { + "epoch": 2.537847746387751, + "grad_norm": 0.19389478862285614, + "learning_rate": 8.747624636471071e-06, + "loss": 2.2829, + "step": 5883 + }, + { + "epoch": 2.5382790597368987, + "grad_norm": 0.1919451355934143, + "learning_rate": 8.731636209070886e-06, + "loss": 2.0559, + "step": 5884 + }, + { + "epoch": 2.538710373086047, + "grad_norm": 0.1973874270915985, + "learning_rate": 8.715661503260108e-06, + "loss": 2.1969, + "step": 5885 + }, + { + "epoch": 2.5391416864351952, + "grad_norm": 0.19648849964141846, + "learning_rate": 8.699700522346464e-06, + "loss": 2.1255, + "step": 5886 + }, + { + "epoch": 2.5395729997843435, + "grad_norm": 0.22413647174835205, + "learning_rate": 8.683753269634905e-06, + "loss": 2.1444, + "step": 5887 + }, + { + "epoch": 2.5400043131334913, + "grad_norm": 0.212081179022789, + "learning_rate": 8.667819748427437e-06, + "loss": 2.0756, + "step": 5888 + }, + { + "epoch": 2.5404356264826395, + "grad_norm": 0.19365866482257843, + "learning_rate": 8.651899962023297e-06, + "loss": 2.2299, + "step": 5889 + }, + { + "epoch": 2.540866939831788, + "grad_norm": 3.2413103580474854, + "learning_rate": 8.635993913718842e-06, + "loss": 1.843, + "step": 5890 + }, + { + "epoch": 2.541298253180936, + "grad_norm": 0.19114452600479126, + "learning_rate": 8.620101606807583e-06, + "loss": 2.1101, + "step": 5891 + }, + { + "epoch": 2.5417295665300843, + "grad_norm": 0.20034754276275635, + "learning_rate": 8.604223044580216e-06, + "loss": 2.2439, + "step": 5892 + }, + { + "epoch": 2.542160879879232, + "grad_norm": 0.21234995126724243, + "learning_rate": 8.588358230324553e-06, + "loss": 2.153, + "step": 5893 + }, + { + "epoch": 2.5425921932283804, + "grad_norm": 0.18674694001674652, + "learning_rate": 8.572507167325579e-06, + "loss": 2.0226, + "step": 5894 + }, + { + "epoch": 2.5430235065775286, + "grad_norm": 0.20961590111255646, + "learning_rate": 8.556669858865448e-06, + "loss": 2.3047, + "step": 5895 + }, + { + "epoch": 2.543454819926677, + "grad_norm": 0.2046174705028534, + "learning_rate": 8.54084630822341e-06, + "loss": 2.214, + "step": 5896 + }, + { + "epoch": 2.5438861332758247, + "grad_norm": 0.21066071093082428, + "learning_rate": 8.525036518675959e-06, + "loss": 2.3135, + "step": 5897 + }, + { + "epoch": 2.544317446624973, + "grad_norm": 0.22002047300338745, + "learning_rate": 8.50924049349664e-06, + "loss": 2.0497, + "step": 5898 + }, + { + "epoch": 2.544748759974121, + "grad_norm": 0.19885514676570892, + "learning_rate": 8.493458235956212e-06, + "loss": 2.1107, + "step": 5899 + }, + { + "epoch": 2.5451800733232695, + "grad_norm": 0.17786090075969696, + "learning_rate": 8.477689749322537e-06, + "loss": 2.3082, + "step": 5900 + }, + { + "epoch": 2.5451800733232695, + "eval_loss": 2.0872507095336914, + "eval_runtime": 202.5459, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.158, + "step": 5900 + }, + { + "epoch": 2.5456113866724177, + "grad_norm": 0.20288458466529846, + "learning_rate": 8.461935036860695e-06, + "loss": 2.2552, + "step": 5901 + }, + { + "epoch": 2.5460427000215655, + "grad_norm": 0.19387395679950714, + "learning_rate": 8.446194101832865e-06, + "loss": 2.0872, + "step": 5902 + }, + { + "epoch": 2.5464740133707138, + "grad_norm": 0.20319326221942902, + "learning_rate": 8.430466947498338e-06, + "loss": 2.4172, + "step": 5903 + }, + { + "epoch": 2.546905326719862, + "grad_norm": 0.6388072967529297, + "learning_rate": 8.414753577113634e-06, + "loss": 2.148, + "step": 5904 + }, + { + "epoch": 2.5473366400690103, + "grad_norm": 0.20338557660579681, + "learning_rate": 8.399053993932367e-06, + "loss": 2.1059, + "step": 5905 + }, + { + "epoch": 2.547767953418158, + "grad_norm": 0.19849160313606262, + "learning_rate": 8.383368201205312e-06, + "loss": 2.194, + "step": 5906 + }, + { + "epoch": 2.5481992667673063, + "grad_norm": 0.22318509221076965, + "learning_rate": 8.367696202180371e-06, + "loss": 2.0911, + "step": 5907 + }, + { + "epoch": 2.5486305801164546, + "grad_norm": 0.19940035045146942, + "learning_rate": 8.352038000102614e-06, + "loss": 1.9267, + "step": 5908 + }, + { + "epoch": 2.549061893465603, + "grad_norm": 0.211517333984375, + "learning_rate": 8.336393598214233e-06, + "loss": 2.2814, + "step": 5909 + }, + { + "epoch": 2.549493206814751, + "grad_norm": 0.186599001288414, + "learning_rate": 8.320762999754568e-06, + "loss": 1.8915, + "step": 5910 + }, + { + "epoch": 2.549924520163899, + "grad_norm": 0.2223532497882843, + "learning_rate": 8.305146207960148e-06, + "loss": 2.2837, + "step": 5911 + }, + { + "epoch": 2.550355833513047, + "grad_norm": 0.20304465293884277, + "learning_rate": 8.289543226064552e-06, + "loss": 2.0781, + "step": 5912 + }, + { + "epoch": 2.5507871468621954, + "grad_norm": 0.20744968950748444, + "learning_rate": 8.273954057298562e-06, + "loss": 1.9754, + "step": 5913 + }, + { + "epoch": 2.5512184602113437, + "grad_norm": 0.20294539630413055, + "learning_rate": 8.258378704890073e-06, + "loss": 2.0599, + "step": 5914 + }, + { + "epoch": 2.5516497735604915, + "grad_norm": 0.1947573870420456, + "learning_rate": 8.242817172064162e-06, + "loss": 2.1859, + "step": 5915 + }, + { + "epoch": 2.5520810869096398, + "grad_norm": 0.19360491633415222, + "learning_rate": 8.22726946204301e-06, + "loss": 2.2329, + "step": 5916 + }, + { + "epoch": 2.552512400258788, + "grad_norm": 0.21240995824337006, + "learning_rate": 8.211735578045909e-06, + "loss": 2.154, + "step": 5917 + }, + { + "epoch": 2.5529437136079363, + "grad_norm": 0.20330595970153809, + "learning_rate": 8.19621552328935e-06, + "loss": 1.878, + "step": 5918 + }, + { + "epoch": 2.5533750269570845, + "grad_norm": 0.20137812197208405, + "learning_rate": 8.180709300986926e-06, + "loss": 2.1407, + "step": 5919 + }, + { + "epoch": 2.5538063403062323, + "grad_norm": 0.19222082197666168, + "learning_rate": 8.165216914349357e-06, + "loss": 2.2948, + "step": 5920 + }, + { + "epoch": 2.5542376536553806, + "grad_norm": 0.22950433194637299, + "learning_rate": 8.149738366584526e-06, + "loss": 2.049, + "step": 5921 + }, + { + "epoch": 2.554668967004529, + "grad_norm": 0.22426775097846985, + "learning_rate": 8.134273660897422e-06, + "loss": 2.1904, + "step": 5922 + }, + { + "epoch": 2.555100280353677, + "grad_norm": 0.2124083787202835, + "learning_rate": 8.118822800490199e-06, + "loss": 2.2236, + "step": 5923 + }, + { + "epoch": 2.555531593702825, + "grad_norm": 0.21372617781162262, + "learning_rate": 8.103385788562105e-06, + "loss": 2.3067, + "step": 5924 + }, + { + "epoch": 2.555962907051973, + "grad_norm": 0.20108765363693237, + "learning_rate": 8.087962628309552e-06, + "loss": 2.2048, + "step": 5925 + }, + { + "epoch": 2.555962907051973, + "eval_loss": 2.087303638458252, + "eval_runtime": 204.4825, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 5925 + }, + { + "epoch": 2.5563942204011214, + "grad_norm": 0.20634593069553375, + "learning_rate": 8.072553322926076e-06, + "loss": 2.1873, + "step": 5926 + }, + { + "epoch": 2.5568255337502697, + "grad_norm": 0.17352814972400665, + "learning_rate": 8.05715787560235e-06, + "loss": 2.1897, + "step": 5927 + }, + { + "epoch": 2.557256847099418, + "grad_norm": 0.1895429790019989, + "learning_rate": 8.04177628952613e-06, + "loss": 2.0911, + "step": 5928 + }, + { + "epoch": 2.5576881604485657, + "grad_norm": 0.21185369789600372, + "learning_rate": 8.026408567882415e-06, + "loss": 2.1121, + "step": 5929 + }, + { + "epoch": 2.558119473797714, + "grad_norm": 0.214601069688797, + "learning_rate": 8.011054713853196e-06, + "loss": 2.0906, + "step": 5930 + }, + { + "epoch": 2.5585507871468622, + "grad_norm": 0.2063516229391098, + "learning_rate": 7.995714730617674e-06, + "loss": 2.3176, + "step": 5931 + }, + { + "epoch": 2.5589821004960105, + "grad_norm": 0.20094339549541473, + "learning_rate": 7.980388621352151e-06, + "loss": 2.2627, + "step": 5932 + }, + { + "epoch": 2.5594134138451583, + "grad_norm": 0.190909743309021, + "learning_rate": 7.965076389230096e-06, + "loss": 2.267, + "step": 5933 + }, + { + "epoch": 2.5598447271943066, + "grad_norm": 0.19475963711738586, + "learning_rate": 7.949778037422073e-06, + "loss": 1.9725, + "step": 5934 + }, + { + "epoch": 2.560276040543455, + "grad_norm": 0.2023172229528427, + "learning_rate": 7.93449356909572e-06, + "loss": 2.0805, + "step": 5935 + }, + { + "epoch": 2.560707353892603, + "grad_norm": 0.22337819635868073, + "learning_rate": 7.919222987415911e-06, + "loss": 2.2469, + "step": 5936 + }, + { + "epoch": 2.5611386672417513, + "grad_norm": 0.1987045407295227, + "learning_rate": 7.903966295544562e-06, + "loss": 2.2534, + "step": 5937 + }, + { + "epoch": 2.561569980590899, + "grad_norm": 0.19819551706314087, + "learning_rate": 7.888723496640739e-06, + "loss": 1.9927, + "step": 5938 + }, + { + "epoch": 2.5620012939400474, + "grad_norm": 0.19501759111881256, + "learning_rate": 7.873494593860636e-06, + "loss": 2.1534, + "step": 5939 + }, + { + "epoch": 2.5624326072891956, + "grad_norm": 0.19448330998420715, + "learning_rate": 7.858279590357553e-06, + "loss": 2.2013, + "step": 5940 + }, + { + "epoch": 2.562863920638344, + "grad_norm": 0.19504505395889282, + "learning_rate": 7.843078489281937e-06, + "loss": 2.1991, + "step": 5941 + }, + { + "epoch": 2.5632952339874917, + "grad_norm": 0.21552002429962158, + "learning_rate": 7.827891293781322e-06, + "loss": 2.1397, + "step": 5942 + }, + { + "epoch": 2.56372654733664, + "grad_norm": 0.19737036526203156, + "learning_rate": 7.812718007000424e-06, + "loss": 2.0697, + "step": 5943 + }, + { + "epoch": 2.564157860685788, + "grad_norm": 0.19582600891590118, + "learning_rate": 7.797558632080996e-06, + "loss": 2.1335, + "step": 5944 + }, + { + "epoch": 2.5645891740349365, + "grad_norm": 0.19773516058921814, + "learning_rate": 7.782413172161963e-06, + "loss": 2.0992, + "step": 5945 + }, + { + "epoch": 2.5650204873840847, + "grad_norm": 0.20045964419841766, + "learning_rate": 7.767281630379349e-06, + "loss": 2.2479, + "step": 5946 + }, + { + "epoch": 2.5654518007332325, + "grad_norm": 0.19492270052433014, + "learning_rate": 7.752164009866338e-06, + "loss": 2.1007, + "step": 5947 + }, + { + "epoch": 2.565883114082381, + "grad_norm": 0.19753959774971008, + "learning_rate": 7.737060313753185e-06, + "loss": 2.1354, + "step": 5948 + }, + { + "epoch": 2.566314427431529, + "grad_norm": 0.19580677151679993, + "learning_rate": 7.721970545167247e-06, + "loss": 2.1234, + "step": 5949 + }, + { + "epoch": 2.5667457407806773, + "grad_norm": 0.23202303051948547, + "learning_rate": 7.706894707233057e-06, + "loss": 2.0789, + "step": 5950 + }, + { + "epoch": 2.5667457407806773, + "eval_loss": 2.087388277053833, + "eval_runtime": 203.6764, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 5950 + }, + { + "epoch": 2.567177054129825, + "grad_norm": 0.1929044872522354, + "learning_rate": 7.691832803072221e-06, + "loss": 2.1219, + "step": 5951 + }, + { + "epoch": 2.5676083674789734, + "grad_norm": 0.18711425364017487, + "learning_rate": 7.676784835803481e-06, + "loss": 2.2567, + "step": 5952 + }, + { + "epoch": 2.5680396808281216, + "grad_norm": 0.19768792390823364, + "learning_rate": 7.661750808542666e-06, + "loss": 2.0054, + "step": 5953 + }, + { + "epoch": 2.56847099417727, + "grad_norm": 0.18793371319770813, + "learning_rate": 7.646730724402749e-06, + "loss": 2.1189, + "step": 5954 + }, + { + "epoch": 2.568902307526418, + "grad_norm": 0.1859693080186844, + "learning_rate": 7.631724586493804e-06, + "loss": 2.0841, + "step": 5955 + }, + { + "epoch": 2.569333620875566, + "grad_norm": 0.19793583452701569, + "learning_rate": 7.616732397922992e-06, + "loss": 2.1784, + "step": 5956 + }, + { + "epoch": 2.569764934224714, + "grad_norm": 0.1949821561574936, + "learning_rate": 7.601754161794657e-06, + "loss": 1.89, + "step": 5957 + }, + { + "epoch": 2.5701962475738624, + "grad_norm": 0.1898735910654068, + "learning_rate": 7.586789881210159e-06, + "loss": 2.122, + "step": 5958 + }, + { + "epoch": 2.5706275609230107, + "grad_norm": 0.20294371247291565, + "learning_rate": 7.571839559268036e-06, + "loss": 2.1028, + "step": 5959 + }, + { + "epoch": 2.5710588742721585, + "grad_norm": 0.1875556856393814, + "learning_rate": 7.556903199063904e-06, + "loss": 2.0433, + "step": 5960 + }, + { + "epoch": 2.5714901876213068, + "grad_norm": 0.21623234450817108, + "learning_rate": 7.541980803690522e-06, + "loss": 2.1431, + "step": 5961 + }, + { + "epoch": 2.571921500970455, + "grad_norm": 0.20514778792858124, + "learning_rate": 7.527072376237736e-06, + "loss": 2.0767, + "step": 5962 + }, + { + "epoch": 2.5723528143196033, + "grad_norm": 0.1848430186510086, + "learning_rate": 7.5121779197924585e-06, + "loss": 2.1939, + "step": 5963 + }, + { + "epoch": 2.5727841276687515, + "grad_norm": 0.18685248494148254, + "learning_rate": 7.49729743743879e-06, + "loss": 2.0598, + "step": 5964 + }, + { + "epoch": 2.5732154410178993, + "grad_norm": 0.19710910320281982, + "learning_rate": 7.482430932257882e-06, + "loss": 1.9915, + "step": 5965 + }, + { + "epoch": 2.5736467543670476, + "grad_norm": 0.19875414669513702, + "learning_rate": 7.467578407328015e-06, + "loss": 1.9632, + "step": 5966 + }, + { + "epoch": 2.574078067716196, + "grad_norm": 0.2088191658258438, + "learning_rate": 7.452739865724553e-06, + "loss": 2.0285, + "step": 5967 + }, + { + "epoch": 2.574509381065344, + "grad_norm": 0.19341960549354553, + "learning_rate": 7.4379153105199955e-06, + "loss": 1.9608, + "step": 5968 + }, + { + "epoch": 2.574940694414492, + "grad_norm": 0.2098260521888733, + "learning_rate": 7.423104744783914e-06, + "loss": 2.1184, + "step": 5969 + }, + { + "epoch": 2.57537200776364, + "grad_norm": 0.20200322568416595, + "learning_rate": 7.408308171583005e-06, + "loss": 2.1119, + "step": 5970 + }, + { + "epoch": 2.5758033211127884, + "grad_norm": 0.2155592143535614, + "learning_rate": 7.3935255939810586e-06, + "loss": 2.1972, + "step": 5971 + }, + { + "epoch": 2.5762346344619367, + "grad_norm": 0.2060846984386444, + "learning_rate": 7.3787570150389695e-06, + "loss": 2.0123, + "step": 5972 + }, + { + "epoch": 2.576665947811085, + "grad_norm": 0.21392014622688293, + "learning_rate": 7.364002437814731e-06, + "loss": 2.2761, + "step": 5973 + }, + { + "epoch": 2.5770972611602327, + "grad_norm": 0.2115173488855362, + "learning_rate": 7.349261865363434e-06, + "loss": 2.2229, + "step": 5974 + }, + { + "epoch": 2.577528574509381, + "grad_norm": 0.19911466538906097, + "learning_rate": 7.334535300737313e-06, + "loss": 2.1165, + "step": 5975 + }, + { + "epoch": 2.577528574509381, + "eval_loss": 2.087284564971924, + "eval_runtime": 204.9153, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 5975 + }, + { + "epoch": 2.5779598878585293, + "grad_norm": 0.17648504674434662, + "learning_rate": 7.319822746985618e-06, + "loss": 1.9316, + "step": 5976 + }, + { + "epoch": 2.5783912012076775, + "grad_norm": 0.21495161950588226, + "learning_rate": 7.3051242071547554e-06, + "loss": 2.0624, + "step": 5977 + }, + { + "epoch": 2.5788225145568253, + "grad_norm": 0.18364490568637848, + "learning_rate": 7.290439684288216e-06, + "loss": 1.6425, + "step": 5978 + }, + { + "epoch": 2.5792538279059736, + "grad_norm": 0.20838278532028198, + "learning_rate": 7.275769181426616e-06, + "loss": 2.07, + "step": 5979 + }, + { + "epoch": 2.579685141255122, + "grad_norm": 0.19302137196063995, + "learning_rate": 7.261112701607635e-06, + "loss": 1.8957, + "step": 5980 + }, + { + "epoch": 2.58011645460427, + "grad_norm": 0.1876315027475357, + "learning_rate": 7.2464702478660295e-06, + "loss": 1.8388, + "step": 5981 + }, + { + "epoch": 2.5805477679534183, + "grad_norm": 0.2408015877008438, + "learning_rate": 7.2318418232337054e-06, + "loss": 1.9432, + "step": 5982 + }, + { + "epoch": 2.5809790813025666, + "grad_norm": 0.20141193270683289, + "learning_rate": 7.217227430739633e-06, + "loss": 2.0929, + "step": 5983 + }, + { + "epoch": 2.5814103946517144, + "grad_norm": 0.204561248421669, + "learning_rate": 7.202627073409883e-06, + "loss": 2.039, + "step": 5984 + }, + { + "epoch": 2.5818417080008627, + "grad_norm": 0.20142017304897308, + "learning_rate": 7.188040754267613e-06, + "loss": 2.4515, + "step": 5985 + }, + { + "epoch": 2.582273021350011, + "grad_norm": 0.1931556612253189, + "learning_rate": 7.1734684763330885e-06, + "loss": 2.1068, + "step": 5986 + }, + { + "epoch": 2.5827043346991587, + "grad_norm": 0.20126812160015106, + "learning_rate": 7.158910242623656e-06, + "loss": 2.0849, + "step": 5987 + }, + { + "epoch": 2.583135648048307, + "grad_norm": 0.21254923939704895, + "learning_rate": 7.144366056153736e-06, + "loss": 2.0209, + "step": 5988 + }, + { + "epoch": 2.5835669613974552, + "grad_norm": 0.21428455412387848, + "learning_rate": 7.129835919934912e-06, + "loss": 2.0088, + "step": 5989 + }, + { + "epoch": 2.5839982747466035, + "grad_norm": 0.19039230048656464, + "learning_rate": 7.115319836975761e-06, + "loss": 2.1215, + "step": 5990 + }, + { + "epoch": 2.5844295880957517, + "grad_norm": 0.20700082182884216, + "learning_rate": 7.100817810282011e-06, + "loss": 1.9611, + "step": 5991 + }, + { + "epoch": 2.5848609014449, + "grad_norm": 0.2018066942691803, + "learning_rate": 7.086329842856453e-06, + "loss": 2.1965, + "step": 5992 + }, + { + "epoch": 2.585292214794048, + "grad_norm": 0.20085200667381287, + "learning_rate": 7.071855937699003e-06, + "loss": 2.2475, + "step": 5993 + }, + { + "epoch": 2.585723528143196, + "grad_norm": 0.22047682106494904, + "learning_rate": 7.057396097806647e-06, + "loss": 2.1593, + "step": 5994 + }, + { + "epoch": 2.5861548414923443, + "grad_norm": 0.19113773107528687, + "learning_rate": 7.042950326173416e-06, + "loss": 2.0633, + "step": 5995 + }, + { + "epoch": 2.586586154841492, + "grad_norm": 0.19312886893749237, + "learning_rate": 7.028518625790492e-06, + "loss": 2.1159, + "step": 5996 + }, + { + "epoch": 2.5870174681906404, + "grad_norm": 0.20536403357982635, + "learning_rate": 7.014100999646108e-06, + "loss": 2.1573, + "step": 5997 + }, + { + "epoch": 2.5874487815397886, + "grad_norm": 0.1941017359495163, + "learning_rate": 6.999697450725602e-06, + "loss": 2.1259, + "step": 5998 + }, + { + "epoch": 2.587880094888937, + "grad_norm": 0.19312000274658203, + "learning_rate": 6.985307982011373e-06, + "loss": 1.9693, + "step": 5999 + }, + { + "epoch": 2.588311408238085, + "grad_norm": 0.2647634744644165, + "learning_rate": 6.97093259648292e-06, + "loss": 2.0193, + "step": 6000 + }, + { + "epoch": 2.588311408238085, + "eval_loss": 2.0873632431030273, + "eval_runtime": 211.403, + "eval_samples_per_second": 0.151, + "eval_steps_per_second": 0.151, + "step": 6000 + }, + { + "epoch": 2.5887427215872334, + "grad_norm": 0.18285487592220306, + "learning_rate": 6.956571297116828e-06, + "loss": 2.0634, + "step": 6001 + }, + { + "epoch": 2.589174034936381, + "grad_norm": 0.1905910074710846, + "learning_rate": 6.942224086886744e-06, + "loss": 2.114, + "step": 6002 + }, + { + "epoch": 2.5896053482855295, + "grad_norm": 0.19421325623989105, + "learning_rate": 6.927890968763458e-06, + "loss": 2.1351, + "step": 6003 + }, + { + "epoch": 2.5900366616346777, + "grad_norm": 0.18619950115680695, + "learning_rate": 6.913571945714763e-06, + "loss": 1.9864, + "step": 6004 + }, + { + "epoch": 2.5904679749838255, + "grad_norm": 0.21644152700901031, + "learning_rate": 6.89926702070557e-06, + "loss": 2.1049, + "step": 6005 + }, + { + "epoch": 2.590899288332974, + "grad_norm": 0.20959539711475372, + "learning_rate": 6.884976196697869e-06, + "loss": 2.0024, + "step": 6006 + }, + { + "epoch": 2.591330601682122, + "grad_norm": 0.20113292336463928, + "learning_rate": 6.870699476650751e-06, + "loss": 2.1691, + "step": 6007 + }, + { + "epoch": 2.5917619150312703, + "grad_norm": 0.18709880113601685, + "learning_rate": 6.856436863520368e-06, + "loss": 1.9913, + "step": 6008 + }, + { + "epoch": 2.5921932283804185, + "grad_norm": 0.20955413579940796, + "learning_rate": 6.842188360259915e-06, + "loss": 2.1659, + "step": 6009 + }, + { + "epoch": 2.592624541729567, + "grad_norm": 0.188461035490036, + "learning_rate": 6.827953969819733e-06, + "loss": 2.1667, + "step": 6010 + }, + { + "epoch": 2.5930558550787146, + "grad_norm": 0.18821631371974945, + "learning_rate": 6.813733695147197e-06, + "loss": 2.0207, + "step": 6011 + }, + { + "epoch": 2.593487168427863, + "grad_norm": 0.20827440917491913, + "learning_rate": 6.7995275391867775e-06, + "loss": 2.1734, + "step": 6012 + }, + { + "epoch": 2.593918481777011, + "grad_norm": 0.24497582018375397, + "learning_rate": 6.785335504880005e-06, + "loss": 1.9533, + "step": 6013 + }, + { + "epoch": 2.594349795126159, + "grad_norm": 0.19103097915649414, + "learning_rate": 6.771157595165494e-06, + "loss": 2.1005, + "step": 6014 + }, + { + "epoch": 2.594781108475307, + "grad_norm": 0.19511078298091888, + "learning_rate": 6.756993812978947e-06, + "loss": 2.2424, + "step": 6015 + }, + { + "epoch": 2.5952124218244554, + "grad_norm": 0.1991245299577713, + "learning_rate": 6.742844161253128e-06, + "loss": 2.1054, + "step": 6016 + }, + { + "epoch": 2.5956437351736037, + "grad_norm": 0.18926672637462616, + "learning_rate": 6.7287086429178655e-06, + "loss": 2.1717, + "step": 6017 + }, + { + "epoch": 2.596075048522752, + "grad_norm": 0.20850855112075806, + "learning_rate": 6.714587260900087e-06, + "loss": 2.1144, + "step": 6018 + }, + { + "epoch": 2.5965063618719, + "grad_norm": 0.18856045603752136, + "learning_rate": 6.700480018123769e-06, + "loss": 1.949, + "step": 6019 + }, + { + "epoch": 2.596937675221048, + "grad_norm": 0.22177739441394806, + "learning_rate": 6.686386917509959e-06, + "loss": 2.0571, + "step": 6020 + }, + { + "epoch": 2.5973689885701963, + "grad_norm": 0.22452549636363983, + "learning_rate": 6.672307961976831e-06, + "loss": 2.1356, + "step": 6021 + }, + { + "epoch": 2.5978003019193445, + "grad_norm": 0.24075448513031006, + "learning_rate": 6.658243154439544e-06, + "loss": 2.0627, + "step": 6022 + }, + { + "epoch": 2.5982316152684923, + "grad_norm": 0.2034800946712494, + "learning_rate": 6.644192497810394e-06, + "loss": 2.2097, + "step": 6023 + }, + { + "epoch": 2.5986629286176406, + "grad_norm": 0.1853140890598297, + "learning_rate": 6.630155994998687e-06, + "loss": 1.7731, + "step": 6024 + }, + { + "epoch": 2.599094241966789, + "grad_norm": 0.2146259993314743, + "learning_rate": 6.616133648910882e-06, + "loss": 2.0087, + "step": 6025 + }, + { + "epoch": 2.599094241966789, + "eval_loss": 2.0873498916625977, + "eval_runtime": 204.7135, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 6025 + }, + { + "epoch": 2.599525555315937, + "grad_norm": 0.2032996118068695, + "learning_rate": 6.602125462450447e-06, + "loss": 2.3627, + "step": 6026 + }, + { + "epoch": 2.5999568686650854, + "grad_norm": 0.19560416042804718, + "learning_rate": 6.588131438517904e-06, + "loss": 2.2585, + "step": 6027 + }, + { + "epoch": 2.6003881820142336, + "grad_norm": 0.21011295914649963, + "learning_rate": 6.574151580010894e-06, + "loss": 1.9346, + "step": 6028 + }, + { + "epoch": 2.6008194953633814, + "grad_norm": 0.1834384649991989, + "learning_rate": 6.560185889824085e-06, + "loss": 2.059, + "step": 6029 + }, + { + "epoch": 2.6012508087125297, + "grad_norm": 0.2010362297296524, + "learning_rate": 6.546234370849246e-06, + "loss": 2.3003, + "step": 6030 + }, + { + "epoch": 2.601682122061678, + "grad_norm": 0.1939825564622879, + "learning_rate": 6.532297025975167e-06, + "loss": 1.987, + "step": 6031 + }, + { + "epoch": 2.6021134354108257, + "grad_norm": 0.19909532368183136, + "learning_rate": 6.518373858087747e-06, + "loss": 2.2438, + "step": 6032 + }, + { + "epoch": 2.602544748759974, + "grad_norm": 0.20319907367229462, + "learning_rate": 6.504464870069914e-06, + "loss": 2.1522, + "step": 6033 + }, + { + "epoch": 2.6029760621091222, + "grad_norm": 0.20442801713943481, + "learning_rate": 6.490570064801673e-06, + "loss": 2.0656, + "step": 6034 + }, + { + "epoch": 2.6034073754582705, + "grad_norm": 0.19795440137386322, + "learning_rate": 6.476689445160138e-06, + "loss": 2.0074, + "step": 6035 + }, + { + "epoch": 2.6038386888074188, + "grad_norm": 0.19856740534305573, + "learning_rate": 6.4628230140193945e-06, + "loss": 2.1488, + "step": 6036 + }, + { + "epoch": 2.604270002156567, + "grad_norm": 0.2064681053161621, + "learning_rate": 6.448970774250661e-06, + "loss": 2.1495, + "step": 6037 + }, + { + "epoch": 2.604701315505715, + "grad_norm": 0.22349761426448822, + "learning_rate": 6.43513272872217e-06, + "loss": 2.004, + "step": 6038 + }, + { + "epoch": 2.605132628854863, + "grad_norm": 0.21363207697868347, + "learning_rate": 6.421308880299278e-06, + "loss": 2.256, + "step": 6039 + }, + { + "epoch": 2.6055639422040113, + "grad_norm": 0.19845212996006012, + "learning_rate": 6.407499231844354e-06, + "loss": 2.0137, + "step": 6040 + }, + { + "epoch": 2.605995255553159, + "grad_norm": 0.1993948072195053, + "learning_rate": 6.393703786216811e-06, + "loss": 2.1983, + "step": 6041 + }, + { + "epoch": 2.6064265689023074, + "grad_norm": 0.19329705834388733, + "learning_rate": 6.379922546273172e-06, + "loss": 2.2284, + "step": 6042 + }, + { + "epoch": 2.6068578822514556, + "grad_norm": 0.2110152691602707, + "learning_rate": 6.3661555148669885e-06, + "loss": 2.1247, + "step": 6043 + }, + { + "epoch": 2.607289195600604, + "grad_norm": 0.20873045921325684, + "learning_rate": 6.352402694848871e-06, + "loss": 2.236, + "step": 6044 + }, + { + "epoch": 2.607720508949752, + "grad_norm": 0.20056168735027313, + "learning_rate": 6.338664089066492e-06, + "loss": 2.3545, + "step": 6045 + }, + { + "epoch": 2.6081518222989004, + "grad_norm": 0.19856448471546173, + "learning_rate": 6.324939700364584e-06, + "loss": 2.1218, + "step": 6046 + }, + { + "epoch": 2.6085831356480482, + "grad_norm": 0.1984233558177948, + "learning_rate": 6.311229531584924e-06, + "loss": 2.1644, + "step": 6047 + }, + { + "epoch": 2.6090144489971965, + "grad_norm": 0.2085246741771698, + "learning_rate": 6.297533585566358e-06, + "loss": 1.9952, + "step": 6048 + }, + { + "epoch": 2.6094457623463447, + "grad_norm": 0.19457945227622986, + "learning_rate": 6.2838518651447845e-06, + "loss": 2.0665, + "step": 6049 + }, + { + "epoch": 2.6098770756954925, + "grad_norm": 0.20151954889297485, + "learning_rate": 6.270184373153153e-06, + "loss": 2.1981, + "step": 6050 + }, + { + "epoch": 2.6098770756954925, + "eval_loss": 2.0872039794921875, + "eval_runtime": 207.3196, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 6050 + }, + { + "epoch": 2.610308389044641, + "grad_norm": 0.2253008335828781, + "learning_rate": 6.256531112421459e-06, + "loss": 2.1381, + "step": 6051 + }, + { + "epoch": 2.610739702393789, + "grad_norm": 0.2223806083202362, + "learning_rate": 6.242892085776749e-06, + "loss": 2.1014, + "step": 6052 + }, + { + "epoch": 2.6111710157429373, + "grad_norm": 0.1943701058626175, + "learning_rate": 6.229267296043172e-06, + "loss": 1.8895, + "step": 6053 + }, + { + "epoch": 2.6116023290920856, + "grad_norm": 0.20236046612262726, + "learning_rate": 6.215656746041855e-06, + "loss": 2.3317, + "step": 6054 + }, + { + "epoch": 2.612033642441234, + "grad_norm": 0.2128019779920578, + "learning_rate": 6.2020604385910115e-06, + "loss": 2.4217, + "step": 6055 + }, + { + "epoch": 2.6124649557903816, + "grad_norm": 0.1914263665676117, + "learning_rate": 6.188478376505929e-06, + "loss": 2.1676, + "step": 6056 + }, + { + "epoch": 2.61289626913953, + "grad_norm": 0.2062346190214157, + "learning_rate": 6.174910562598909e-06, + "loss": 2.1603, + "step": 6057 + }, + { + "epoch": 2.613327582488678, + "grad_norm": 0.19567060470581055, + "learning_rate": 6.161356999679321e-06, + "loss": 1.9346, + "step": 6058 + }, + { + "epoch": 2.613758895837826, + "grad_norm": 0.19446727633476257, + "learning_rate": 6.147817690553552e-06, + "loss": 2.1298, + "step": 6059 + }, + { + "epoch": 2.614190209186974, + "grad_norm": 0.2038094699382782, + "learning_rate": 6.1342926380250955e-06, + "loss": 2.2005, + "step": 6060 + }, + { + "epoch": 2.6146215225361225, + "grad_norm": 0.21893729269504547, + "learning_rate": 6.12078184489446e-06, + "loss": 1.9707, + "step": 6061 + }, + { + "epoch": 2.6150528358852707, + "grad_norm": 0.19223694503307343, + "learning_rate": 6.107285313959181e-06, + "loss": 2.085, + "step": 6062 + }, + { + "epoch": 2.615484149234419, + "grad_norm": 0.20330168306827545, + "learning_rate": 6.093803048013885e-06, + "loss": 2.2486, + "step": 6063 + }, + { + "epoch": 2.615915462583567, + "grad_norm": 0.1966131627559662, + "learning_rate": 6.08033504985021e-06, + "loss": 2.2113, + "step": 6064 + }, + { + "epoch": 2.616346775932715, + "grad_norm": 0.20934173464775085, + "learning_rate": 6.066881322256853e-06, + "loss": 2.0675, + "step": 6065 + }, + { + "epoch": 2.6167780892818633, + "grad_norm": 0.19852526485919952, + "learning_rate": 6.053441868019551e-06, + "loss": 2.1583, + "step": 6066 + }, + { + "epoch": 2.6172094026310115, + "grad_norm": 0.2178497314453125, + "learning_rate": 6.040016689921121e-06, + "loss": 2.054, + "step": 6067 + }, + { + "epoch": 2.6176407159801593, + "grad_norm": 0.2043694406747818, + "learning_rate": 6.026605790741354e-06, + "loss": 2.245, + "step": 6068 + }, + { + "epoch": 2.6180720293293076, + "grad_norm": 0.23877695202827454, + "learning_rate": 6.0132091732571385e-06, + "loss": 1.8492, + "step": 6069 + }, + { + "epoch": 2.618503342678456, + "grad_norm": 0.1892527937889099, + "learning_rate": 5.999826840242372e-06, + "loss": 2.038, + "step": 6070 + }, + { + "epoch": 2.618934656027604, + "grad_norm": 0.2118130773305893, + "learning_rate": 5.986458794468055e-06, + "loss": 2.2015, + "step": 6071 + }, + { + "epoch": 2.6193659693767524, + "grad_norm": 0.20288275182247162, + "learning_rate": 5.973105038702172e-06, + "loss": 2.3318, + "step": 6072 + }, + { + "epoch": 2.6197972827259006, + "grad_norm": 0.21088698506355286, + "learning_rate": 5.959765575709729e-06, + "loss": 2.1951, + "step": 6073 + }, + { + "epoch": 2.6202285960750484, + "grad_norm": 0.201625794172287, + "learning_rate": 5.946440408252851e-06, + "loss": 2.1579, + "step": 6074 + }, + { + "epoch": 2.6206599094241967, + "grad_norm": 0.19988271594047546, + "learning_rate": 5.9331295390906364e-06, + "loss": 1.9267, + "step": 6075 + }, + { + "epoch": 2.6206599094241967, + "eval_loss": 2.0870893001556396, + "eval_runtime": 206.5147, + "eval_samples_per_second": 0.155, + "eval_steps_per_second": 0.155, + "step": 6075 + }, + { + "epoch": 2.621091222773345, + "grad_norm": 0.2041642665863037, + "learning_rate": 5.919832970979265e-06, + "loss": 2.2081, + "step": 6076 + }, + { + "epoch": 2.6215225361224928, + "grad_norm": 0.18532341718673706, + "learning_rate": 5.90655070667192e-06, + "loss": 2.3406, + "step": 6077 + }, + { + "epoch": 2.621953849471641, + "grad_norm": 0.1991955190896988, + "learning_rate": 5.89328274891884e-06, + "loss": 2.1091, + "step": 6078 + }, + { + "epoch": 2.6223851628207893, + "grad_norm": 0.21103163063526154, + "learning_rate": 5.8800291004673036e-06, + "loss": 2.0537, + "step": 6079 + }, + { + "epoch": 2.6228164761699375, + "grad_norm": 0.18268458545207977, + "learning_rate": 5.866789764061617e-06, + "loss": 1.9784, + "step": 6080 + }, + { + "epoch": 2.6232477895190858, + "grad_norm": 0.4187362492084503, + "learning_rate": 5.853564742443167e-06, + "loss": 2.0681, + "step": 6081 + }, + { + "epoch": 2.623679102868234, + "grad_norm": 0.21114923059940338, + "learning_rate": 5.8403540383502925e-06, + "loss": 2.072, + "step": 6082 + }, + { + "epoch": 2.624110416217382, + "grad_norm": 0.1981240063905716, + "learning_rate": 5.827157654518427e-06, + "loss": 2.131, + "step": 6083 + }, + { + "epoch": 2.62454172956653, + "grad_norm": 0.21558067202568054, + "learning_rate": 5.813975593680029e-06, + "loss": 2.1768, + "step": 6084 + }, + { + "epoch": 2.6249730429156783, + "grad_norm": 0.1967075616121292, + "learning_rate": 5.8008078585645965e-06, + "loss": 2.2239, + "step": 6085 + }, + { + "epoch": 2.625404356264826, + "grad_norm": 0.2138763815164566, + "learning_rate": 5.787654451898658e-06, + "loss": 2.0824, + "step": 6086 + }, + { + "epoch": 2.6258356696139744, + "grad_norm": 0.20435772836208344, + "learning_rate": 5.774515376405733e-06, + "loss": 2.2836, + "step": 6087 + }, + { + "epoch": 2.6262669829631227, + "grad_norm": 0.1953764408826828, + "learning_rate": 5.761390634806448e-06, + "loss": 2.1566, + "step": 6088 + }, + { + "epoch": 2.626698296312271, + "grad_norm": 0.22136853635311127, + "learning_rate": 5.748280229818419e-06, + "loss": 2.1489, + "step": 6089 + }, + { + "epoch": 2.627129609661419, + "grad_norm": 0.22649671137332916, + "learning_rate": 5.735184164156284e-06, + "loss": 2.0366, + "step": 6090 + }, + { + "epoch": 2.6275609230105674, + "grad_norm": 0.21080105006694794, + "learning_rate": 5.7221024405317395e-06, + "loss": 2.2014, + "step": 6091 + }, + { + "epoch": 2.6279922363597152, + "grad_norm": 0.180626779794693, + "learning_rate": 5.709035061653494e-06, + "loss": 2.1699, + "step": 6092 + }, + { + "epoch": 2.6284235497088635, + "grad_norm": 0.191581591963768, + "learning_rate": 5.6959820302272916e-06, + "loss": 2.0196, + "step": 6093 + }, + { + "epoch": 2.6288548630580117, + "grad_norm": 0.20116713643074036, + "learning_rate": 5.682943348955913e-06, + "loss": 2.0793, + "step": 6094 + }, + { + "epoch": 2.6292861764071596, + "grad_norm": 0.19673985242843628, + "learning_rate": 5.669919020539146e-06, + "loss": 1.968, + "step": 6095 + }, + { + "epoch": 2.629717489756308, + "grad_norm": 0.18929420411586761, + "learning_rate": 5.656909047673827e-06, + "loss": 2.1299, + "step": 6096 + }, + { + "epoch": 2.630148803105456, + "grad_norm": 0.20557215809822083, + "learning_rate": 5.64391343305381e-06, + "loss": 2.0828, + "step": 6097 + }, + { + "epoch": 2.6305801164546043, + "grad_norm": 0.19969946146011353, + "learning_rate": 5.6309321793699735e-06, + "loss": 2.2417, + "step": 6098 + }, + { + "epoch": 2.6310114298037526, + "grad_norm": 0.21544784307479858, + "learning_rate": 5.617965289310261e-06, + "loss": 2.1295, + "step": 6099 + }, + { + "epoch": 2.631442743152901, + "grad_norm": 0.20157448947429657, + "learning_rate": 5.605012765559575e-06, + "loss": 2.1707, + "step": 6100 + }, + { + "epoch": 2.631442743152901, + "eval_loss": 2.0871706008911133, + "eval_runtime": 204.4249, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 6100 + }, + { + "epoch": 2.6318740565020486, + "grad_norm": 0.19672569632530212, + "learning_rate": 5.592074610799868e-06, + "loss": 2.2779, + "step": 6101 + }, + { + "epoch": 2.632305369851197, + "grad_norm": 0.21693246066570282, + "learning_rate": 5.579150827710158e-06, + "loss": 2.2466, + "step": 6102 + }, + { + "epoch": 2.632736683200345, + "grad_norm": 0.28595659136772156, + "learning_rate": 5.566241418966452e-06, + "loss": 2.1764, + "step": 6103 + }, + { + "epoch": 2.633167996549493, + "grad_norm": 0.19356441497802734, + "learning_rate": 5.55334638724178e-06, + "loss": 2.4061, + "step": 6104 + }, + { + "epoch": 2.633599309898641, + "grad_norm": 0.19937746226787567, + "learning_rate": 5.540465735206165e-06, + "loss": 2.0587, + "step": 6105 + }, + { + "epoch": 2.6340306232477895, + "grad_norm": 0.17437097430229187, + "learning_rate": 5.527599465526739e-06, + "loss": 1.831, + "step": 6106 + }, + { + "epoch": 2.6344619365969377, + "grad_norm": 0.20675235986709595, + "learning_rate": 5.51474758086757e-06, + "loss": 2.1508, + "step": 6107 + }, + { + "epoch": 2.634893249946086, + "grad_norm": 0.19855298101902008, + "learning_rate": 5.501910083889799e-06, + "loss": 2.1318, + "step": 6108 + }, + { + "epoch": 2.6353245632952342, + "grad_norm": 0.20525705814361572, + "learning_rate": 5.489086977251564e-06, + "loss": 2.1458, + "step": 6109 + }, + { + "epoch": 2.635755876644382, + "grad_norm": 0.20039616525173187, + "learning_rate": 5.476278263608025e-06, + "loss": 2.3108, + "step": 6110 + }, + { + "epoch": 2.6361871899935303, + "grad_norm": 0.19344262778759003, + "learning_rate": 5.463483945611377e-06, + "loss": 2.1225, + "step": 6111 + }, + { + "epoch": 2.6366185033426786, + "grad_norm": 0.19685900211334229, + "learning_rate": 5.450704025910793e-06, + "loss": 2.1575, + "step": 6112 + }, + { + "epoch": 2.6370498166918264, + "grad_norm": 0.20429310202598572, + "learning_rate": 5.437938507152556e-06, + "loss": 2.0551, + "step": 6113 + }, + { + "epoch": 2.6374811300409746, + "grad_norm": 0.195399209856987, + "learning_rate": 5.425187391979852e-06, + "loss": 2.1962, + "step": 6114 + }, + { + "epoch": 2.637912443390123, + "grad_norm": 0.21501316130161285, + "learning_rate": 5.4124506830329345e-06, + "loss": 2.1833, + "step": 6115 + }, + { + "epoch": 2.638343756739271, + "grad_norm": 0.27056029438972473, + "learning_rate": 5.39972838294912e-06, + "loss": 2.1472, + "step": 6116 + }, + { + "epoch": 2.6387750700884194, + "grad_norm": 0.2011452466249466, + "learning_rate": 5.3870204943626845e-06, + "loss": 2.167, + "step": 6117 + }, + { + "epoch": 2.6392063834375676, + "grad_norm": 0.22378626465797424, + "learning_rate": 5.374327019904931e-06, + "loss": 2.1392, + "step": 6118 + }, + { + "epoch": 2.6396376967867154, + "grad_norm": 0.23554293811321259, + "learning_rate": 5.361647962204166e-06, + "loss": 2.0103, + "step": 6119 + }, + { + "epoch": 2.6400690101358637, + "grad_norm": 0.205733984708786, + "learning_rate": 5.348983323885756e-06, + "loss": 2.1259, + "step": 6120 + }, + { + "epoch": 2.640500323485012, + "grad_norm": 0.2434549480676651, + "learning_rate": 5.336333107572028e-06, + "loss": 2.1261, + "step": 6121 + }, + { + "epoch": 2.6409316368341598, + "grad_norm": 0.2025012969970703, + "learning_rate": 5.3236973158823695e-06, + "loss": 2.1764, + "step": 6122 + }, + { + "epoch": 2.641362950183308, + "grad_norm": 0.195558562874794, + "learning_rate": 5.311075951433147e-06, + "loss": 1.8611, + "step": 6123 + }, + { + "epoch": 2.6417942635324563, + "grad_norm": 0.19787761569023132, + "learning_rate": 5.298469016837753e-06, + "loss": 2.2368, + "step": 6124 + }, + { + "epoch": 2.6422255768816045, + "grad_norm": 0.19688139855861664, + "learning_rate": 5.285876514706591e-06, + "loss": 2.3121, + "step": 6125 + }, + { + "epoch": 2.6422255768816045, + "eval_loss": 2.087151527404785, + "eval_runtime": 204.0015, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 6125 + }, + { + "epoch": 2.642656890230753, + "grad_norm": 0.2052869200706482, + "learning_rate": 5.273298447647067e-06, + "loss": 2.0919, + "step": 6126 + }, + { + "epoch": 2.643088203579901, + "grad_norm": 0.21629759669303894, + "learning_rate": 5.26073481826364e-06, + "loss": 2.054, + "step": 6127 + }, + { + "epoch": 2.643519516929049, + "grad_norm": 0.17514556646347046, + "learning_rate": 5.248185629157711e-06, + "loss": 1.8445, + "step": 6128 + }, + { + "epoch": 2.643950830278197, + "grad_norm": 0.21411702036857605, + "learning_rate": 5.235650882927744e-06, + "loss": 1.6976, + "step": 6129 + }, + { + "epoch": 2.6443821436273454, + "grad_norm": 0.194255530834198, + "learning_rate": 5.22313058216918e-06, + "loss": 2.1619, + "step": 6130 + }, + { + "epoch": 2.644813456976493, + "grad_norm": 0.1894819736480713, + "learning_rate": 5.210624729474503e-06, + "loss": 2.2686, + "step": 6131 + }, + { + "epoch": 2.6452447703256414, + "grad_norm": 0.21412064135074615, + "learning_rate": 5.198133327433199e-06, + "loss": 2.2112, + "step": 6132 + }, + { + "epoch": 2.6456760836747897, + "grad_norm": 0.20222002267837524, + "learning_rate": 5.1856563786317e-06, + "loss": 2.128, + "step": 6133 + }, + { + "epoch": 2.646107397023938, + "grad_norm": 0.18645766377449036, + "learning_rate": 5.17319388565354e-06, + "loss": 2.1181, + "step": 6134 + }, + { + "epoch": 2.646538710373086, + "grad_norm": 0.19865809381008148, + "learning_rate": 5.160745851079196e-06, + "loss": 2.1027, + "step": 6135 + }, + { + "epoch": 2.6469700237222344, + "grad_norm": 0.8640784025192261, + "learning_rate": 5.148312277486172e-06, + "loss": 1.9384, + "step": 6136 + }, + { + "epoch": 2.6474013370713823, + "grad_norm": 0.22291713953018188, + "learning_rate": 5.135893167448976e-06, + "loss": 2.3333, + "step": 6137 + }, + { + "epoch": 2.6478326504205305, + "grad_norm": 0.22736655175685883, + "learning_rate": 5.1234885235391264e-06, + "loss": 2.0239, + "step": 6138 + }, + { + "epoch": 2.6482639637696788, + "grad_norm": 0.20177805423736572, + "learning_rate": 5.111098348325126e-06, + "loss": 2.3397, + "step": 6139 + }, + { + "epoch": 2.6486952771188266, + "grad_norm": 0.2055068165063858, + "learning_rate": 5.098722644372516e-06, + "loss": 2.1454, + "step": 6140 + }, + { + "epoch": 2.649126590467975, + "grad_norm": 0.20407767593860626, + "learning_rate": 5.086361414243803e-06, + "loss": 2.2067, + "step": 6141 + }, + { + "epoch": 2.649557903817123, + "grad_norm": 0.20581407845020294, + "learning_rate": 5.074014660498524e-06, + "loss": 2.107, + "step": 6142 + }, + { + "epoch": 2.6499892171662713, + "grad_norm": 0.2167045921087265, + "learning_rate": 5.0616823856932e-06, + "loss": 2.2216, + "step": 6143 + }, + { + "epoch": 2.6504205305154196, + "grad_norm": 0.21492063999176025, + "learning_rate": 5.049364592381364e-06, + "loss": 2.2001, + "step": 6144 + }, + { + "epoch": 2.650851843864568, + "grad_norm": 0.19565856456756592, + "learning_rate": 5.037061283113583e-06, + "loss": 2.0096, + "step": 6145 + }, + { + "epoch": 2.6512831572137157, + "grad_norm": 0.19109144806861877, + "learning_rate": 5.024772460437345e-06, + "loss": 2.0656, + "step": 6146 + }, + { + "epoch": 2.651714470562864, + "grad_norm": 0.2673475742340088, + "learning_rate": 5.012498126897199e-06, + "loss": 2.132, + "step": 6147 + }, + { + "epoch": 2.652145783912012, + "grad_norm": 0.19297978281974792, + "learning_rate": 5.000238285034702e-06, + "loss": 2.0913, + "step": 6148 + }, + { + "epoch": 2.65257709726116, + "grad_norm": 0.21174772083759308, + "learning_rate": 4.987992937388366e-06, + "loss": 2.1064, + "step": 6149 + }, + { + "epoch": 2.6530084106103082, + "grad_norm": 0.19300834834575653, + "learning_rate": 4.9757620864937454e-06, + "loss": 2.1357, + "step": 6150 + }, + { + "epoch": 2.6530084106103082, + "eval_loss": 2.087246894836426, + "eval_runtime": 204.0893, + "eval_samples_per_second": 0.157, + "eval_steps_per_second": 0.157, + "step": 6150 + }, + { + "epoch": 2.6534397239594565, + "grad_norm": 0.18600746989250183, + "learning_rate": 4.963545734883331e-06, + "loss": 2.1089, + "step": 6151 + }, + { + "epoch": 2.6538710373086047, + "grad_norm": 0.1895558387041092, + "learning_rate": 4.951343885086692e-06, + "loss": 2.1419, + "step": 6152 + }, + { + "epoch": 2.654302350657753, + "grad_norm": 0.21031838655471802, + "learning_rate": 4.93915653963034e-06, + "loss": 2.2392, + "step": 6153 + }, + { + "epoch": 2.6547336640069013, + "grad_norm": 0.2003740668296814, + "learning_rate": 4.926983701037807e-06, + "loss": 2.0949, + "step": 6154 + }, + { + "epoch": 2.655164977356049, + "grad_norm": 0.19085167348384857, + "learning_rate": 4.914825371829592e-06, + "loss": 2.3022, + "step": 6155 + }, + { + "epoch": 2.6555962907051973, + "grad_norm": 0.1863519549369812, + "learning_rate": 4.902681554523224e-06, + "loss": 2.0893, + "step": 6156 + }, + { + "epoch": 2.6560276040543456, + "grad_norm": 0.21060968935489655, + "learning_rate": 4.890552251633217e-06, + "loss": 2.0933, + "step": 6157 + }, + { + "epoch": 2.6564589174034934, + "grad_norm": 0.18363919854164124, + "learning_rate": 4.878437465671054e-06, + "loss": 2.1239, + "step": 6158 + }, + { + "epoch": 2.6568902307526416, + "grad_norm": 0.17458274960517883, + "learning_rate": 4.866337199145276e-06, + "loss": 1.9382, + "step": 6159 + }, + { + "epoch": 2.65732154410179, + "grad_norm": 0.1946101188659668, + "learning_rate": 4.8542514545613395e-06, + "loss": 2.0561, + "step": 6160 + }, + { + "epoch": 2.657752857450938, + "grad_norm": 0.2080899327993393, + "learning_rate": 4.842180234421725e-06, + "loss": 2.1089, + "step": 6161 + }, + { + "epoch": 2.6581841708000864, + "grad_norm": 0.18995599448680878, + "learning_rate": 4.830123541225941e-06, + "loss": 2.0242, + "step": 6162 + }, + { + "epoch": 2.6586154841492347, + "grad_norm": 0.20470154285430908, + "learning_rate": 4.818081377470442e-06, + "loss": 2.0086, + "step": 6163 + }, + { + "epoch": 2.6590467974983825, + "grad_norm": 0.20341189205646515, + "learning_rate": 4.8060537456487e-06, + "loss": 2.1157, + "step": 6164 + }, + { + "epoch": 2.6594781108475307, + "grad_norm": 0.18980471789836884, + "learning_rate": 4.794040648251129e-06, + "loss": 2.0815, + "step": 6165 + }, + { + "epoch": 2.659909424196679, + "grad_norm": 0.2044554501771927, + "learning_rate": 4.782042087765226e-06, + "loss": 2.1729, + "step": 6166 + }, + { + "epoch": 2.660340737545827, + "grad_norm": 0.19274021685123444, + "learning_rate": 4.7700580666753915e-06, + "loss": 2.1044, + "step": 6167 + }, + { + "epoch": 2.660772050894975, + "grad_norm": 0.19175758957862854, + "learning_rate": 4.758088587463052e-06, + "loss": 2.0502, + "step": 6168 + }, + { + "epoch": 2.6612033642441233, + "grad_norm": 0.20993196964263916, + "learning_rate": 4.746133652606632e-06, + "loss": 2.2449, + "step": 6169 + }, + { + "epoch": 2.6616346775932715, + "grad_norm": 0.19075722992420197, + "learning_rate": 4.734193264581518e-06, + "loss": 2.0345, + "step": 6170 + }, + { + "epoch": 2.66206599094242, + "grad_norm": 0.20163051784038544, + "learning_rate": 4.722267425860099e-06, + "loss": 2.1201, + "step": 6171 + }, + { + "epoch": 2.662497304291568, + "grad_norm": 0.20374572277069092, + "learning_rate": 4.710356138911759e-06, + "loss": 2.2216, + "step": 6172 + }, + { + "epoch": 2.662928617640716, + "grad_norm": 0.20015719532966614, + "learning_rate": 4.698459406202856e-06, + "loss": 2.1439, + "step": 6173 + }, + { + "epoch": 2.663359930989864, + "grad_norm": 0.19572551548480988, + "learning_rate": 4.686577230196731e-06, + "loss": 2.2754, + "step": 6174 + }, + { + "epoch": 2.6637912443390124, + "grad_norm": 0.19161763787269592, + "learning_rate": 4.674709613353731e-06, + "loss": 1.8648, + "step": 6175 + }, + { + "epoch": 2.6637912443390124, + "eval_loss": 2.0872437953948975, + "eval_runtime": 204.8946, + "eval_samples_per_second": 0.156, + "eval_steps_per_second": 0.156, + "step": 6175 + }, + { + "epoch": 2.66422255768816, + "grad_norm": 0.21940241754055023, + "learning_rate": 4.662856558131156e-06, + "loss": 2.047, + "step": 6176 + }, + { + "epoch": 2.6646538710373084, + "grad_norm": 0.20560306310653687, + "learning_rate": 4.651018066983353e-06, + "loss": 2.0876, + "step": 6177 + }, + { + "epoch": 2.6650851843864567, + "grad_norm": 0.2093990594148636, + "learning_rate": 4.63919414236156e-06, + "loss": 2.385, + "step": 6178 + }, + { + "epoch": 2.665516497735605, + "grad_norm": 0.2323451191186905, + "learning_rate": 4.6273847867140696e-06, + "loss": 2.0265, + "step": 6179 + }, + { + "epoch": 2.665947811084753, + "grad_norm": 0.18504031002521515, + "learning_rate": 4.61559000248615e-06, + "loss": 2.0652, + "step": 6180 + }, + { + "epoch": 2.6663791244339015, + "grad_norm": 0.1730000376701355, + "learning_rate": 4.60380979212003e-06, + "loss": 1.9666, + "step": 6181 + }, + { + "epoch": 2.6668104377830493, + "grad_norm": 0.1883518546819687, + "learning_rate": 4.592044158054953e-06, + "loss": 2.0666, + "step": 6182 + }, + { + "epoch": 2.6672417511321975, + "grad_norm": 0.21099349856376648, + "learning_rate": 4.5802931027270674e-06, + "loss": 2.1527, + "step": 6183 + }, + { + "epoch": 2.667673064481346, + "grad_norm": 0.21935242414474487, + "learning_rate": 4.568556628569603e-06, + "loss": 2.1865, + "step": 6184 + }, + { + "epoch": 2.6681043778304936, + "grad_norm": 0.17997373640537262, + "learning_rate": 4.556834738012724e-06, + "loss": 2.0517, + "step": 6185 + }, + { + "epoch": 2.668535691179642, + "grad_norm": 0.19894824922084808, + "learning_rate": 4.545127433483556e-06, + "loss": 2.0752, + "step": 6186 + }, + { + "epoch": 2.66896700452879, + "grad_norm": 0.24281533062458038, + "learning_rate": 4.533434717406234e-06, + "loss": 1.9051, + "step": 6187 + }, + { + "epoch": 2.6693983178779384, + "grad_norm": 0.19154304265975952, + "learning_rate": 4.521756592201872e-06, + "loss": 2.1348, + "step": 6188 + }, + { + "epoch": 2.6698296312270866, + "grad_norm": 0.20902879536151886, + "learning_rate": 4.510093060288536e-06, + "loss": 2.1539, + "step": 6189 + }, + { + "epoch": 2.670260944576235, + "grad_norm": 0.19530873000621796, + "learning_rate": 4.4984441240812935e-06, + "loss": 2.0056, + "step": 6190 + }, + { + "epoch": 2.6706922579253827, + "grad_norm": 0.19316688179969788, + "learning_rate": 4.486809785992207e-06, + "loss": 2.1989, + "step": 6191 + }, + { + "epoch": 2.671123571274531, + "grad_norm": 0.19766834378242493, + "learning_rate": 4.4751900484302574e-06, + "loss": 2.2485, + "step": 6192 + }, + { + "epoch": 2.671554884623679, + "grad_norm": 0.23230670392513275, + "learning_rate": 4.463584913801446e-06, + "loss": 1.7903, + "step": 6193 + }, + { + "epoch": 2.671986197972827, + "grad_norm": 0.1969883143901825, + "learning_rate": 4.451994384508758e-06, + "loss": 2.2623, + "step": 6194 + }, + { + "epoch": 2.6724175113219752, + "grad_norm": 0.1922171413898468, + "learning_rate": 4.440418462952141e-06, + "loss": 2.2303, + "step": 6195 + }, + { + "epoch": 2.6728488246711235, + "grad_norm": 0.20559678971767426, + "learning_rate": 4.42885715152852e-06, + "loss": 2.2323, + "step": 6196 + }, + { + "epoch": 2.6732801380202718, + "grad_norm": 0.18253296613693237, + "learning_rate": 4.4173104526317445e-06, + "loss": 2.0705, + "step": 6197 + }, + { + "epoch": 2.67371145136942, + "grad_norm": 0.20601263642311096, + "learning_rate": 4.405778368652738e-06, + "loss": 1.9288, + "step": 6198 + }, + { + "epoch": 2.6741427647185683, + "grad_norm": 0.2276991456747055, + "learning_rate": 4.394260901979329e-06, + "loss": 2.2312, + "step": 6199 + }, + { + "epoch": 2.674574078067716, + "grad_norm": 0.18565404415130615, + "learning_rate": 4.38275805499633e-06, + "loss": 1.9813, + "step": 6200 + }, + { + "epoch": 2.674574078067716, + "eval_loss": 2.0872318744659424, + "eval_runtime": 201.4551, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 6200 + }, + { + "epoch": 2.6750053914168643, + "grad_norm": 0.18837133049964905, + "learning_rate": 4.371269830085525e-06, + "loss": 1.9287, + "step": 6201 + }, + { + "epoch": 2.6754367047660126, + "grad_norm": 0.20320770144462585, + "learning_rate": 4.359796229625695e-06, + "loss": 1.988, + "step": 6202 + }, + { + "epoch": 2.6758680181151604, + "grad_norm": 0.19712461531162262, + "learning_rate": 4.348337255992556e-06, + "loss": 2.0474, + "step": 6203 + }, + { + "epoch": 2.6762993314643087, + "grad_norm": 0.21028605103492737, + "learning_rate": 4.3368929115588115e-06, + "loss": 2.1066, + "step": 6204 + }, + { + "epoch": 2.676730644813457, + "grad_norm": 0.18788939714431763, + "learning_rate": 4.325463198694173e-06, + "loss": 2.1886, + "step": 6205 + }, + { + "epoch": 2.677161958162605, + "grad_norm": 0.20289573073387146, + "learning_rate": 4.314048119765248e-06, + "loss": 2.0151, + "step": 6206 + }, + { + "epoch": 2.6775932715117534, + "grad_norm": 0.2069668173789978, + "learning_rate": 4.302647677135662e-06, + "loss": 2.3451, + "step": 6207 + }, + { + "epoch": 2.6780245848609017, + "grad_norm": 0.19542808830738068, + "learning_rate": 4.2912618731660184e-06, + "loss": 2.0026, + "step": 6208 + }, + { + "epoch": 2.6784558982100495, + "grad_norm": 0.20399220287799835, + "learning_rate": 4.279890710213857e-06, + "loss": 2.2483, + "step": 6209 + }, + { + "epoch": 2.6788872115591977, + "grad_norm": 0.20325656235218048, + "learning_rate": 4.268534190633727e-06, + "loss": 2.1639, + "step": 6210 + }, + { + "epoch": 2.679318524908346, + "grad_norm": 0.18592952191829681, + "learning_rate": 4.257192316777064e-06, + "loss": 1.9605, + "step": 6211 + }, + { + "epoch": 2.679749838257494, + "grad_norm": 0.19579529762268066, + "learning_rate": 4.245865090992382e-06, + "loss": 2.1996, + "step": 6212 + }, + { + "epoch": 2.680181151606642, + "grad_norm": 0.2062699943780899, + "learning_rate": 4.2345525156250865e-06, + "loss": 2.1259, + "step": 6213 + }, + { + "epoch": 2.6806124649557903, + "grad_norm": 0.19597366452217102, + "learning_rate": 4.223254593017564e-06, + "loss": 2.066, + "step": 6214 + }, + { + "epoch": 2.6810437783049386, + "grad_norm": 0.22171823680400848, + "learning_rate": 4.211971325509183e-06, + "loss": 1.9581, + "step": 6215 + }, + { + "epoch": 2.681475091654087, + "grad_norm": 0.18330083787441254, + "learning_rate": 4.200702715436269e-06, + "loss": 1.9926, + "step": 6216 + }, + { + "epoch": 2.681906405003235, + "grad_norm": 0.20268268883228302, + "learning_rate": 4.189448765132103e-06, + "loss": 2.1025, + "step": 6217 + }, + { + "epoch": 2.682337718352383, + "grad_norm": 0.18285948038101196, + "learning_rate": 4.178209476926939e-06, + "loss": 1.8616, + "step": 6218 + }, + { + "epoch": 2.682769031701531, + "grad_norm": 0.2060435712337494, + "learning_rate": 4.166984853147998e-06, + "loss": 2.2504, + "step": 6219 + }, + { + "epoch": 2.6832003450506794, + "grad_norm": 0.19653423130512238, + "learning_rate": 4.155774896119463e-06, + "loss": 2.0712, + "step": 6220 + }, + { + "epoch": 2.683631658399827, + "grad_norm": 0.19370831549167633, + "learning_rate": 4.144579608162468e-06, + "loss": 2.1085, + "step": 6221 + }, + { + "epoch": 2.6840629717489755, + "grad_norm": 0.18302783370018005, + "learning_rate": 4.133398991595119e-06, + "loss": 1.9831, + "step": 6222 + }, + { + "epoch": 2.6844942850981237, + "grad_norm": 0.19806934893131256, + "learning_rate": 4.122233048732512e-06, + "loss": 2.2637, + "step": 6223 + }, + { + "epoch": 2.684925598447272, + "grad_norm": 0.19383008778095245, + "learning_rate": 4.111081781886657e-06, + "loss": 2.1842, + "step": 6224 + }, + { + "epoch": 2.68535691179642, + "grad_norm": 0.20208725333213806, + "learning_rate": 4.099945193366524e-06, + "loss": 2.0627, + "step": 6225 + }, + { + "epoch": 2.68535691179642, + "eval_loss": 2.087400436401367, + "eval_runtime": 197.412, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 6225 + }, + { + "epoch": 2.6857882251455685, + "grad_norm": 0.20198409259319305, + "learning_rate": 4.08882328547811e-06, + "loss": 2.0314, + "step": 6226 + }, + { + "epoch": 2.6862195384947163, + "grad_norm": 0.19249609112739563, + "learning_rate": 4.077716060524297e-06, + "loss": 2.004, + "step": 6227 + }, + { + "epoch": 2.6866508518438645, + "grad_norm": 0.20324400067329407, + "learning_rate": 4.066623520804987e-06, + "loss": 2.1872, + "step": 6228 + }, + { + "epoch": 2.687082165193013, + "grad_norm": 0.20244289934635162, + "learning_rate": 4.055545668616961e-06, + "loss": 2.1976, + "step": 6229 + }, + { + "epoch": 2.6875134785421606, + "grad_norm": 0.20366866886615753, + "learning_rate": 4.044482506254057e-06, + "loss": 2.1723, + "step": 6230 + }, + { + "epoch": 2.687944791891309, + "grad_norm": 0.21891331672668457, + "learning_rate": 4.033434036007002e-06, + "loss": 2.2852, + "step": 6231 + }, + { + "epoch": 2.688376105240457, + "grad_norm": 0.19543354213237762, + "learning_rate": 4.022400260163508e-06, + "loss": 2.0835, + "step": 6232 + }, + { + "epoch": 2.6888074185896054, + "grad_norm": 0.19975729286670685, + "learning_rate": 4.011381181008247e-06, + "loss": 2.1318, + "step": 6233 + }, + { + "epoch": 2.6892387319387536, + "grad_norm": 0.20842617750167847, + "learning_rate": 4.000376800822818e-06, + "loss": 2.1383, + "step": 6234 + }, + { + "epoch": 2.689670045287902, + "grad_norm": 0.19123625755310059, + "learning_rate": 3.989387121885817e-06, + "loss": 2.0994, + "step": 6235 + }, + { + "epoch": 2.6901013586370497, + "grad_norm": 0.213456928730011, + "learning_rate": 3.978412146472757e-06, + "loss": 2.2063, + "step": 6236 + }, + { + "epoch": 2.690532671986198, + "grad_norm": 0.19223642349243164, + "learning_rate": 3.967451876856167e-06, + "loss": 1.9427, + "step": 6237 + }, + { + "epoch": 2.690963985335346, + "grad_norm": 0.285829097032547, + "learning_rate": 3.956506315305452e-06, + "loss": 2.2721, + "step": 6238 + }, + { + "epoch": 2.691395298684494, + "grad_norm": 0.20903241634368896, + "learning_rate": 3.945575464087014e-06, + "loss": 2.014, + "step": 6239 + }, + { + "epoch": 2.6918266120336423, + "grad_norm": 0.19532136619091034, + "learning_rate": 3.934659325464226e-06, + "loss": 2.1684, + "step": 6240 + }, + { + "epoch": 2.6922579253827905, + "grad_norm": 0.20843470096588135, + "learning_rate": 3.92375790169738e-06, + "loss": 2.1268, + "step": 6241 + }, + { + "epoch": 2.6926892387319388, + "grad_norm": 0.18874512612819672, + "learning_rate": 3.912871195043746e-06, + "loss": 2.2548, + "step": 6242 + }, + { + "epoch": 2.693120552081087, + "grad_norm": 0.19821836054325104, + "learning_rate": 3.901999207757503e-06, + "loss": 2.1225, + "step": 6243 + }, + { + "epoch": 2.6935518654302353, + "grad_norm": 0.19665993750095367, + "learning_rate": 3.891141942089851e-06, + "loss": 2.1058, + "step": 6244 + }, + { + "epoch": 2.693983178779383, + "grad_norm": 0.1842608004808426, + "learning_rate": 3.8802994002888995e-06, + "loss": 1.9469, + "step": 6245 + }, + { + "epoch": 2.6944144921285313, + "grad_norm": 0.2139851152896881, + "learning_rate": 3.869471584599704e-06, + "loss": 2.3312, + "step": 6246 + }, + { + "epoch": 2.6948458054776796, + "grad_norm": 0.19002576172351837, + "learning_rate": 3.858658497264286e-06, + "loss": 2.0497, + "step": 6247 + }, + { + "epoch": 2.695277118826828, + "grad_norm": 0.2080351859331131, + "learning_rate": 3.847860140521622e-06, + "loss": 2.1032, + "step": 6248 + }, + { + "epoch": 2.6957084321759757, + "grad_norm": 0.2042865753173828, + "learning_rate": 3.837076516607615e-06, + "loss": 2.0746, + "step": 6249 + }, + { + "epoch": 2.696139745525124, + "grad_norm": 0.19151441752910614, + "learning_rate": 3.826307627755129e-06, + "loss": 1.9515, + "step": 6250 + }, + { + "epoch": 2.696139745525124, + "eval_loss": 2.087350368499756, + "eval_runtime": 196.982, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 6250 + }, + { + "epoch": 2.696571058874272, + "grad_norm": 0.18477578461170197, + "learning_rate": 3.8155534761940135e-06, + "loss": 2.129, + "step": 6251 + }, + { + "epoch": 2.6970023722234204, + "grad_norm": 0.1978139728307724, + "learning_rate": 3.8048140641510045e-06, + "loss": 2.3592, + "step": 6252 + }, + { + "epoch": 2.6974336855725687, + "grad_norm": 0.21185347437858582, + "learning_rate": 3.794089393849806e-06, + "loss": 2.2006, + "step": 6253 + }, + { + "epoch": 2.6978649989217165, + "grad_norm": 0.18322397768497467, + "learning_rate": 3.783379467511108e-06, + "loss": 2.0021, + "step": 6254 + }, + { + "epoch": 2.6982963122708647, + "grad_norm": 0.19997923076152802, + "learning_rate": 3.7726842873524955e-06, + "loss": 2.0924, + "step": 6255 + }, + { + "epoch": 2.698727625620013, + "grad_norm": 0.20193538069725037, + "learning_rate": 3.7620038555885467e-06, + "loss": 2.2167, + "step": 6256 + }, + { + "epoch": 2.6991589389691613, + "grad_norm": 0.20680344104766846, + "learning_rate": 3.751338174430718e-06, + "loss": 2.0264, + "step": 6257 + }, + { + "epoch": 2.699590252318309, + "grad_norm": 0.196910560131073, + "learning_rate": 3.7406872460875008e-06, + "loss": 2.0407, + "step": 6258 + }, + { + "epoch": 2.7000215656674573, + "grad_norm": 0.2135513722896576, + "learning_rate": 3.7300510727642647e-06, + "loss": 2.181, + "step": 6259 + }, + { + "epoch": 2.7004528790166056, + "grad_norm": 0.2150072604417801, + "learning_rate": 3.7194296566633474e-06, + "loss": 2.1288, + "step": 6260 + }, + { + "epoch": 2.700884192365754, + "grad_norm": 0.18341900408267975, + "learning_rate": 3.7088229999840317e-06, + "loss": 2.1744, + "step": 6261 + }, + { + "epoch": 2.701315505714902, + "grad_norm": 0.19466567039489746, + "learning_rate": 3.6982311049225444e-06, + "loss": 2.2336, + "step": 6262 + }, + { + "epoch": 2.70174681906405, + "grad_norm": 0.19956213235855103, + "learning_rate": 3.6876539736720475e-06, + "loss": 2.2707, + "step": 6263 + }, + { + "epoch": 2.702178132413198, + "grad_norm": 0.20101287961006165, + "learning_rate": 3.6770916084226554e-06, + "loss": 2.1547, + "step": 6264 + }, + { + "epoch": 2.7026094457623464, + "grad_norm": 0.22500237822532654, + "learning_rate": 3.666544011361411e-06, + "loss": 2.0867, + "step": 6265 + }, + { + "epoch": 2.7030407591114947, + "grad_norm": 0.20740890502929688, + "learning_rate": 3.656011184672325e-06, + "loss": 2.1929, + "step": 6266 + }, + { + "epoch": 2.7034720724606425, + "grad_norm": 0.1789781153202057, + "learning_rate": 3.6454931305363285e-06, + "loss": 1.876, + "step": 6267 + }, + { + "epoch": 2.7039033858097907, + "grad_norm": 0.2012297362089157, + "learning_rate": 3.6349898511312953e-06, + "loss": 2.1847, + "step": 6268 + }, + { + "epoch": 2.704334699158939, + "grad_norm": 0.19550712406635284, + "learning_rate": 3.624501348632061e-06, + "loss": 2.1368, + "step": 6269 + }, + { + "epoch": 2.7047660125080872, + "grad_norm": 0.1741098314523697, + "learning_rate": 3.6140276252103557e-06, + "loss": 1.8827, + "step": 6270 + }, + { + "epoch": 2.7051973258572355, + "grad_norm": 0.20835484564304352, + "learning_rate": 3.6035686830348936e-06, + "loss": 2.0917, + "step": 6271 + }, + { + "epoch": 2.7056286392063833, + "grad_norm": 0.19523583352565765, + "learning_rate": 3.593124524271318e-06, + "loss": 2.2537, + "step": 6272 + }, + { + "epoch": 2.7060599525555316, + "grad_norm": 0.19004569947719574, + "learning_rate": 3.582695151082207e-06, + "loss": 2.0502, + "step": 6273 + }, + { + "epoch": 2.70649126590468, + "grad_norm": 0.20238938927650452, + "learning_rate": 3.5722805656270837e-06, + "loss": 2.1448, + "step": 6274 + }, + { + "epoch": 2.706922579253828, + "grad_norm": 0.18806307017803192, + "learning_rate": 3.561880770062364e-06, + "loss": 2.2018, + "step": 6275 + }, + { + "epoch": 2.706922579253828, + "eval_loss": 2.0872480869293213, + "eval_runtime": 197.1462, + "eval_samples_per_second": 0.162, + "eval_steps_per_second": 0.162, + "step": 6275 + }, + { + "epoch": 2.707353892602976, + "grad_norm": 0.20623880624771118, + "learning_rate": 3.5514957665414763e-06, + "loss": 2.2654, + "step": 6276 + }, + { + "epoch": 2.707785205952124, + "grad_norm": 0.20118732750415802, + "learning_rate": 3.541125557214741e-06, + "loss": 2.0149, + "step": 6277 + }, + { + "epoch": 2.7082165193012724, + "grad_norm": 0.22785833477973938, + "learning_rate": 3.5307701442294166e-06, + "loss": 2.0617, + "step": 6278 + }, + { + "epoch": 2.7086478326504206, + "grad_norm": 0.20555000007152557, + "learning_rate": 3.520429529729704e-06, + "loss": 2.3689, + "step": 6279 + }, + { + "epoch": 2.709079145999569, + "grad_norm": 0.2299307882785797, + "learning_rate": 3.510103715856749e-06, + "loss": 2.2022, + "step": 6280 + }, + { + "epoch": 2.7095104593487167, + "grad_norm": 0.1937788426876068, + "learning_rate": 3.499792704748625e-06, + "loss": 2.2859, + "step": 6281 + }, + { + "epoch": 2.709941772697865, + "grad_norm": 0.21659395098686218, + "learning_rate": 3.4894964985403147e-06, + "loss": 2.3204, + "step": 6282 + }, + { + "epoch": 2.710373086047013, + "grad_norm": 0.1889360249042511, + "learning_rate": 3.479215099363805e-06, + "loss": 2.3064, + "step": 6283 + }, + { + "epoch": 2.7108043993961615, + "grad_norm": 0.18120795488357544, + "learning_rate": 3.4689485093479353e-06, + "loss": 1.8809, + "step": 6284 + }, + { + "epoch": 2.7112357127453093, + "grad_norm": 0.19754794239997864, + "learning_rate": 3.458696730618521e-06, + "loss": 2.2219, + "step": 6285 + }, + { + "epoch": 2.7116670260944575, + "grad_norm": 0.20219269394874573, + "learning_rate": 3.4484597652983226e-06, + "loss": 2.2278, + "step": 6286 + }, + { + "epoch": 2.712098339443606, + "grad_norm": 0.2099461704492569, + "learning_rate": 3.4382376155069947e-06, + "loss": 2.0175, + "step": 6287 + }, + { + "epoch": 2.712529652792754, + "grad_norm": 0.19380366802215576, + "learning_rate": 3.428030283361169e-06, + "loss": 2.0483, + "step": 6288 + }, + { + "epoch": 2.7129609661419023, + "grad_norm": 0.21564538776874542, + "learning_rate": 3.417837770974355e-06, + "loss": 2.0766, + "step": 6289 + }, + { + "epoch": 2.71339227949105, + "grad_norm": 0.17680005729198456, + "learning_rate": 3.4076600804570476e-06, + "loss": 1.7898, + "step": 6290 + }, + { + "epoch": 2.7138235928401984, + "grad_norm": 0.21097064018249512, + "learning_rate": 3.3974972139166447e-06, + "loss": 2.2372, + "step": 6291 + }, + { + "epoch": 2.7142549061893466, + "grad_norm": 0.18284453451633453, + "learning_rate": 3.3873491734574708e-06, + "loss": 1.9787, + "step": 6292 + }, + { + "epoch": 2.714686219538495, + "grad_norm": 0.18313708901405334, + "learning_rate": 3.3772159611807876e-06, + "loss": 2.2077, + "step": 6293 + }, + { + "epoch": 2.7151175328876427, + "grad_norm": 0.20901811122894287, + "learning_rate": 3.3670975791847994e-06, + "loss": 2.1341, + "step": 6294 + }, + { + "epoch": 2.715548846236791, + "grad_norm": 0.18402762711048126, + "learning_rate": 3.356994029564622e-06, + "loss": 2.1648, + "step": 6295 + }, + { + "epoch": 2.715980159585939, + "grad_norm": 0.20775403082370758, + "learning_rate": 3.346905314412307e-06, + "loss": 2.2146, + "step": 6296 + }, + { + "epoch": 2.7164114729350874, + "grad_norm": 0.22491486370563507, + "learning_rate": 3.3368314358168334e-06, + "loss": 2.0796, + "step": 6297 + }, + { + "epoch": 2.7168427862842357, + "grad_norm": 0.2053094506263733, + "learning_rate": 3.3267723958641e-06, + "loss": 2.0496, + "step": 6298 + }, + { + "epoch": 2.7172740996333835, + "grad_norm": 0.1953420639038086, + "learning_rate": 3.3167281966369393e-06, + "loss": 2.1895, + "step": 6299 + }, + { + "epoch": 2.7177054129825318, + "grad_norm": 0.18459086120128632, + "learning_rate": 3.30669884021513e-06, + "loss": 2.0651, + "step": 6300 + }, + { + "epoch": 2.7177054129825318, + "eval_loss": 2.087197780609131, + "eval_runtime": 196.2873, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 6300 + }, + { + "epoch": 2.71813672633168, + "grad_norm": 0.17420656979084015, + "learning_rate": 3.2966843286753614e-06, + "loss": 1.886, + "step": 6301 + }, + { + "epoch": 2.7185680396808283, + "grad_norm": 0.19562606513500214, + "learning_rate": 3.2866846640912327e-06, + "loss": 2.012, + "step": 6302 + }, + { + "epoch": 2.718999353029976, + "grad_norm": 0.19525456428527832, + "learning_rate": 3.2766998485332705e-06, + "loss": 2.222, + "step": 6303 + }, + { + "epoch": 2.7194306663791243, + "grad_norm": 0.19238971173763275, + "learning_rate": 3.2667298840689637e-06, + "loss": 2.0992, + "step": 6304 + }, + { + "epoch": 2.7198619797282726, + "grad_norm": 0.19270627200603485, + "learning_rate": 3.2567747727627018e-06, + "loss": 2.1833, + "step": 6305 + }, + { + "epoch": 2.720293293077421, + "grad_norm": 0.20034782588481903, + "learning_rate": 3.246834516675795e-06, + "loss": 2.2458, + "step": 6306 + }, + { + "epoch": 2.720724606426569, + "grad_norm": 0.3612179160118103, + "learning_rate": 3.236909117866479e-06, + "loss": 2.0146, + "step": 6307 + }, + { + "epoch": 2.721155919775717, + "grad_norm": 0.18545496463775635, + "learning_rate": 3.2269985783899192e-06, + "loss": 2.2781, + "step": 6308 + }, + { + "epoch": 2.721587233124865, + "grad_norm": 0.21780475974082947, + "learning_rate": 3.2171029002981992e-06, + "loss": 2.1374, + "step": 6309 + }, + { + "epoch": 2.7220185464740134, + "grad_norm": 0.19212014973163605, + "learning_rate": 3.2072220856403293e-06, + "loss": 2.3301, + "step": 6310 + }, + { + "epoch": 2.7224498598231617, + "grad_norm": 0.1920885145664215, + "learning_rate": 3.1973561364622396e-06, + "loss": 2.1969, + "step": 6311 + }, + { + "epoch": 2.7228811731723095, + "grad_norm": 0.6626337766647339, + "learning_rate": 3.1875050548067797e-06, + "loss": 2.0958, + "step": 6312 + }, + { + "epoch": 2.7233124865214577, + "grad_norm": 0.19787397980690002, + "learning_rate": 3.1776688427137264e-06, + "loss": 2.2973, + "step": 6313 + }, + { + "epoch": 2.723743799870606, + "grad_norm": 0.20773246884346008, + "learning_rate": 3.167847502219767e-06, + "loss": 2.174, + "step": 6314 + }, + { + "epoch": 2.7241751132197543, + "grad_norm": 0.20719939470291138, + "learning_rate": 3.158041035358541e-06, + "loss": 2.5476, + "step": 6315 + }, + { + "epoch": 2.7246064265689025, + "grad_norm": 0.21184757351875305, + "learning_rate": 3.148249444160558e-06, + "loss": 2.136, + "step": 6316 + }, + { + "epoch": 2.7250377399180503, + "grad_norm": 0.20608209073543549, + "learning_rate": 3.1384727306532797e-06, + "loss": 2.1159, + "step": 6317 + }, + { + "epoch": 2.7254690532671986, + "grad_norm": 0.21290935575962067, + "learning_rate": 3.128710896861103e-06, + "loss": 2.0429, + "step": 6318 + }, + { + "epoch": 2.725900366616347, + "grad_norm": 0.1945108324289322, + "learning_rate": 3.1189639448052943e-06, + "loss": 2.0928, + "step": 6319 + }, + { + "epoch": 2.726331679965495, + "grad_norm": 0.2086068093776703, + "learning_rate": 3.109231876504098e-06, + "loss": 2.3127, + "step": 6320 + }, + { + "epoch": 2.726762993314643, + "grad_norm": 0.19717541337013245, + "learning_rate": 3.0995146939726017e-06, + "loss": 2.205, + "step": 6321 + }, + { + "epoch": 2.727194306663791, + "grad_norm": 0.19919461011886597, + "learning_rate": 3.089812399222888e-06, + "loss": 2.184, + "step": 6322 + }, + { + "epoch": 2.7276256200129394, + "grad_norm": 0.19284598529338837, + "learning_rate": 3.080124994263916e-06, + "loss": 2.2423, + "step": 6323 + }, + { + "epoch": 2.7280569333620877, + "grad_norm": 0.2111853063106537, + "learning_rate": 3.070452481101565e-06, + "loss": 1.9166, + "step": 6324 + }, + { + "epoch": 2.728488246711236, + "grad_norm": 0.19298049807548523, + "learning_rate": 3.060794861738633e-06, + "loss": 2.0194, + "step": 6325 + }, + { + "epoch": 2.728488246711236, + "eval_loss": 2.087181806564331, + "eval_runtime": 196.6185, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 6325 + }, + { + "epoch": 2.7289195600603837, + "grad_norm": 0.1980437934398651, + "learning_rate": 3.0511521381748364e-06, + "loss": 2.0237, + "step": 6326 + }, + { + "epoch": 2.729350873409532, + "grad_norm": 0.18850843608379364, + "learning_rate": 3.0415243124068207e-06, + "loss": 2.2705, + "step": 6327 + }, + { + "epoch": 2.7297821867586802, + "grad_norm": 0.2052440196275711, + "learning_rate": 3.0319113864280987e-06, + "loss": 2.2331, + "step": 6328 + }, + { + "epoch": 2.7302135001078285, + "grad_norm": 0.21586772799491882, + "learning_rate": 3.0223133622291786e-06, + "loss": 2.1638, + "step": 6329 + }, + { + "epoch": 2.7306448134569763, + "grad_norm": 0.22283998131752014, + "learning_rate": 3.0127302417974035e-06, + "loss": 2.0749, + "step": 6330 + }, + { + "epoch": 2.7310761268061245, + "grad_norm": 0.20513780415058136, + "learning_rate": 3.003162027117062e-06, + "loss": 2.3214, + "step": 6331 + }, + { + "epoch": 2.731507440155273, + "grad_norm": 0.21246421337127686, + "learning_rate": 2.9936087201693688e-06, + "loss": 2.2214, + "step": 6332 + }, + { + "epoch": 2.731938753504421, + "grad_norm": 0.1875782608985901, + "learning_rate": 2.98407032293245e-06, + "loss": 2.1809, + "step": 6333 + }, + { + "epoch": 2.7323700668535693, + "grad_norm": 0.20177769660949707, + "learning_rate": 2.9745468373813254e-06, + "loss": 1.87, + "step": 6334 + }, + { + "epoch": 2.732801380202717, + "grad_norm": 0.19652503728866577, + "learning_rate": 2.965038265487918e-06, + "loss": 2.1856, + "step": 6335 + }, + { + "epoch": 2.7332326935518654, + "grad_norm": 0.20007416605949402, + "learning_rate": 2.955544609221111e-06, + "loss": 2.1514, + "step": 6336 + }, + { + "epoch": 2.7336640069010136, + "grad_norm": 0.21538324654102325, + "learning_rate": 2.946065870546649e-06, + "loss": 2.1655, + "step": 6337 + }, + { + "epoch": 2.734095320250162, + "grad_norm": 0.27241501212120056, + "learning_rate": 2.936602051427228e-06, + "loss": 2.0861, + "step": 6338 + }, + { + "epoch": 2.7345266335993097, + "grad_norm": 0.18923966586589813, + "learning_rate": 2.9271531538224147e-06, + "loss": 2.1178, + "step": 6339 + }, + { + "epoch": 2.734957946948458, + "grad_norm": 0.19602225720882416, + "learning_rate": 2.9177191796887266e-06, + "loss": 2.1503, + "step": 6340 + }, + { + "epoch": 2.735389260297606, + "grad_norm": 0.2139609009027481, + "learning_rate": 2.9083001309795517e-06, + "loss": 2.3083, + "step": 6341 + }, + { + "epoch": 2.7358205736467545, + "grad_norm": 0.18180742859840393, + "learning_rate": 2.898896009645221e-06, + "loss": 1.9963, + "step": 6342 + }, + { + "epoch": 2.7362518869959027, + "grad_norm": 0.20043374598026276, + "learning_rate": 2.8895068176329527e-06, + "loss": 2.2321, + "step": 6343 + }, + { + "epoch": 2.7366832003450505, + "grad_norm": 0.2054060846567154, + "learning_rate": 2.8801325568868826e-06, + "loss": 2.1354, + "step": 6344 + }, + { + "epoch": 2.737114513694199, + "grad_norm": 0.20694616436958313, + "learning_rate": 2.8707732293480584e-06, + "loss": 1.9688, + "step": 6345 + }, + { + "epoch": 2.737545827043347, + "grad_norm": 0.2166181057691574, + "learning_rate": 2.8614288369544298e-06, + "loss": 2.1904, + "step": 6346 + }, + { + "epoch": 2.7379771403924953, + "grad_norm": 0.1976022571325302, + "learning_rate": 2.8520993816408655e-06, + "loss": 2.1796, + "step": 6347 + }, + { + "epoch": 2.738408453741643, + "grad_norm": 0.2170005589723587, + "learning_rate": 2.842784865339104e-06, + "loss": 2.1227, + "step": 6348 + }, + { + "epoch": 2.7388397670907914, + "grad_norm": 0.23662927746772766, + "learning_rate": 2.8334852899778355e-06, + "loss": 2.0782, + "step": 6349 + }, + { + "epoch": 2.7392710804399396, + "grad_norm": 0.17678862810134888, + "learning_rate": 2.8242006574826366e-06, + "loss": 2.0842, + "step": 6350 + }, + { + "epoch": 2.7392710804399396, + "eval_loss": 2.087163209915161, + "eval_runtime": 196.5639, + "eval_samples_per_second": 0.163, + "eval_steps_per_second": 0.163, + "step": 6350 + }, + { + "epoch": 2.739702393789088, + "grad_norm": 0.20020273327827454, + "learning_rate": 2.8149309697760024e-06, + "loss": 2.1296, + "step": 6351 + }, + { + "epoch": 2.740133707138236, + "grad_norm": 0.20694181323051453, + "learning_rate": 2.8056762287773067e-06, + "loss": 2.0802, + "step": 6352 + }, + { + "epoch": 2.740565020487384, + "grad_norm": 0.19143207371234894, + "learning_rate": 2.7964364364028574e-06, + "loss": 2.0352, + "step": 6353 + }, + { + "epoch": 2.740996333836532, + "grad_norm": 0.20685730874538422, + "learning_rate": 2.7872115945658414e-06, + "loss": 2.1375, + "step": 6354 + }, + { + "epoch": 2.7414276471856804, + "grad_norm": 0.2112044095993042, + "learning_rate": 2.7780017051763803e-06, + "loss": 2.1864, + "step": 6355 + }, + { + "epoch": 2.7418589605348287, + "grad_norm": 0.18936973810195923, + "learning_rate": 2.768806770141466e-06, + "loss": 2.1065, + "step": 6356 + }, + { + "epoch": 2.7422902738839765, + "grad_norm": 0.19992630183696747, + "learning_rate": 2.7596267913650163e-06, + "loss": 2.1422, + "step": 6357 + }, + { + "epoch": 2.7427215872331248, + "grad_norm": 0.19272871315479279, + "learning_rate": 2.7504617707478443e-06, + "loss": 2.2792, + "step": 6358 + }, + { + "epoch": 2.743152900582273, + "grad_norm": 0.19747158885002136, + "learning_rate": 2.741311710187674e-06, + "loss": 2.2129, + "step": 6359 + }, + { + "epoch": 2.7435842139314213, + "grad_norm": 0.1911972463130951, + "learning_rate": 2.732176611579115e-06, + "loss": 2.3908, + "step": 6360 + }, + { + "epoch": 2.7440155272805695, + "grad_norm": 0.20867711305618286, + "learning_rate": 2.723056476813712e-06, + "loss": 1.9727, + "step": 6361 + }, + { + "epoch": 2.7444468406297173, + "grad_norm": 0.22647875547409058, + "learning_rate": 2.713951307779863e-06, + "loss": 1.9784, + "step": 6362 + }, + { + "epoch": 2.7448781539788656, + "grad_norm": 0.18755938112735748, + "learning_rate": 2.704861106362902e-06, + "loss": 2.0905, + "step": 6363 + }, + { + "epoch": 2.745309467328014, + "grad_norm": 0.18958084285259247, + "learning_rate": 2.6957858744450556e-06, + "loss": 2.0371, + "step": 6364 + }, + { + "epoch": 2.745740780677162, + "grad_norm": 0.19676895439624786, + "learning_rate": 2.6867256139054466e-06, + "loss": 2.1324, + "step": 6365 + }, + { + "epoch": 2.74617209402631, + "grad_norm": 0.2035304605960846, + "learning_rate": 2.6776803266201156e-06, + "loss": 2.1633, + "step": 6366 + }, + { + "epoch": 2.746603407375458, + "grad_norm": 0.18587583303451538, + "learning_rate": 2.668650014461965e-06, + "loss": 2.2838, + "step": 6367 + }, + { + "epoch": 2.7470347207246064, + "grad_norm": 0.20424732565879822, + "learning_rate": 2.659634679300832e-06, + "loss": 2.1103, + "step": 6368 + }, + { + "epoch": 2.7474660340737547, + "grad_norm": 0.19742344319820404, + "learning_rate": 2.6506343230034396e-06, + "loss": 2.2929, + "step": 6369 + }, + { + "epoch": 2.747897347422903, + "grad_norm": 0.19142282009124756, + "learning_rate": 2.641648947433414e-06, + "loss": 1.8528, + "step": 6370 + }, + { + "epoch": 2.7483286607720507, + "grad_norm": 0.1873573362827301, + "learning_rate": 2.6326785544512675e-06, + "loss": 2.0168, + "step": 6371 + }, + { + "epoch": 2.748759974121199, + "grad_norm": 1.0676277875900269, + "learning_rate": 2.6237231459144305e-06, + "loss": 2.025, + "step": 6372 + }, + { + "epoch": 2.7491912874703472, + "grad_norm": 0.19708436727523804, + "learning_rate": 2.614782723677203e-06, + "loss": 2.3798, + "step": 6373 + }, + { + "epoch": 2.7496226008194955, + "grad_norm": 0.1987622082233429, + "learning_rate": 2.605857289590796e-06, + "loss": 2.1454, + "step": 6374 + }, + { + "epoch": 2.7500539141686433, + "grad_norm": 0.22205474972724915, + "learning_rate": 2.5969468455033392e-06, + "loss": 2.1309, + "step": 6375 + }, + { + "epoch": 2.7500539141686433, + "eval_loss": 2.0871455669403076, + "eval_runtime": 200.0309, + "eval_samples_per_second": 0.16, + "eval_steps_per_second": 0.16, + "step": 6375 + }, + { + "epoch": 2.7504852275177916, + "grad_norm": 0.20050524175167084, + "learning_rate": 2.588051393259824e-06, + "loss": 2.1983, + "step": 6376 + }, + { + "epoch": 2.75091654086694, + "grad_norm": 0.2021673619747162, + "learning_rate": 2.579170934702135e-06, + "loss": 2.3204, + "step": 6377 + }, + { + "epoch": 2.751347854216088, + "grad_norm": 0.1954621821641922, + "learning_rate": 2.570305471669101e-06, + "loss": 2.2105, + "step": 6378 + }, + { + "epoch": 2.7517791675652363, + "grad_norm": 0.19034738838672638, + "learning_rate": 2.561455005996388e-06, + "loss": 2.1093, + "step": 6379 + }, + { + "epoch": 2.752210480914384, + "grad_norm": 0.19725388288497925, + "learning_rate": 2.552619539516604e-06, + "loss": 2.0607, + "step": 6380 + }, + { + "epoch": 2.7526417942635324, + "grad_norm": 0.19629184901714325, + "learning_rate": 2.5437990740592024e-06, + "loss": 2.0065, + "step": 6381 + }, + { + "epoch": 2.7530731076126806, + "grad_norm": 0.18590472638607025, + "learning_rate": 2.5349936114505725e-06, + "loss": 1.932, + "step": 6382 + }, + { + "epoch": 2.753504420961829, + "grad_norm": 0.20266211032867432, + "learning_rate": 2.526203153513981e-06, + "loss": 2.1198, + "step": 6383 + }, + { + "epoch": 2.7539357343109767, + "grad_norm": 0.2148665189743042, + "learning_rate": 2.517427702069588e-06, + "loss": 2.2842, + "step": 6384 + }, + { + "epoch": 2.754367047660125, + "grad_norm": 0.25880327820777893, + "learning_rate": 2.508667258934449e-06, + "loss": 2.1491, + "step": 6385 + }, + { + "epoch": 2.7547983610092732, + "grad_norm": 0.19644123315811157, + "learning_rate": 2.4999218259224962e-06, + "loss": 2.189, + "step": 6386 + }, + { + "epoch": 2.7552296743584215, + "grad_norm": 0.2401173710823059, + "learning_rate": 2.491191404844581e-06, + "loss": 1.8995, + "step": 6387 + }, + { + "epoch": 2.7556609877075697, + "grad_norm": 0.19858931005001068, + "learning_rate": 2.482475997508432e-06, + "loss": 2.1006, + "step": 6388 + }, + { + "epoch": 2.7560923010567175, + "grad_norm": 0.1920706033706665, + "learning_rate": 2.473775605718664e-06, + "loss": 2.0843, + "step": 6389 + }, + { + "epoch": 2.756523614405866, + "grad_norm": 0.21617066860198975, + "learning_rate": 2.465090231276787e-06, + "loss": 2.2643, + "step": 6390 + }, + { + "epoch": 2.756954927755014, + "grad_norm": 0.192400261759758, + "learning_rate": 2.4564198759812026e-06, + "loss": 2.0117, + "step": 6391 + }, + { + "epoch": 2.7573862411041623, + "grad_norm": 0.19906947016716003, + "learning_rate": 2.447764541627209e-06, + "loss": 1.9897, + "step": 6392 + }, + { + "epoch": 2.75781755445331, + "grad_norm": 0.21517327427864075, + "learning_rate": 2.4391242300069974e-06, + "loss": 2.1997, + "step": 6393 + }, + { + "epoch": 2.7582488678024584, + "grad_norm": 0.20024853944778442, + "learning_rate": 2.43049894290962e-06, + "loss": 2.059, + "step": 6394 + }, + { + "epoch": 2.7586801811516066, + "grad_norm": 0.20617882907390594, + "learning_rate": 2.4218886821210316e-06, + "loss": 2.0869, + "step": 6395 + }, + { + "epoch": 2.759111494500755, + "grad_norm": 0.1851520985364914, + "learning_rate": 2.4132934494240972e-06, + "loss": 1.9781, + "step": 6396 + }, + { + "epoch": 2.759542807849903, + "grad_norm": 0.20461012423038483, + "learning_rate": 2.4047132465985525e-06, + "loss": 2.1605, + "step": 6397 + }, + { + "epoch": 2.759974121199051, + "grad_norm": 0.19638298451900482, + "learning_rate": 2.396148075421017e-06, + "loss": 1.9916, + "step": 6398 + }, + { + "epoch": 2.760405434548199, + "grad_norm": 0.21877293288707733, + "learning_rate": 2.3875979376650063e-06, + "loss": 2.2603, + "step": 6399 + }, + { + "epoch": 2.7608367478973475, + "grad_norm": 0.19478262960910797, + "learning_rate": 2.37906283510092e-06, + "loss": 2.1077, + "step": 6400 + }, + { + "epoch": 2.7608367478973475, + "eval_loss": 2.0871729850769043, + "eval_runtime": 194.9904, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6400 + }, + { + "epoch": 2.7612680612464957, + "grad_norm": 0.21150808036327362, + "learning_rate": 2.370542769496045e-06, + "loss": 2.1381, + "step": 6401 + }, + { + "epoch": 2.7616993745956435, + "grad_norm": 0.20274010300636292, + "learning_rate": 2.3620377426145448e-06, + "loss": 2.2028, + "step": 6402 + }, + { + "epoch": 2.7621306879447918, + "grad_norm": 0.17827123403549194, + "learning_rate": 2.353547756217486e-06, + "loss": 2.1247, + "step": 6403 + }, + { + "epoch": 2.76256200129394, + "grad_norm": 0.20930832624435425, + "learning_rate": 2.3450728120628205e-06, + "loss": 1.9401, + "step": 6404 + }, + { + "epoch": 2.7629933146430883, + "grad_norm": 0.2053506076335907, + "learning_rate": 2.336612911905361e-06, + "loss": 2.0497, + "step": 6405 + }, + { + "epoch": 2.7634246279922365, + "grad_norm": 0.2011050581932068, + "learning_rate": 2.328168057496832e-06, + "loss": 2.1845, + "step": 6406 + }, + { + "epoch": 2.763855941341385, + "grad_norm": 0.19295421242713928, + "learning_rate": 2.3197382505858423e-06, + "loss": 2.0928, + "step": 6407 + }, + { + "epoch": 2.7642872546905326, + "grad_norm": 0.19697582721710205, + "learning_rate": 2.311323492917855e-06, + "loss": 2.2089, + "step": 6408 + }, + { + "epoch": 2.764718568039681, + "grad_norm": 0.20078019797801971, + "learning_rate": 2.302923786235242e-06, + "loss": 2.2633, + "step": 6409 + }, + { + "epoch": 2.765149881388829, + "grad_norm": 0.2065783143043518, + "learning_rate": 2.2945391322772786e-06, + "loss": 2.2333, + "step": 6410 + }, + { + "epoch": 2.765581194737977, + "grad_norm": 0.20100045204162598, + "learning_rate": 2.286169532780069e-06, + "loss": 1.8474, + "step": 6411 + }, + { + "epoch": 2.766012508087125, + "grad_norm": 0.1967354416847229, + "learning_rate": 2.277814989476659e-06, + "loss": 2.1904, + "step": 6412 + }, + { + "epoch": 2.7664438214362734, + "grad_norm": 0.20408812165260315, + "learning_rate": 2.269475504096907e-06, + "loss": 1.9991, + "step": 6413 + }, + { + "epoch": 2.7668751347854217, + "grad_norm": 0.1991935819387436, + "learning_rate": 2.26115107836764e-06, + "loss": 2.0678, + "step": 6414 + }, + { + "epoch": 2.76730644813457, + "grad_norm": 0.20208342373371124, + "learning_rate": 2.2528417140124876e-06, + "loss": 2.2605, + "step": 6415 + }, + { + "epoch": 2.767737761483718, + "grad_norm": 0.19435718655586243, + "learning_rate": 2.2445474127520152e-06, + "loss": 2.1604, + "step": 6416 + }, + { + "epoch": 2.768169074832866, + "grad_norm": 0.2047269344329834, + "learning_rate": 2.2362681763036404e-06, + "loss": 2.1334, + "step": 6417 + }, + { + "epoch": 2.7686003881820143, + "grad_norm": 0.19622421264648438, + "learning_rate": 2.228004006381667e-06, + "loss": 2.2027, + "step": 6418 + }, + { + "epoch": 2.7690317015311625, + "grad_norm": 0.1936480700969696, + "learning_rate": 2.2197549046972934e-06, + "loss": 2.1398, + "step": 6419 + }, + { + "epoch": 2.7694630148803103, + "grad_norm": 0.21298177540302277, + "learning_rate": 2.2115208729585692e-06, + "loss": 1.9736, + "step": 6420 + }, + { + "epoch": 2.7698943282294586, + "grad_norm": 0.17468075454235077, + "learning_rate": 2.203301912870456e-06, + "loss": 1.9918, + "step": 6421 + }, + { + "epoch": 2.770325641578607, + "grad_norm": 0.19439345598220825, + "learning_rate": 2.1950980261347674e-06, + "loss": 2.0712, + "step": 6422 + }, + { + "epoch": 2.770756954927755, + "grad_norm": 0.19806329905986786, + "learning_rate": 2.186909214450211e-06, + "loss": 2.0701, + "step": 6423 + }, + { + "epoch": 2.7711882682769033, + "grad_norm": 0.1960648149251938, + "learning_rate": 2.1787354795123724e-06, + "loss": 2.0607, + "step": 6424 + }, + { + "epoch": 2.7716195816260516, + "grad_norm": 0.20692449808120728, + "learning_rate": 2.170576823013731e-06, + "loss": 2.1282, + "step": 6425 + }, + { + "epoch": 2.7716195816260516, + "eval_loss": 2.0871548652648926, + "eval_runtime": 194.7721, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6425 + }, + { + "epoch": 2.7720508949751994, + "grad_norm": 0.20055831968784332, + "learning_rate": 2.1624332466435938e-06, + "loss": 2.1578, + "step": 6426 + }, + { + "epoch": 2.7724822083243477, + "grad_norm": 0.1872549206018448, + "learning_rate": 2.154304752088179e-06, + "loss": 2.2042, + "step": 6427 + }, + { + "epoch": 2.772913521673496, + "grad_norm": 0.19771480560302734, + "learning_rate": 2.1461913410306064e-06, + "loss": 2.1327, + "step": 6428 + }, + { + "epoch": 2.7733448350226437, + "grad_norm": 0.23074021935462952, + "learning_rate": 2.1380930151508325e-06, + "loss": 2.1386, + "step": 6429 + }, + { + "epoch": 2.773776148371792, + "grad_norm": 0.20994645357131958, + "learning_rate": 2.130009776125699e-06, + "loss": 2.1653, + "step": 6430 + }, + { + "epoch": 2.7742074617209402, + "grad_norm": 0.19817151129245758, + "learning_rate": 2.1219416256289424e-06, + "loss": 2.185, + "step": 6431 + }, + { + "epoch": 2.7746387750700885, + "grad_norm": 0.20503103733062744, + "learning_rate": 2.1138885653311426e-06, + "loss": 2.1917, + "step": 6432 + }, + { + "epoch": 2.7750700884192367, + "grad_norm": 0.18863680958747864, + "learning_rate": 2.105850596899791e-06, + "loss": 2.1454, + "step": 6433 + }, + { + "epoch": 2.775501401768385, + "grad_norm": 0.1986525058746338, + "learning_rate": 2.0978277219992318e-06, + "loss": 2.2787, + "step": 6434 + }, + { + "epoch": 2.775932715117533, + "grad_norm": 0.20332397520542145, + "learning_rate": 2.089819942290685e-06, + "loss": 1.9675, + "step": 6435 + }, + { + "epoch": 2.776364028466681, + "grad_norm": 0.18802419304847717, + "learning_rate": 2.0818272594322507e-06, + "loss": 2.1166, + "step": 6436 + }, + { + "epoch": 2.7767953418158293, + "grad_norm": 0.2151930332183838, + "learning_rate": 2.073849675078895e-06, + "loss": 2.1658, + "step": 6437 + }, + { + "epoch": 2.777226655164977, + "grad_norm": 0.18824675679206848, + "learning_rate": 2.0658871908824816e-06, + "loss": 2.1774, + "step": 6438 + }, + { + "epoch": 2.7776579685141254, + "grad_norm": 0.20375822484493256, + "learning_rate": 2.0579398084917236e-06, + "loss": 2.1393, + "step": 6439 + }, + { + "epoch": 2.7780892818632736, + "grad_norm": 0.21004673838615417, + "learning_rate": 2.0500075295521973e-06, + "loss": 1.9528, + "step": 6440 + }, + { + "epoch": 2.778520595212422, + "grad_norm": 0.24513062834739685, + "learning_rate": 2.04209035570638e-06, + "loss": 1.7822, + "step": 6441 + }, + { + "epoch": 2.77895190856157, + "grad_norm": 0.19536660611629486, + "learning_rate": 2.0341882885936107e-06, + "loss": 2.0755, + "step": 6442 + }, + { + "epoch": 2.7793832219107184, + "grad_norm": 0.25875434279441833, + "learning_rate": 2.0263013298500967e-06, + "loss": 2.2622, + "step": 6443 + }, + { + "epoch": 2.779814535259866, + "grad_norm": 0.24299484491348267, + "learning_rate": 2.018429481108924e-06, + "loss": 2.2031, + "step": 6444 + }, + { + "epoch": 2.7802458486090145, + "grad_norm": 0.1964147537946701, + "learning_rate": 2.0105727440000386e-06, + "loss": 2.1252, + "step": 6445 + }, + { + "epoch": 2.7806771619581627, + "grad_norm": 0.19906258583068848, + "learning_rate": 2.0027311201502638e-06, + "loss": 2.1964, + "step": 6446 + }, + { + "epoch": 2.7811084753073105, + "grad_norm": 0.19478124380111694, + "learning_rate": 1.994904611183293e-06, + "loss": 1.9829, + "step": 6447 + }, + { + "epoch": 2.781539788656459, + "grad_norm": 0.1987592726945877, + "learning_rate": 1.9870932187196957e-06, + "loss": 2.1168, + "step": 6448 + }, + { + "epoch": 2.781971102005607, + "grad_norm": 0.19542710483074188, + "learning_rate": 1.979296944376904e-06, + "loss": 2.3019, + "step": 6449 + }, + { + "epoch": 2.7824024153547553, + "grad_norm": 0.2147682160139084, + "learning_rate": 1.971515789769218e-06, + "loss": 2.2137, + "step": 6450 + }, + { + "epoch": 2.7824024153547553, + "eval_loss": 2.0871737003326416, + "eval_runtime": 194.617, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6450 + }, + { + "epoch": 2.7828337287039036, + "grad_norm": 0.19122406840324402, + "learning_rate": 1.963749756507807e-06, + "loss": 2.143, + "step": 6451 + }, + { + "epoch": 2.783265042053052, + "grad_norm": 0.20361442863941193, + "learning_rate": 1.9559988462007348e-06, + "loss": 2.109, + "step": 6452 + }, + { + "epoch": 2.7836963554021996, + "grad_norm": 0.19597600400447845, + "learning_rate": 1.9482630604528927e-06, + "loss": 2.2253, + "step": 6453 + }, + { + "epoch": 2.784127668751348, + "grad_norm": 0.19568900763988495, + "learning_rate": 1.9405424008660657e-06, + "loss": 2.2234, + "step": 6454 + }, + { + "epoch": 2.784558982100496, + "grad_norm": 0.20625774562358856, + "learning_rate": 1.9328368690388925e-06, + "loss": 2.0696, + "step": 6455 + }, + { + "epoch": 2.784990295449644, + "grad_norm": 0.19813793897628784, + "learning_rate": 1.925146466566896e-06, + "loss": 2.2189, + "step": 6456 + }, + { + "epoch": 2.785421608798792, + "grad_norm": 0.21412412822246552, + "learning_rate": 1.917471195042461e-06, + "loss": 2.1537, + "step": 6457 + }, + { + "epoch": 2.7858529221479404, + "grad_norm": 0.20112594962120056, + "learning_rate": 1.909811056054833e-06, + "loss": 2.3006, + "step": 6458 + }, + { + "epoch": 2.7862842354970887, + "grad_norm": 0.1943495273590088, + "learning_rate": 1.9021660511901182e-06, + "loss": 1.949, + "step": 6459 + }, + { + "epoch": 2.786715548846237, + "grad_norm": 0.21642576158046722, + "learning_rate": 1.8945361820313087e-06, + "loss": 2.3344, + "step": 6460 + }, + { + "epoch": 2.787146862195385, + "grad_norm": 0.20003938674926758, + "learning_rate": 1.8869214501582575e-06, + "loss": 2.3522, + "step": 6461 + }, + { + "epoch": 2.787578175544533, + "grad_norm": 0.19744734466075897, + "learning_rate": 1.8793218571476697e-06, + "loss": 2.2744, + "step": 6462 + }, + { + "epoch": 2.7880094888936813, + "grad_norm": 0.18666155636310577, + "learning_rate": 1.8717374045731204e-06, + "loss": 1.9739, + "step": 6463 + }, + { + "epoch": 2.7884408022428295, + "grad_norm": 0.19375626742839813, + "learning_rate": 1.8641680940050612e-06, + "loss": 2.0815, + "step": 6464 + }, + { + "epoch": 2.7888721155919773, + "grad_norm": 0.19252249598503113, + "learning_rate": 1.856613927010797e-06, + "loss": 1.968, + "step": 6465 + }, + { + "epoch": 2.7893034289411256, + "grad_norm": 0.21271540224552155, + "learning_rate": 1.8490749051545017e-06, + "loss": 2.2135, + "step": 6466 + }, + { + "epoch": 2.789734742290274, + "grad_norm": 0.23579350113868713, + "learning_rate": 1.841551029997218e-06, + "loss": 1.9886, + "step": 6467 + }, + { + "epoch": 2.790166055639422, + "grad_norm": 0.19472530484199524, + "learning_rate": 1.8340423030968332e-06, + "loss": 2.2144, + "step": 6468 + }, + { + "epoch": 2.7905973689885704, + "grad_norm": 0.21968962252140045, + "learning_rate": 1.8265487260081202e-06, + "loss": 2.2461, + "step": 6469 + }, + { + "epoch": 2.7910286823377186, + "grad_norm": 0.19856375455856323, + "learning_rate": 1.8190703002827046e-06, + "loss": 2.1164, + "step": 6470 + }, + { + "epoch": 2.7914599956868664, + "grad_norm": 0.19614583253860474, + "learning_rate": 1.811607027469089e-06, + "loss": 2.0143, + "step": 6471 + }, + { + "epoch": 2.7918913090360147, + "grad_norm": 0.4438173174858093, + "learning_rate": 1.8041589091126047e-06, + "loss": 2.1001, + "step": 6472 + }, + { + "epoch": 2.792322622385163, + "grad_norm": 0.21552805602550507, + "learning_rate": 1.796725946755459e-06, + "loss": 2.2342, + "step": 6473 + }, + { + "epoch": 2.7927539357343107, + "grad_norm": 0.18399421870708466, + "learning_rate": 1.7893081419367627e-06, + "loss": 2.0788, + "step": 6474 + }, + { + "epoch": 2.793185249083459, + "grad_norm": 0.18250444531440735, + "learning_rate": 1.781905496192429e-06, + "loss": 1.8374, + "step": 6475 + }, + { + "epoch": 2.793185249083459, + "eval_loss": 2.0871214866638184, + "eval_runtime": 194.9282, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6475 + }, + { + "epoch": 2.7936165624326073, + "grad_norm": 0.1958213448524475, + "learning_rate": 1.7745180110552565e-06, + "loss": 2.3356, + "step": 6476 + }, + { + "epoch": 2.7940478757817555, + "grad_norm": 0.2061747908592224, + "learning_rate": 1.7671456880549217e-06, + "loss": 2.1613, + "step": 6477 + }, + { + "epoch": 2.7944791891309038, + "grad_norm": 0.20832446217536926, + "learning_rate": 1.7597885287179203e-06, + "loss": 2.0902, + "step": 6478 + }, + { + "epoch": 2.794910502480052, + "grad_norm": 0.192495658993721, + "learning_rate": 1.7524465345676503e-06, + "loss": 1.8938, + "step": 6479 + }, + { + "epoch": 2.7953418158292, + "grad_norm": 0.21228113770484924, + "learning_rate": 1.7451197071243538e-06, + "loss": 1.9777, + "step": 6480 + }, + { + "epoch": 2.795773129178348, + "grad_norm": 0.19520573318004608, + "learning_rate": 1.7378080479051176e-06, + "loss": 2.0726, + "step": 6481 + }, + { + "epoch": 2.7962044425274963, + "grad_norm": 0.19283892214298248, + "learning_rate": 1.7305115584239055e-06, + "loss": 2.2036, + "step": 6482 + }, + { + "epoch": 2.796635755876644, + "grad_norm": 0.1813032180070877, + "learning_rate": 1.7232302401915337e-06, + "loss": 2.4088, + "step": 6483 + }, + { + "epoch": 2.7970670692257924, + "grad_norm": 0.2084456980228424, + "learning_rate": 1.7159640947156884e-06, + "loss": 2.1702, + "step": 6484 + }, + { + "epoch": 2.7974983825749407, + "grad_norm": 0.21089473366737366, + "learning_rate": 1.7087131235009072e-06, + "loss": 2.5504, + "step": 6485 + }, + { + "epoch": 2.797929695924089, + "grad_norm": 0.19794204831123352, + "learning_rate": 1.701477328048556e-06, + "loss": 2.179, + "step": 6486 + }, + { + "epoch": 2.798361009273237, + "grad_norm": 0.22298972308635712, + "learning_rate": 1.6942567098569032e-06, + "loss": 2.0364, + "step": 6487 + }, + { + "epoch": 2.7987923226223854, + "grad_norm": 0.21681968867778778, + "learning_rate": 1.6870512704210691e-06, + "loss": 2.2683, + "step": 6488 + }, + { + "epoch": 2.7992236359715332, + "grad_norm": 0.2068440020084381, + "learning_rate": 1.6798610112330024e-06, + "loss": 2.2263, + "step": 6489 + }, + { + "epoch": 2.7996549493206815, + "grad_norm": 0.20238614082336426, + "learning_rate": 1.6726859337815285e-06, + "loss": 2.0219, + "step": 6490 + }, + { + "epoch": 2.8000862626698297, + "grad_norm": 0.20311656594276428, + "learning_rate": 1.6655260395523257e-06, + "loss": 2.1749, + "step": 6491 + }, + { + "epoch": 2.8005175760189776, + "grad_norm": 0.2581019699573517, + "learning_rate": 1.6583813300279247e-06, + "loss": 2.0499, + "step": 6492 + }, + { + "epoch": 2.800948889368126, + "grad_norm": 0.20405562222003937, + "learning_rate": 1.6512518066877256e-06, + "loss": 2.1406, + "step": 6493 + }, + { + "epoch": 2.801380202717274, + "grad_norm": 0.20483388006687164, + "learning_rate": 1.6441374710079642e-06, + "loss": 2.1057, + "step": 6494 + }, + { + "epoch": 2.8018115160664223, + "grad_norm": 0.19205987453460693, + "learning_rate": 1.6370383244617452e-06, + "loss": 2.0372, + "step": 6495 + }, + { + "epoch": 2.8022428294155706, + "grad_norm": 0.20968760550022125, + "learning_rate": 1.6299543685190263e-06, + "loss": 2.0498, + "step": 6496 + }, + { + "epoch": 2.802674142764719, + "grad_norm": 0.22050540149211884, + "learning_rate": 1.6228856046466093e-06, + "loss": 2.1664, + "step": 6497 + }, + { + "epoch": 2.8031054561138666, + "grad_norm": 0.1926727592945099, + "learning_rate": 1.6158320343081732e-06, + "loss": 2.3444, + "step": 6498 + }, + { + "epoch": 2.803536769463015, + "grad_norm": 0.2012040913105011, + "learning_rate": 1.6087936589642414e-06, + "loss": 2.2633, + "step": 6499 + }, + { + "epoch": 2.803968082812163, + "grad_norm": 0.21898652613162994, + "learning_rate": 1.6017704800721648e-06, + "loss": 2.1332, + "step": 6500 + }, + { + "epoch": 2.803968082812163, + "eval_loss": 2.0871434211730957, + "eval_runtime": 194.753, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6500 + }, + { + "epoch": 2.804399396161311, + "grad_norm": 0.1935500055551529, + "learning_rate": 1.5947624990861718e-06, + "loss": 2.0575, + "step": 6501 + }, + { + "epoch": 2.804830709510459, + "grad_norm": 0.1991499662399292, + "learning_rate": 1.5877697174573601e-06, + "loss": 2.0898, + "step": 6502 + }, + { + "epoch": 2.8052620228596075, + "grad_norm": 0.18370495736598969, + "learning_rate": 1.5807921366336463e-06, + "loss": 2.1516, + "step": 6503 + }, + { + "epoch": 2.8056933362087557, + "grad_norm": 0.2187313735485077, + "learning_rate": 1.573829758059833e-06, + "loss": 1.7816, + "step": 6504 + }, + { + "epoch": 2.806124649557904, + "grad_norm": 0.21289658546447754, + "learning_rate": 1.5668825831775172e-06, + "loss": 2.2105, + "step": 6505 + }, + { + "epoch": 2.8065559629070522, + "grad_norm": 0.20149406790733337, + "learning_rate": 1.5599506134252315e-06, + "loss": 2.2682, + "step": 6506 + }, + { + "epoch": 2.8069872762562, + "grad_norm": 0.2140059471130371, + "learning_rate": 1.5530338502382945e-06, + "loss": 2.0684, + "step": 6507 + }, + { + "epoch": 2.8074185896053483, + "grad_norm": 0.21615104377269745, + "learning_rate": 1.5461322950489018e-06, + "loss": 2.1763, + "step": 6508 + }, + { + "epoch": 2.8078499029544965, + "grad_norm": 0.20524483919143677, + "learning_rate": 1.5392459492860942e-06, + "loss": 2.1927, + "step": 6509 + }, + { + "epoch": 2.8082812163036444, + "grad_norm": 0.1816680133342743, + "learning_rate": 1.5323748143757725e-06, + "loss": 2.072, + "step": 6510 + }, + { + "epoch": 2.8087125296527926, + "grad_norm": 0.1891278773546219, + "learning_rate": 1.5255188917406652e-06, + "loss": 2.2138, + "step": 6511 + }, + { + "epoch": 2.809143843001941, + "grad_norm": 0.20423439145088196, + "learning_rate": 1.5186781828003786e-06, + "loss": 2.0443, + "step": 6512 + }, + { + "epoch": 2.809575156351089, + "grad_norm": 0.20651452243328094, + "learning_rate": 1.5118526889713462e-06, + "loss": 2.1039, + "step": 6513 + }, + { + "epoch": 2.8100064697002374, + "grad_norm": 0.8881313800811768, + "learning_rate": 1.5050424116668792e-06, + "loss": 2.1473, + "step": 6514 + }, + { + "epoch": 2.8104377830493856, + "grad_norm": 0.19139255583286285, + "learning_rate": 1.4982473522970912e-06, + "loss": 2.0588, + "step": 6515 + }, + { + "epoch": 2.8108690963985334, + "grad_norm": 0.1904522329568863, + "learning_rate": 1.4914675122689983e-06, + "loss": 2.1976, + "step": 6516 + }, + { + "epoch": 2.8113004097476817, + "grad_norm": 0.22774598002433777, + "learning_rate": 1.4847028929864447e-06, + "loss": 1.7157, + "step": 6517 + }, + { + "epoch": 2.81173172309683, + "grad_norm": 0.201594278216362, + "learning_rate": 1.477953495850101e-06, + "loss": 2.1143, + "step": 6518 + }, + { + "epoch": 2.8121630364459778, + "grad_norm": 0.20529593527317047, + "learning_rate": 1.4712193222574997e-06, + "loss": 2.1312, + "step": 6519 + }, + { + "epoch": 2.812594349795126, + "grad_norm": 0.19255559146404266, + "learning_rate": 1.464500373603042e-06, + "loss": 2.1235, + "step": 6520 + }, + { + "epoch": 2.8130256631442743, + "grad_norm": 0.2061937004327774, + "learning_rate": 1.4577966512779482e-06, + "loss": 2.1598, + "step": 6521 + }, + { + "epoch": 2.8134569764934225, + "grad_norm": 0.2003118246793747, + "learning_rate": 1.4511081566703081e-06, + "loss": 2.2089, + "step": 6522 + }, + { + "epoch": 2.8138882898425708, + "grad_norm": 0.20519734919071198, + "learning_rate": 1.4444348911650388e-06, + "loss": 2.2433, + "step": 6523 + }, + { + "epoch": 2.814319603191719, + "grad_norm": 0.19626106321811676, + "learning_rate": 1.4377768561439102e-06, + "loss": 2.2808, + "step": 6524 + }, + { + "epoch": 2.814750916540867, + "grad_norm": 0.20269042253494263, + "learning_rate": 1.4311340529855525e-06, + "loss": 2.0386, + "step": 6525 + }, + { + "epoch": 2.814750916540867, + "eval_loss": 2.0870800018310547, + "eval_runtime": 194.8169, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6525 + }, + { + "epoch": 2.815182229890015, + "grad_norm": 0.2238195538520813, + "learning_rate": 1.424506483065424e-06, + "loss": 2.2912, + "step": 6526 + }, + { + "epoch": 2.8156135432391634, + "grad_norm": 0.19504454731941223, + "learning_rate": 1.4178941477558353e-06, + "loss": 2.0852, + "step": 6527 + }, + { + "epoch": 2.816044856588311, + "grad_norm": 0.20155642926692963, + "learning_rate": 1.4112970484259411e-06, + "loss": 1.9959, + "step": 6528 + }, + { + "epoch": 2.8164761699374594, + "grad_norm": 0.1982235610485077, + "learning_rate": 1.4047151864417405e-06, + "loss": 2.0988, + "step": 6529 + }, + { + "epoch": 2.8169074832866077, + "grad_norm": 0.20831064879894257, + "learning_rate": 1.3981485631660933e-06, + "loss": 2.1964, + "step": 6530 + }, + { + "epoch": 2.817338796635756, + "grad_norm": 0.2066282033920288, + "learning_rate": 1.3915971799586867e-06, + "loss": 2.1122, + "step": 6531 + }, + { + "epoch": 2.817770109984904, + "grad_norm": 0.19970938563346863, + "learning_rate": 1.3850610381760524e-06, + "loss": 2.1798, + "step": 6532 + }, + { + "epoch": 2.8182014233340524, + "grad_norm": 0.20368598401546478, + "learning_rate": 1.3785401391715579e-06, + "loss": 2.0832, + "step": 6533 + }, + { + "epoch": 2.8186327366832002, + "grad_norm": 0.2219233214855194, + "learning_rate": 1.3720344842954395e-06, + "loss": 2.2272, + "step": 6534 + }, + { + "epoch": 2.8190640500323485, + "grad_norm": 0.189238041639328, + "learning_rate": 1.36554407489477e-06, + "loss": 2.1196, + "step": 6535 + }, + { + "epoch": 2.8194953633814968, + "grad_norm": 0.19806627929210663, + "learning_rate": 1.3590689123134491e-06, + "loss": 2.158, + "step": 6536 + }, + { + "epoch": 2.8199266767306446, + "grad_norm": 0.20515137910842896, + "learning_rate": 1.3526089978922377e-06, + "loss": 2.2419, + "step": 6537 + }, + { + "epoch": 2.820357990079793, + "grad_norm": 0.22096523642539978, + "learning_rate": 1.3461643329687244e-06, + "loss": 2.18, + "step": 6538 + }, + { + "epoch": 2.820789303428941, + "grad_norm": 0.18739445507526398, + "learning_rate": 1.3397349188773498e-06, + "loss": 1.9742, + "step": 6539 + }, + { + "epoch": 2.8212206167780893, + "grad_norm": 0.19757986068725586, + "learning_rate": 1.3333207569493991e-06, + "loss": 2.2201, + "step": 6540 + }, + { + "epoch": 2.8216519301272376, + "grad_norm": 0.2100953906774521, + "learning_rate": 1.3269218485129852e-06, + "loss": 2.1429, + "step": 6541 + }, + { + "epoch": 2.822083243476386, + "grad_norm": 0.19657377898693085, + "learning_rate": 1.320538194893081e-06, + "loss": 2.0707, + "step": 6542 + }, + { + "epoch": 2.8225145568255336, + "grad_norm": 0.19098903238773346, + "learning_rate": 1.3141697974114796e-06, + "loss": 1.9887, + "step": 6543 + }, + { + "epoch": 2.822945870174682, + "grad_norm": 0.210743710398674, + "learning_rate": 1.307816657386851e-06, + "loss": 2.1582, + "step": 6544 + }, + { + "epoch": 2.82337718352383, + "grad_norm": 0.18364079296588898, + "learning_rate": 1.3014787761346596e-06, + "loss": 1.8924, + "step": 6545 + }, + { + "epoch": 2.823808496872978, + "grad_norm": 0.2031092643737793, + "learning_rate": 1.2951561549672473e-06, + "loss": 2.2685, + "step": 6546 + }, + { + "epoch": 2.8242398102221262, + "grad_norm": 0.21850912272930145, + "learning_rate": 1.288848795193767e-06, + "loss": 2.1427, + "step": 6547 + }, + { + "epoch": 2.8246711235712745, + "grad_norm": 0.19463056325912476, + "learning_rate": 1.2825566981202324e-06, + "loss": 2.1355, + "step": 6548 + }, + { + "epoch": 2.8251024369204227, + "grad_norm": 0.21621888875961304, + "learning_rate": 1.2762798650495094e-06, + "loss": 2.0943, + "step": 6549 + }, + { + "epoch": 2.825533750269571, + "grad_norm": 0.17011678218841553, + "learning_rate": 1.2700182972812585e-06, + "loss": 1.9588, + "step": 6550 + }, + { + "epoch": 2.825533750269571, + "eval_loss": 2.0870361328125, + "eval_runtime": 195.0598, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6550 + }, + { + "epoch": 2.8259650636187192, + "grad_norm": 0.1891787052154541, + "learning_rate": 1.2637719961120175e-06, + "loss": 2.1756, + "step": 6551 + }, + { + "epoch": 2.826396376967867, + "grad_norm": 0.17923136055469513, + "learning_rate": 1.2575409628351518e-06, + "loss": 2.2323, + "step": 6552 + }, + { + "epoch": 2.8268276903170153, + "grad_norm": 0.20607276260852814, + "learning_rate": 1.251325198740863e-06, + "loss": 2.1205, + "step": 6553 + }, + { + "epoch": 2.8272590036661636, + "grad_norm": 0.19592034816741943, + "learning_rate": 1.2451247051161878e-06, + "loss": 2.0756, + "step": 6554 + }, + { + "epoch": 2.8276903170153114, + "grad_norm": 0.20986421406269073, + "learning_rate": 1.2389394832450245e-06, + "loss": 2.0254, + "step": 6555 + }, + { + "epoch": 2.8281216303644596, + "grad_norm": 0.18774859607219696, + "learning_rate": 1.2327695344080735e-06, + "loss": 2.0611, + "step": 6556 + }, + { + "epoch": 2.828552943713608, + "grad_norm": 0.19214344024658203, + "learning_rate": 1.226614859882888e-06, + "loss": 2.1418, + "step": 6557 + }, + { + "epoch": 2.828984257062756, + "grad_norm": 0.18636095523834229, + "learning_rate": 1.2204754609438733e-06, + "loss": 2.0401, + "step": 6558 + }, + { + "epoch": 2.8294155704119044, + "grad_norm": 0.20176027715206146, + "learning_rate": 1.2143513388622544e-06, + "loss": 2.0793, + "step": 6559 + }, + { + "epoch": 2.8298468837610526, + "grad_norm": 0.18498626351356506, + "learning_rate": 1.2082424949061003e-06, + "loss": 2.0077, + "step": 6560 + }, + { + "epoch": 2.8302781971102005, + "grad_norm": 0.1844421774148941, + "learning_rate": 1.2021489303402992e-06, + "loss": 1.9651, + "step": 6561 + }, + { + "epoch": 2.8307095104593487, + "grad_norm": 0.20158296823501587, + "learning_rate": 1.1960706464266085e-06, + "loss": 1.9676, + "step": 6562 + }, + { + "epoch": 2.831140823808497, + "grad_norm": 0.20933398604393005, + "learning_rate": 1.1900076444236046e-06, + "loss": 2.1597, + "step": 6563 + }, + { + "epoch": 2.8315721371576448, + "grad_norm": 0.38234084844589233, + "learning_rate": 1.1839599255866832e-06, + "loss": 2.1744, + "step": 6564 + }, + { + "epoch": 2.832003450506793, + "grad_norm": 0.19370846450328827, + "learning_rate": 1.1779274911680931e-06, + "loss": 1.9428, + "step": 6565 + }, + { + "epoch": 2.8324347638559413, + "grad_norm": 0.1982627660036087, + "learning_rate": 1.1719103424169263e-06, + "loss": 2.1265, + "step": 6566 + }, + { + "epoch": 2.8328660772050895, + "grad_norm": 0.23792126774787903, + "learning_rate": 1.1659084805790946e-06, + "loss": 1.9542, + "step": 6567 + }, + { + "epoch": 2.833297390554238, + "grad_norm": 0.21161462366580963, + "learning_rate": 1.1599219068973458e-06, + "loss": 2.149, + "step": 6568 + }, + { + "epoch": 2.833728703903386, + "grad_norm": 0.2071421891450882, + "learning_rate": 1.1539506226112634e-06, + "loss": 2.1924, + "step": 6569 + }, + { + "epoch": 2.834160017252534, + "grad_norm": 0.2017904669046402, + "learning_rate": 1.1479946289572828e-06, + "loss": 2.1854, + "step": 6570 + }, + { + "epoch": 2.834591330601682, + "grad_norm": 0.18799950182437897, + "learning_rate": 1.1420539271686346e-06, + "loss": 2.188, + "step": 6571 + }, + { + "epoch": 2.8350226439508304, + "grad_norm": 0.1960608810186386, + "learning_rate": 1.1361285184754266e-06, + "loss": 2.1766, + "step": 6572 + }, + { + "epoch": 2.835453957299978, + "grad_norm": 0.7240349054336548, + "learning_rate": 1.1302184041045603e-06, + "loss": 2.1145, + "step": 6573 + }, + { + "epoch": 2.8358852706491264, + "grad_norm": 0.20830345153808594, + "learning_rate": 1.124323585279807e-06, + "loss": 2.101, + "step": 6574 + }, + { + "epoch": 2.8363165839982747, + "grad_norm": 0.1981867402791977, + "learning_rate": 1.1184440632217405e-06, + "loss": 2.1011, + "step": 6575 + }, + { + "epoch": 2.8363165839982747, + "eval_loss": 2.087026834487915, + "eval_runtime": 199.1904, + "eval_samples_per_second": 0.161, + "eval_steps_per_second": 0.161, + "step": 6575 + }, + { + "epoch": 2.836747897347423, + "grad_norm": 0.2012898027896881, + "learning_rate": 1.1125798391477869e-06, + "loss": 2.1873, + "step": 6576 + }, + { + "epoch": 2.837179210696571, + "grad_norm": 0.2115420699119568, + "learning_rate": 1.106730914272208e-06, + "loss": 2.0934, + "step": 6577 + }, + { + "epoch": 2.8376105240457195, + "grad_norm": 0.19753608107566833, + "learning_rate": 1.1008972898060687e-06, + "loss": 2.066, + "step": 6578 + }, + { + "epoch": 2.8380418373948673, + "grad_norm": 0.20166726410388947, + "learning_rate": 1.0950789669572858e-06, + "loss": 2.2392, + "step": 6579 + }, + { + "epoch": 2.8384731507440155, + "grad_norm": 0.855771005153656, + "learning_rate": 1.0892759469306206e-06, + "loss": 2.285, + "step": 6580 + }, + { + "epoch": 2.8389044640931638, + "grad_norm": 0.20052401721477509, + "learning_rate": 1.0834882309276455e-06, + "loss": 2.0705, + "step": 6581 + }, + { + "epoch": 2.8393357774423116, + "grad_norm": 0.19966664910316467, + "learning_rate": 1.07771582014676e-06, + "loss": 2.0723, + "step": 6582 + }, + { + "epoch": 2.83976709079146, + "grad_norm": 0.18459691107273102, + "learning_rate": 1.0719587157832248e-06, + "loss": 2.116, + "step": 6583 + }, + { + "epoch": 2.840198404140608, + "grad_norm": 0.19035299122333527, + "learning_rate": 1.0662169190290949e-06, + "loss": 2.109, + "step": 6584 + }, + { + "epoch": 2.8406297174897563, + "grad_norm": 0.21567097306251526, + "learning_rate": 1.0604904310732853e-06, + "loss": 2.2523, + "step": 6585 + }, + { + "epoch": 2.8410610308389046, + "grad_norm": 0.2520385682582855, + "learning_rate": 1.0547792531015148e-06, + "loss": 2.1898, + "step": 6586 + }, + { + "epoch": 2.841492344188053, + "grad_norm": 0.19074073433876038, + "learning_rate": 1.0490833862963539e-06, + "loss": 2.1059, + "step": 6587 + }, + { + "epoch": 2.8419236575372007, + "grad_norm": 0.20969784259796143, + "learning_rate": 1.0434028318371924e-06, + "loss": 2.1702, + "step": 6588 + }, + { + "epoch": 2.842354970886349, + "grad_norm": 0.22954483330249786, + "learning_rate": 1.0377375909002393e-06, + "loss": 2.1248, + "step": 6589 + }, + { + "epoch": 2.842786284235497, + "grad_norm": 0.18923214077949524, + "learning_rate": 1.0320876646585736e-06, + "loss": 2.0614, + "step": 6590 + }, + { + "epoch": 2.843217597584645, + "grad_norm": 0.20723962783813477, + "learning_rate": 1.026453054282042e-06, + "loss": 2.1693, + "step": 6591 + }, + { + "epoch": 2.8436489109337932, + "grad_norm": 0.19316589832305908, + "learning_rate": 1.0208337609373784e-06, + "loss": 2.1487, + "step": 6592 + }, + { + "epoch": 2.8440802242829415, + "grad_norm": 0.19485367834568024, + "learning_rate": 1.0152297857880936e-06, + "loss": 2.3646, + "step": 6593 + }, + { + "epoch": 2.8445115376320897, + "grad_norm": 0.1999531090259552, + "learning_rate": 1.0096411299945677e-06, + "loss": 2.114, + "step": 6594 + }, + { + "epoch": 2.844942850981238, + "grad_norm": 0.2081172615289688, + "learning_rate": 1.0040677947140002e-06, + "loss": 2.3284, + "step": 6595 + }, + { + "epoch": 2.8453741643303863, + "grad_norm": 0.21178525686264038, + "learning_rate": 9.985097811004012e-07, + "loss": 2.2318, + "step": 6596 + }, + { + "epoch": 2.845805477679534, + "grad_norm": 0.2165527045726776, + "learning_rate": 9.929670903046084e-07, + "loss": 2.1899, + "step": 6597 + }, + { + "epoch": 2.8462367910286823, + "grad_norm": 0.20413519442081451, + "learning_rate": 9.87439723474312e-07, + "loss": 2.2067, + "step": 6598 + }, + { + "epoch": 2.8466681043778306, + "grad_norm": 0.1823328584432602, + "learning_rate": 9.81927681754005e-07, + "loss": 2.1177, + "step": 6599 + }, + { + "epoch": 2.8470994177269784, + "grad_norm": 0.2507426142692566, + "learning_rate": 9.764309662850244e-07, + "loss": 2.2117, + "step": 6600 + }, + { + "epoch": 2.8470994177269784, + "eval_loss": 2.087045907974243, + "eval_runtime": 195.3532, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6600 + }, + { + "epoch": 2.8475307310761266, + "grad_norm": 0.19653716683387756, + "learning_rate": 9.709495782055266e-07, + "loss": 2.452, + "step": 6601 + }, + { + "epoch": 2.847962044425275, + "grad_norm": 1.8744388818740845, + "learning_rate": 9.654835186504784e-07, + "loss": 2.0288, + "step": 6602 + }, + { + "epoch": 2.848393357774423, + "grad_norm": 0.2228662669658661, + "learning_rate": 9.600327887516912e-07, + "loss": 2.2299, + "step": 6603 + }, + { + "epoch": 2.8488246711235714, + "grad_norm": 0.2115723341703415, + "learning_rate": 9.545973896378123e-07, + "loss": 2.2138, + "step": 6604 + }, + { + "epoch": 2.8492559844727197, + "grad_norm": 0.20439818501472473, + "learning_rate": 9.491773224342831e-07, + "loss": 1.9981, + "step": 6605 + }, + { + "epoch": 2.8496872978218675, + "grad_norm": 0.20456363260746002, + "learning_rate": 9.437725882633973e-07, + "loss": 2.1978, + "step": 6606 + }, + { + "epoch": 2.8501186111710157, + "grad_norm": 0.19252854585647583, + "learning_rate": 9.383831882442599e-07, + "loss": 2.1711, + "step": 6607 + }, + { + "epoch": 2.850549924520164, + "grad_norm": 0.21124014258384705, + "learning_rate": 9.330091234928117e-07, + "loss": 2.0787, + "step": 6608 + }, + { + "epoch": 2.850981237869312, + "grad_norm": 0.19842417538166046, + "learning_rate": 9.276503951218123e-07, + "loss": 2.0478, + "step": 6609 + }, + { + "epoch": 2.85141255121846, + "grad_norm": 0.20528315007686615, + "learning_rate": 9.223070042408326e-07, + "loss": 2.1122, + "step": 6610 + }, + { + "epoch": 2.8518438645676083, + "grad_norm": 0.2027859389781952, + "learning_rate": 9.169789519562875e-07, + "loss": 2.1932, + "step": 6611 + }, + { + "epoch": 2.8522751779167566, + "grad_norm": 0.20300999283790588, + "learning_rate": 9.116662393714114e-07, + "loss": 2.0943, + "step": 6612 + }, + { + "epoch": 2.852706491265905, + "grad_norm": 0.18875077366828918, + "learning_rate": 9.063688675862574e-07, + "loss": 2.117, + "step": 6613 + }, + { + "epoch": 2.853137804615053, + "grad_norm": 0.19429738819599152, + "learning_rate": 9.010868376976982e-07, + "loss": 2.1098, + "step": 6614 + }, + { + "epoch": 2.853569117964201, + "grad_norm": 0.1997184306383133, + "learning_rate": 8.958201507994506e-07, + "loss": 2.1689, + "step": 6615 + }, + { + "epoch": 2.854000431313349, + "grad_norm": 0.693905234336853, + "learning_rate": 8.905688079820256e-07, + "loss": 2.1667, + "step": 6616 + }, + { + "epoch": 2.8544317446624974, + "grad_norm": 0.20525632798671722, + "learning_rate": 8.8533281033277e-07, + "loss": 2.0395, + "step": 6617 + }, + { + "epoch": 2.854863058011645, + "grad_norm": 0.2016616314649582, + "learning_rate": 8.801121589358667e-07, + "loss": 2.1236, + "step": 6618 + }, + { + "epoch": 2.8552943713607934, + "grad_norm": 0.19586916267871857, + "learning_rate": 8.74906854872301e-07, + "loss": 2.0706, + "step": 6619 + }, + { + "epoch": 2.8557256847099417, + "grad_norm": 0.20806916058063507, + "learning_rate": 8.697168992198939e-07, + "loss": 2.2386, + "step": 6620 + }, + { + "epoch": 2.85615699805909, + "grad_norm": 0.19900496304035187, + "learning_rate": 8.645422930532692e-07, + "loss": 2.2531, + "step": 6621 + }, + { + "epoch": 2.856588311408238, + "grad_norm": 0.1811569631099701, + "learning_rate": 8.593830374438948e-07, + "loss": 2.1097, + "step": 6622 + }, + { + "epoch": 2.8570196247573865, + "grad_norm": 0.2104291319847107, + "learning_rate": 8.542391334600579e-07, + "loss": 1.9623, + "step": 6623 + }, + { + "epoch": 2.8574509381065343, + "grad_norm": 0.21303077042102814, + "learning_rate": 8.491105821668565e-07, + "loss": 2.2264, + "step": 6624 + }, + { + "epoch": 2.8578822514556825, + "grad_norm": 0.2043152004480362, + "learning_rate": 8.43997384626191e-07, + "loss": 2.1682, + "step": 6625 + }, + { + "epoch": 2.8578822514556825, + "eval_loss": 2.0864996910095215, + "eval_runtime": 195.4386, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6625 + }, + { + "epoch": 2.858313564804831, + "grad_norm": 0.19965513050556183, + "learning_rate": 8.388995418968397e-07, + "loss": 2.224, + "step": 6626 + }, + { + "epoch": 2.8587448781539786, + "grad_norm": 0.20936112105846405, + "learning_rate": 8.338170550343415e-07, + "loss": 2.256, + "step": 6627 + }, + { + "epoch": 2.859176191503127, + "grad_norm": 0.20981188118457794, + "learning_rate": 8.287499250910961e-07, + "loss": 2.2108, + "step": 6628 + }, + { + "epoch": 2.859607504852275, + "grad_norm": 0.19933246076107025, + "learning_rate": 8.236981531162979e-07, + "loss": 2.1298, + "step": 6629 + }, + { + "epoch": 2.8600388182014234, + "grad_norm": 0.18843577802181244, + "learning_rate": 8.186617401559764e-07, + "loss": 2.1118, + "step": 6630 + }, + { + "epoch": 2.8604701315505716, + "grad_norm": 0.19854816794395447, + "learning_rate": 8.13640687252981e-07, + "loss": 2.2842, + "step": 6631 + }, + { + "epoch": 2.86090144489972, + "grad_norm": 0.1979244500398636, + "learning_rate": 8.086349954469634e-07, + "loss": 2.0514, + "step": 6632 + }, + { + "epoch": 2.8613327582488677, + "grad_norm": 0.2001362293958664, + "learning_rate": 8.036446657744111e-07, + "loss": 1.6759, + "step": 6633 + }, + { + "epoch": 2.861764071598016, + "grad_norm": 0.19848394393920898, + "learning_rate": 7.986696992686308e-07, + "loss": 2.0379, + "step": 6634 + }, + { + "epoch": 2.862195384947164, + "grad_norm": 0.20530439913272858, + "learning_rate": 7.937100969597404e-07, + "loss": 2.1757, + "step": 6635 + }, + { + "epoch": 2.862626698296312, + "grad_norm": 0.20105339586734772, + "learning_rate": 7.887658598746849e-07, + "loss": 2.2595, + "step": 6636 + }, + { + "epoch": 2.8630580116454603, + "grad_norm": 0.19826927781105042, + "learning_rate": 7.83836989037212e-07, + "loss": 2.1847, + "step": 6637 + }, + { + "epoch": 2.8634893249946085, + "grad_norm": 0.20068801939487457, + "learning_rate": 7.789234854679055e-07, + "loss": 2.1122, + "step": 6638 + }, + { + "epoch": 2.8639206383437568, + "grad_norm": 0.18873481452465057, + "learning_rate": 7.740253501841598e-07, + "loss": 2.0476, + "step": 6639 + }, + { + "epoch": 2.864351951692905, + "grad_norm": 0.17728032171726227, + "learning_rate": 7.691425842001886e-07, + "loss": 1.7599, + "step": 6640 + }, + { + "epoch": 2.8647832650420533, + "grad_norm": 0.1999674141407013, + "learning_rate": 7.642751885270249e-07, + "loss": 2.1667, + "step": 6641 + }, + { + "epoch": 2.865214578391201, + "grad_norm": 0.18546587228775024, + "learning_rate": 7.59423164172504e-07, + "loss": 2.1342, + "step": 6642 + }, + { + "epoch": 2.8656458917403493, + "grad_norm": 0.19256991147994995, + "learning_rate": 7.545865121413059e-07, + "loss": 2.0104, + "step": 6643 + }, + { + "epoch": 2.8660772050894976, + "grad_norm": 0.22530266642570496, + "learning_rate": 7.497652334348958e-07, + "loss": 2.3658, + "step": 6644 + }, + { + "epoch": 2.8665085184386454, + "grad_norm": 0.680363118648529, + "learning_rate": 7.449593290515837e-07, + "loss": 2.2008, + "step": 6645 + }, + { + "epoch": 2.8669398317877937, + "grad_norm": 0.20634908974170685, + "learning_rate": 7.401687999864903e-07, + "loss": 2.0761, + "step": 6646 + }, + { + "epoch": 2.867371145136942, + "grad_norm": 0.2039497047662735, + "learning_rate": 7.353936472315302e-07, + "loss": 2.2705, + "step": 6647 + }, + { + "epoch": 2.86780245848609, + "grad_norm": 0.1839836686849594, + "learning_rate": 7.306338717754629e-07, + "loss": 2.1424, + "step": 6648 + }, + { + "epoch": 2.8682337718352384, + "grad_norm": 0.19734084606170654, + "learning_rate": 7.258894746038413e-07, + "loss": 2.0179, + "step": 6649 + }, + { + "epoch": 2.8686650851843867, + "grad_norm": 0.20046739280223846, + "learning_rate": 7.211604566990631e-07, + "loss": 2.2146, + "step": 6650 + }, + { + "epoch": 2.8686650851843867, + "eval_loss": 2.086951732635498, + "eval_runtime": 200.8992, + "eval_samples_per_second": 0.159, + "eval_steps_per_second": 0.159, + "step": 6650 + }, + { + "epoch": 2.8690963985335345, + "grad_norm": 0.17358741164207458, + "learning_rate": 7.164468190403034e-07, + "loss": 2.0335, + "step": 6651 + }, + { + "epoch": 2.8695277118826827, + "grad_norm": 0.20116187632083893, + "learning_rate": 7.117485626035813e-07, + "loss": 2.0944, + "step": 6652 + }, + { + "epoch": 2.869959025231831, + "grad_norm": 0.20895633101463318, + "learning_rate": 7.070656883617188e-07, + "loss": 2.2085, + "step": 6653 + }, + { + "epoch": 2.870390338580979, + "grad_norm": 0.21144898235797882, + "learning_rate": 7.023981972843651e-07, + "loss": 1.9656, + "step": 6654 + }, + { + "epoch": 2.870821651930127, + "grad_norm": 0.19829332828521729, + "learning_rate": 6.977460903379806e-07, + "loss": 2.195, + "step": 6655 + }, + { + "epoch": 2.8712529652792753, + "grad_norm": 0.2046317160129547, + "learning_rate": 6.931093684858114e-07, + "loss": 2.1055, + "step": 6656 + }, + { + "epoch": 2.8716842786284236, + "grad_norm": 0.20307636260986328, + "learning_rate": 6.884880326879477e-07, + "loss": 2.214, + "step": 6657 + }, + { + "epoch": 2.872115591977572, + "grad_norm": 0.20070429146289825, + "learning_rate": 6.838820839012993e-07, + "loss": 2.2231, + "step": 6658 + }, + { + "epoch": 2.87254690532672, + "grad_norm": 0.21028077602386475, + "learning_rate": 6.792915230795782e-07, + "loss": 2.1243, + "step": 6659 + }, + { + "epoch": 2.872978218675868, + "grad_norm": 0.215521901845932, + "learning_rate": 6.747163511732989e-07, + "loss": 2.2529, + "step": 6660 + }, + { + "epoch": 2.873409532025016, + "grad_norm": 0.20972667634487152, + "learning_rate": 6.701565691298122e-07, + "loss": 2.159, + "step": 6661 + }, + { + "epoch": 2.8738408453741644, + "grad_norm": 0.20830823481082916, + "learning_rate": 6.656121778932627e-07, + "loss": 2.221, + "step": 6662 + }, + { + "epoch": 2.874272158723312, + "grad_norm": 0.2061716765165329, + "learning_rate": 6.610831784046145e-07, + "loss": 2.2175, + "step": 6663 + }, + { + "epoch": 2.8747034720724605, + "grad_norm": 0.19651354849338531, + "learning_rate": 6.565695716016589e-07, + "loss": 2.1074, + "step": 6664 + }, + { + "epoch": 2.8751347854216087, + "grad_norm": 0.2101098895072937, + "learning_rate": 6.520713584189736e-07, + "loss": 2.121, + "step": 6665 + }, + { + "epoch": 2.875566098770757, + "grad_norm": 0.22959531843662262, + "learning_rate": 6.475885397879715e-07, + "loss": 2.1796, + "step": 6666 + }, + { + "epoch": 2.8759974121199052, + "grad_norm": 0.24616841971874237, + "learning_rate": 6.431211166368605e-07, + "loss": 2.0884, + "step": 6667 + }, + { + "epoch": 2.8764287254690535, + "grad_norm": 0.19734734296798706, + "learning_rate": 6.386690898906837e-07, + "loss": 1.9628, + "step": 6668 + }, + { + "epoch": 2.8768600388182013, + "grad_norm": 0.19654397666454315, + "learning_rate": 6.342324604712706e-07, + "loss": 2.1327, + "step": 6669 + }, + { + "epoch": 2.8772913521673495, + "grad_norm": 0.19043001532554626, + "learning_rate": 6.298112292972779e-07, + "loss": 2.0345, + "step": 6670 + }, + { + "epoch": 2.877722665516498, + "grad_norm": 0.2039814293384552, + "learning_rate": 6.254053972841566e-07, + "loss": 2.2965, + "step": 6671 + }, + { + "epoch": 2.878153978865646, + "grad_norm": 0.19430115818977356, + "learning_rate": 6.210149653442104e-07, + "loss": 2.0156, + "step": 6672 + }, + { + "epoch": 2.878585292214794, + "grad_norm": 0.19798149168491364, + "learning_rate": 6.166399343865036e-07, + "loss": 2.1365, + "step": 6673 + }, + { + "epoch": 2.879016605563942, + "grad_norm": 0.1976615935564041, + "learning_rate": 6.122803053169451e-07, + "loss": 2.1069, + "step": 6674 + }, + { + "epoch": 2.8794479189130904, + "grad_norm": 0.19123438000679016, + "learning_rate": 6.079360790382375e-07, + "loss": 2.1346, + "step": 6675 + }, + { + "epoch": 2.8794479189130904, + "eval_loss": 2.087125778198242, + "eval_runtime": 195.1716, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6675 + }, + { + "epoch": 2.8798792322622386, + "grad_norm": 0.2105484902858734, + "learning_rate": 6.036072564498951e-07, + "loss": 2.0971, + "step": 6676 + }, + { + "epoch": 2.880310545611387, + "grad_norm": 0.2146306186914444, + "learning_rate": 5.992938384482671e-07, + "loss": 2.1788, + "step": 6677 + }, + { + "epoch": 2.8807418589605347, + "grad_norm": 0.18713943660259247, + "learning_rate": 5.949958259264808e-07, + "loss": 2.1241, + "step": 6678 + }, + { + "epoch": 2.881173172309683, + "grad_norm": 0.20983433723449707, + "learning_rate": 5.907132197744829e-07, + "loss": 2.1917, + "step": 6679 + }, + { + "epoch": 2.881604485658831, + "grad_norm": 0.1955743134021759, + "learning_rate": 5.864460208790389e-07, + "loss": 2.1598, + "step": 6680 + }, + { + "epoch": 2.8820357990079795, + "grad_norm": 0.23252135515213013, + "learning_rate": 5.82194230123717e-07, + "loss": 1.7775, + "step": 6681 + }, + { + "epoch": 2.8824671123571273, + "grad_norm": 0.21306568384170532, + "learning_rate": 5.779578483889131e-07, + "loss": 2.0399, + "step": 6682 + }, + { + "epoch": 2.8828984257062755, + "grad_norm": 0.35586538910865784, + "learning_rate": 5.737368765517919e-07, + "loss": 2.0184, + "step": 6683 + }, + { + "epoch": 2.883329739055424, + "grad_norm": 0.22086109220981598, + "learning_rate": 5.695313154863629e-07, + "loss": 2.2016, + "step": 6684 + }, + { + "epoch": 2.883761052404572, + "grad_norm": 0.20301884412765503, + "learning_rate": 5.653411660634294e-07, + "loss": 2.1293, + "step": 6685 + }, + { + "epoch": 2.8841923657537203, + "grad_norm": 0.20924760401248932, + "learning_rate": 5.61166429150614e-07, + "loss": 2.3138, + "step": 6686 + }, + { + "epoch": 2.884623679102868, + "grad_norm": 0.20393526554107666, + "learning_rate": 5.570071056123421e-07, + "loss": 2.1015, + "step": 6687 + }, + { + "epoch": 2.8850549924520164, + "grad_norm": 0.2079361379146576, + "learning_rate": 5.528631963098412e-07, + "loss": 2.1904, + "step": 6688 + }, + { + "epoch": 2.8854863058011646, + "grad_norm": 0.21120835840702057, + "learning_rate": 5.487347021011502e-07, + "loss": 2.2447, + "step": 6689 + }, + { + "epoch": 2.885917619150313, + "grad_norm": 0.19092771410942078, + "learning_rate": 5.44621623841135e-07, + "loss": 2.0738, + "step": 6690 + }, + { + "epoch": 2.8863489324994607, + "grad_norm": 0.2014131247997284, + "learning_rate": 5.405239623814394e-07, + "loss": 2.217, + "step": 6691 + }, + { + "epoch": 2.886780245848609, + "grad_norm": 0.2046353667974472, + "learning_rate": 5.364417185705267e-07, + "loss": 2.1468, + "step": 6692 + }, + { + "epoch": 2.887211559197757, + "grad_norm": 0.19931462407112122, + "learning_rate": 5.323748932536787e-07, + "loss": 2.1508, + "step": 6693 + }, + { + "epoch": 2.8876428725469054, + "grad_norm": 0.1995890736579895, + "learning_rate": 5.283234872729802e-07, + "loss": 1.9074, + "step": 6694 + }, + { + "epoch": 2.8880741858960537, + "grad_norm": 0.1915419101715088, + "learning_rate": 5.24287501467302e-07, + "loss": 2.1993, + "step": 6695 + }, + { + "epoch": 2.8885054992452015, + "grad_norm": 0.1968814730644226, + "learning_rate": 5.202669366723588e-07, + "loss": 2.1575, + "step": 6696 + }, + { + "epoch": 2.8889368125943498, + "grad_norm": 0.19840653240680695, + "learning_rate": 5.162617937206348e-07, + "loss": 2.0741, + "step": 6697 + }, + { + "epoch": 2.889368125943498, + "grad_norm": 0.19516044855117798, + "learning_rate": 5.122720734414498e-07, + "loss": 2.3114, + "step": 6698 + }, + { + "epoch": 2.8897994392926463, + "grad_norm": 0.19493445754051208, + "learning_rate": 5.082977766609098e-07, + "loss": 2.0649, + "step": 6699 + }, + { + "epoch": 2.890230752641794, + "grad_norm": 0.20990261435508728, + "learning_rate": 5.04338904201948e-07, + "loss": 2.2653, + "step": 6700 + }, + { + "epoch": 2.890230752641794, + "eval_loss": 2.087057113647461, + "eval_runtime": 195.522, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6700 + }, + { + "epoch": 2.8906620659909423, + "grad_norm": 0.19304528832435608, + "learning_rate": 5.003954568842922e-07, + "loss": 2.1359, + "step": 6701 + }, + { + "epoch": 2.8910933793400906, + "grad_norm": 0.1956472247838974, + "learning_rate": 4.964674355244724e-07, + "loss": 1.9652, + "step": 6702 + }, + { + "epoch": 2.891524692689239, + "grad_norm": 0.20953501760959625, + "learning_rate": 4.925548409358133e-07, + "loss": 2.1561, + "step": 6703 + }, + { + "epoch": 2.891956006038387, + "grad_norm": 0.19960734248161316, + "learning_rate": 4.886576739284831e-07, + "loss": 2.2626, + "step": 6704 + }, + { + "epoch": 2.892387319387535, + "grad_norm": 0.19962123036384583, + "learning_rate": 4.847759353094199e-07, + "loss": 2.2388, + "step": 6705 + }, + { + "epoch": 2.892818632736683, + "grad_norm": 0.2108890563249588, + "learning_rate": 4.809096258823892e-07, + "loss": 2.1739, + "step": 6706 + }, + { + "epoch": 2.8932499460858314, + "grad_norm": 0.20290786027908325, + "learning_rate": 4.770587464479503e-07, + "loss": 1.9976, + "step": 6707 + }, + { + "epoch": 2.8936812594349797, + "grad_norm": 0.1943017840385437, + "learning_rate": 4.732232978034572e-07, + "loss": 2.044, + "step": 6708 + }, + { + "epoch": 2.8941125727841275, + "grad_norm": 0.1958266943693161, + "learning_rate": 4.694032807430997e-07, + "loss": 1.9093, + "step": 6709 + }, + { + "epoch": 2.8945438861332757, + "grad_norm": 0.22454294562339783, + "learning_rate": 4.6559869605783675e-07, + "loss": 2.2015, + "step": 6710 + }, + { + "epoch": 2.894975199482424, + "grad_norm": 0.20868833363056183, + "learning_rate": 4.61809544535463e-07, + "loss": 2.0411, + "step": 6711 + }, + { + "epoch": 2.8954065128315722, + "grad_norm": 2.48274302482605, + "learning_rate": 4.580358269605594e-07, + "loss": 2.0262, + "step": 6712 + }, + { + "epoch": 2.8958378261807205, + "grad_norm": 0.19596897065639496, + "learning_rate": 4.542775441145174e-07, + "loss": 2.0699, + "step": 6713 + }, + { + "epoch": 2.8962691395298683, + "grad_norm": 0.8381218314170837, + "learning_rate": 4.50534696775523e-07, + "loss": 2.2512, + "step": 6714 + }, + { + "epoch": 2.8967004528790166, + "grad_norm": 0.22735945880413055, + "learning_rate": 4.4680728571858116e-07, + "loss": 2.2737, + "step": 6715 + }, + { + "epoch": 2.897131766228165, + "grad_norm": 0.20122380554676056, + "learning_rate": 4.430953117154912e-07, + "loss": 2.1732, + "step": 6716 + }, + { + "epoch": 2.897563079577313, + "grad_norm": 0.17533114552497864, + "learning_rate": 4.393987755348549e-07, + "loss": 2.1612, + "step": 6717 + }, + { + "epoch": 2.897994392926461, + "grad_norm": 0.18631507456302643, + "learning_rate": 4.357176779420851e-07, + "loss": 2.2448, + "step": 6718 + }, + { + "epoch": 2.898425706275609, + "grad_norm": 0.19202157855033875, + "learning_rate": 4.320520196993971e-07, + "loss": 2.0164, + "step": 6719 + }, + { + "epoch": 2.8988570196247574, + "grad_norm": 0.20462560653686523, + "learning_rate": 4.28401801565792e-07, + "loss": 2.0823, + "step": 6720 + }, + { + "epoch": 2.8992883329739056, + "grad_norm": 0.20073947310447693, + "learning_rate": 4.247670242970985e-07, + "loss": 2.049, + "step": 6721 + }, + { + "epoch": 2.899719646323054, + "grad_norm": 0.2194797843694687, + "learning_rate": 4.211476886459314e-07, + "loss": 2.2606, + "step": 6722 + }, + { + "epoch": 2.9001509596722017, + "grad_norm": 0.19137798249721527, + "learning_rate": 4.175437953617161e-07, + "loss": 2.2349, + "step": 6723 + }, + { + "epoch": 2.90058227302135, + "grad_norm": 0.19921918213367462, + "learning_rate": 4.1395534519068076e-07, + "loss": 2.4475, + "step": 6724 + }, + { + "epoch": 2.901013586370498, + "grad_norm": 0.21443259716033936, + "learning_rate": 4.103823388758476e-07, + "loss": 2.3751, + "step": 6725 + }, + { + "epoch": 2.901013586370498, + "eval_loss": 2.0870168209075928, + "eval_runtime": 195.6161, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6725 + }, + { + "epoch": 2.9014448997196465, + "grad_norm": 0.19477905333042145, + "learning_rate": 4.0682477715704986e-07, + "loss": 1.9528, + "step": 6726 + }, + { + "epoch": 2.9018762130687943, + "grad_norm": 0.19327762722969055, + "learning_rate": 4.03282660770915e-07, + "loss": 2.1517, + "step": 6727 + }, + { + "epoch": 2.9023075264179425, + "grad_norm": 0.6009685397148132, + "learning_rate": 3.9975599045088955e-07, + "loss": 2.0693, + "step": 6728 + }, + { + "epoch": 2.902738839767091, + "grad_norm": 0.1924435794353485, + "learning_rate": 3.962447669271979e-07, + "loss": 2.2527, + "step": 6729 + }, + { + "epoch": 2.903170153116239, + "grad_norm": 0.20312826335430145, + "learning_rate": 3.9274899092687497e-07, + "loss": 2.267, + "step": 6730 + }, + { + "epoch": 2.9036014664653873, + "grad_norm": 0.1943555325269699, + "learning_rate": 3.892686631737668e-07, + "loss": 2.1596, + "step": 6731 + }, + { + "epoch": 2.904032779814535, + "grad_norm": 0.1989336460828781, + "learning_rate": 3.8580378438851367e-07, + "loss": 2.2043, + "step": 6732 + }, + { + "epoch": 2.9044640931636834, + "grad_norm": 0.19924335181713104, + "learning_rate": 3.823543552885583e-07, + "loss": 2.0053, + "step": 6733 + }, + { + "epoch": 2.9048954065128316, + "grad_norm": 0.20633479952812195, + "learning_rate": 3.7892037658812933e-07, + "loss": 2.2411, + "step": 6734 + }, + { + "epoch": 2.90532671986198, + "grad_norm": 0.1956966370344162, + "learning_rate": 3.7550184899828304e-07, + "loss": 2.1115, + "step": 6735 + }, + { + "epoch": 2.9057580332111277, + "grad_norm": 0.19222424924373627, + "learning_rate": 3.720987732268532e-07, + "loss": 2.1966, + "step": 6736 + }, + { + "epoch": 2.906189346560276, + "grad_norm": 0.20548641681671143, + "learning_rate": 3.6871114997850115e-07, + "loss": 2.3462, + "step": 6737 + }, + { + "epoch": 2.906620659909424, + "grad_norm": 0.21557635068893433, + "learning_rate": 3.6533897995464913e-07, + "loss": 1.9703, + "step": 6738 + }, + { + "epoch": 2.9070519732585725, + "grad_norm": 0.217384472489357, + "learning_rate": 3.619822638535552e-07, + "loss": 2.2944, + "step": 6739 + }, + { + "epoch": 2.9074832866077207, + "grad_norm": 0.20118100941181183, + "learning_rate": 3.586410023702635e-07, + "loss": 1.9737, + "step": 6740 + }, + { + "epoch": 2.9079145999568685, + "grad_norm": 0.19280241429805756, + "learning_rate": 3.553151961966122e-07, + "loss": 2.1149, + "step": 6741 + }, + { + "epoch": 2.9083459133060168, + "grad_norm": 0.18102608621120453, + "learning_rate": 3.5200484602125045e-07, + "loss": 2.0382, + "step": 6742 + }, + { + "epoch": 2.908777226655165, + "grad_norm": 0.19837021827697754, + "learning_rate": 3.4870995252962994e-07, + "loss": 2.2212, + "step": 6743 + }, + { + "epoch": 2.9092085400043133, + "grad_norm": 0.20592765510082245, + "learning_rate": 3.4543051640398e-07, + "loss": 2.28, + "step": 6744 + }, + { + "epoch": 2.909639853353461, + "grad_norm": 0.21061819791793823, + "learning_rate": 3.4216653832334904e-07, + "loss": 2.0615, + "step": 6745 + }, + { + "epoch": 2.9100711667026093, + "grad_norm": 0.19395215809345245, + "learning_rate": 3.3891801896357984e-07, + "loss": 2.0634, + "step": 6746 + }, + { + "epoch": 2.9105024800517576, + "grad_norm": 0.20471517741680145, + "learning_rate": 3.3568495899731765e-07, + "loss": 2.2366, + "step": 6747 + }, + { + "epoch": 2.910933793400906, + "grad_norm": 0.9463796615600586, + "learning_rate": 3.3246735909400193e-07, + "loss": 2.1938, + "step": 6748 + }, + { + "epoch": 2.911365106750054, + "grad_norm": 0.19731606543064117, + "learning_rate": 3.2926521991986646e-07, + "loss": 2.0529, + "step": 6749 + }, + { + "epoch": 2.911796420099202, + "grad_norm": 0.21093003451824188, + "learning_rate": 3.260785421379475e-07, + "loss": 2.1669, + "step": 6750 + }, + { + "epoch": 2.911796420099202, + "eval_loss": 2.087052345275879, + "eval_runtime": 195.3611, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6750 + }, + { + "epoch": 2.91222773344835, + "grad_norm": 0.2138030230998993, + "learning_rate": 3.2290732640809223e-07, + "loss": 2.3074, + "step": 6751 + }, + { + "epoch": 2.9126590467974984, + "grad_norm": 0.20158720016479492, + "learning_rate": 3.197515733869338e-07, + "loss": 2.1314, + "step": 6752 + }, + { + "epoch": 2.9130903601466467, + "grad_norm": 0.20197947323322296, + "learning_rate": 3.166112837278995e-07, + "loss": 2.0844, + "step": 6753 + }, + { + "epoch": 2.9135216734957945, + "grad_norm": 0.20278272032737732, + "learning_rate": 3.134864580812191e-07, + "loss": 2.1534, + "step": 6754 + }, + { + "epoch": 2.9139529868449427, + "grad_norm": 0.21093043684959412, + "learning_rate": 3.103770970939251e-07, + "loss": 2.2496, + "step": 6755 + }, + { + "epoch": 2.914384300194091, + "grad_norm": 0.1912979632616043, + "learning_rate": 3.072832014098442e-07, + "loss": 2.195, + "step": 6756 + }, + { + "epoch": 2.9148156135432393, + "grad_norm": 0.2099740356206894, + "learning_rate": 3.0420477166960557e-07, + "loss": 2.1556, + "step": 6757 + }, + { + "epoch": 2.9152469268923875, + "grad_norm": 0.20231591165065765, + "learning_rate": 3.0114180851063265e-07, + "loss": 2.2292, + "step": 6758 + }, + { + "epoch": 2.9156782402415353, + "grad_norm": 0.19204868376255035, + "learning_rate": 2.980943125671348e-07, + "loss": 2.0469, + "step": 6759 + }, + { + "epoch": 2.9161095535906836, + "grad_norm": 0.212842658162117, + "learning_rate": 2.950622844701406e-07, + "loss": 2.2559, + "step": 6760 + }, + { + "epoch": 2.916540866939832, + "grad_norm": 0.20621280372142792, + "learning_rate": 2.920457248474561e-07, + "loss": 2.2678, + "step": 6761 + }, + { + "epoch": 2.91697218028898, + "grad_norm": 0.21888796985149384, + "learning_rate": 2.890446343236902e-07, + "loss": 2.1982, + "step": 6762 + }, + { + "epoch": 2.917403493638128, + "grad_norm": 0.19971078634262085, + "learning_rate": 2.860590135202623e-07, + "loss": 2.1474, + "step": 6763 + }, + { + "epoch": 2.917834806987276, + "grad_norm": 0.21392220258712769, + "learning_rate": 2.8308886305537794e-07, + "loss": 2.2514, + "step": 6764 + }, + { + "epoch": 2.9182661203364244, + "grad_norm": 0.1981843262910843, + "learning_rate": 2.801341835440368e-07, + "loss": 2.1369, + "step": 6765 + }, + { + "epoch": 2.9186974336855727, + "grad_norm": 0.21266666054725647, + "learning_rate": 2.7719497559803295e-07, + "loss": 2.072, + "step": 6766 + }, + { + "epoch": 2.919128747034721, + "grad_norm": 0.20510557293891907, + "learning_rate": 2.7427123982596277e-07, + "loss": 2.1128, + "step": 6767 + }, + { + "epoch": 2.9195600603838687, + "grad_norm": 0.18966813385486603, + "learning_rate": 2.7136297683321706e-07, + "loss": 2.3035, + "step": 6768 + }, + { + "epoch": 2.919991373733017, + "grad_norm": 0.20073847472667694, + "learning_rate": 2.684701872219891e-07, + "loss": 2.1059, + "step": 6769 + }, + { + "epoch": 2.9204226870821652, + "grad_norm": 0.20553511381149292, + "learning_rate": 2.655928715912581e-07, + "loss": 2.1751, + "step": 6770 + }, + { + "epoch": 2.9208540004313135, + "grad_norm": 0.20025672018527985, + "learning_rate": 2.6273103053680576e-07, + "loss": 2.1073, + "step": 6771 + }, + { + "epoch": 2.9212853137804613, + "grad_norm": 0.18325988948345184, + "learning_rate": 2.5988466465119983e-07, + "loss": 2.0485, + "step": 6772 + }, + { + "epoch": 2.9217166271296096, + "grad_norm": 0.19736038148403168, + "learning_rate": 2.570537745238188e-07, + "loss": 2.127, + "step": 6773 + }, + { + "epoch": 2.922147940478758, + "grad_norm": 0.213426873087883, + "learning_rate": 2.5423836074083546e-07, + "loss": 2.4094, + "step": 6774 + }, + { + "epoch": 2.922579253827906, + "grad_norm": 0.2321801632642746, + "learning_rate": 2.5143842388520854e-07, + "loss": 2.1095, + "step": 6775 + }, + { + "epoch": 2.922579253827906, + "eval_loss": 2.0869569778442383, + "eval_runtime": 195.3991, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6775 + }, + { + "epoch": 2.9230105671770543, + "grad_norm": 0.20658846199512482, + "learning_rate": 2.4865396453668254e-07, + "loss": 2.0625, + "step": 6776 + }, + { + "epoch": 2.923441880526202, + "grad_norm": 0.20817962288856506, + "learning_rate": 2.458849832718213e-07, + "loss": 2.1999, + "step": 6777 + }, + { + "epoch": 2.9238731938753504, + "grad_norm": 0.20461788773536682, + "learning_rate": 2.431314806639828e-07, + "loss": 2.0633, + "step": 6778 + }, + { + "epoch": 2.9243045072244986, + "grad_norm": 0.19786451756954193, + "learning_rate": 2.403934572832861e-07, + "loss": 2.0404, + "step": 6779 + }, + { + "epoch": 2.924735820573647, + "grad_norm": 0.213621124625206, + "learning_rate": 2.3767091369668588e-07, + "loss": 2.2852, + "step": 6780 + }, + { + "epoch": 2.9251671339227947, + "grad_norm": 0.1997172236442566, + "learning_rate": 2.3496385046790634e-07, + "loss": 2.0288, + "step": 6781 + }, + { + "epoch": 2.925598447271943, + "grad_norm": 0.1925685852766037, + "learning_rate": 2.3227226815748235e-07, + "loss": 2.1788, + "step": 6782 + }, + { + "epoch": 2.926029760621091, + "grad_norm": 0.2174510508775711, + "learning_rate": 2.2959616732273478e-07, + "loss": 2.1952, + "step": 6783 + }, + { + "epoch": 2.9264610739702395, + "grad_norm": 0.2124984711408615, + "learning_rate": 2.269355485177704e-07, + "loss": 2.1769, + "step": 6784 + }, + { + "epoch": 2.9268923873193877, + "grad_norm": 0.17805612087249756, + "learning_rate": 2.2429041229350686e-07, + "loss": 2.1134, + "step": 6785 + }, + { + "epoch": 2.9273237006685355, + "grad_norm": 0.19361688196659088, + "learning_rate": 2.216607591976477e-07, + "loss": 2.1236, + "step": 6786 + }, + { + "epoch": 2.927755014017684, + "grad_norm": 0.17942456901073456, + "learning_rate": 2.190465897746907e-07, + "loss": 1.9414, + "step": 6787 + }, + { + "epoch": 2.928186327366832, + "grad_norm": 0.21018028259277344, + "learning_rate": 2.1644790456592786e-07, + "loss": 1.942, + "step": 6788 + }, + { + "epoch": 2.9286176407159803, + "grad_norm": 0.21197490394115448, + "learning_rate": 2.1386470410943713e-07, + "loss": 2.0872, + "step": 6789 + }, + { + "epoch": 2.929048954065128, + "grad_norm": 0.1928757131099701, + "learning_rate": 2.112969889401156e-07, + "loss": 2.1669, + "step": 6790 + }, + { + "epoch": 2.9294802674142764, + "grad_norm": 0.1942148655653, + "learning_rate": 2.0874475958962967e-07, + "loss": 2.1107, + "step": 6791 + }, + { + "epoch": 2.9299115807634246, + "grad_norm": 0.18924149870872498, + "learning_rate": 2.0620801658643992e-07, + "loss": 1.922, + "step": 6792 + }, + { + "epoch": 2.930342894112573, + "grad_norm": 0.22006838023662567, + "learning_rate": 2.0368676045580957e-07, + "loss": 2.1127, + "step": 6793 + }, + { + "epoch": 2.930774207461721, + "grad_norm": 0.1920243501663208, + "learning_rate": 2.0118099171979595e-07, + "loss": 2.1766, + "step": 6794 + }, + { + "epoch": 2.931205520810869, + "grad_norm": 0.19455263018608093, + "learning_rate": 1.9869071089725075e-07, + "loss": 2.0103, + "step": 6795 + }, + { + "epoch": 2.931636834160017, + "grad_norm": 0.18723560869693756, + "learning_rate": 1.9621591850379482e-07, + "loss": 1.9857, + "step": 6796 + }, + { + "epoch": 2.9320681475091654, + "grad_norm": 0.20894886553287506, + "learning_rate": 1.9375661505188488e-07, + "loss": 1.9011, + "step": 6797 + }, + { + "epoch": 2.9324994608583137, + "grad_norm": 0.20964136719703674, + "learning_rate": 1.9131280105073032e-07, + "loss": 2.1019, + "step": 6798 + }, + { + "epoch": 2.9329307742074615, + "grad_norm": 0.18412867188453674, + "learning_rate": 1.8888447700635134e-07, + "loss": 1.8822, + "step": 6799 + }, + { + "epoch": 2.9333620875566098, + "grad_norm": 0.20818638801574707, + "learning_rate": 1.864716434215624e-07, + "loss": 2.2422, + "step": 6800 + }, + { + "epoch": 2.9333620875566098, + "eval_loss": 2.0870261192321777, + "eval_runtime": 195.3813, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6800 + }, + { + "epoch": 2.933793400905758, + "grad_norm": 0.18927675485610962, + "learning_rate": 1.8407430079597217e-07, + "loss": 2.1812, + "step": 6801 + }, + { + "epoch": 2.9342247142549063, + "grad_norm": 0.27150091528892517, + "learning_rate": 1.8169244962596695e-07, + "loss": 2.0754, + "step": 6802 + }, + { + "epoch": 2.9346560276040545, + "grad_norm": 0.7071654200553894, + "learning_rate": 1.7932609040474388e-07, + "loss": 2.107, + "step": 6803 + }, + { + "epoch": 2.9350873409532023, + "grad_norm": 0.20735763013362885, + "learning_rate": 1.7697522362226934e-07, + "loss": 2.2393, + "step": 6804 + }, + { + "epoch": 2.9355186543023506, + "grad_norm": 0.22723041474819183, + "learning_rate": 1.7463984976532896e-07, + "loss": 2.2172, + "step": 6805 + }, + { + "epoch": 2.935949967651499, + "grad_norm": 0.20030942559242249, + "learning_rate": 1.7231996931749425e-07, + "loss": 2.2804, + "step": 6806 + }, + { + "epoch": 2.936381281000647, + "grad_norm": 0.9719837307929993, + "learning_rate": 1.7001558275909766e-07, + "loss": 2.1877, + "step": 6807 + }, + { + "epoch": 2.936812594349795, + "grad_norm": 0.20374545454978943, + "learning_rate": 1.677266905673158e-07, + "loss": 2.0767, + "step": 6808 + }, + { + "epoch": 2.937243907698943, + "grad_norm": 0.2079235166311264, + "learning_rate": 1.6545329321606127e-07, + "loss": 2.2619, + "step": 6809 + }, + { + "epoch": 2.9376752210480914, + "grad_norm": 0.21248282492160797, + "learning_rate": 1.631953911760825e-07, + "loss": 2.1212, + "step": 6810 + }, + { + "epoch": 2.9381065343972397, + "grad_norm": 0.19418877363204956, + "learning_rate": 1.6095298491490548e-07, + "loss": 2.2536, + "step": 6811 + }, + { + "epoch": 2.938537847746388, + "grad_norm": 0.1790664941072464, + "learning_rate": 1.5872607489683387e-07, + "loss": 2.1337, + "step": 6812 + }, + { + "epoch": 2.9389691610955357, + "grad_norm": 0.19266265630722046, + "learning_rate": 1.5651466158297377e-07, + "loss": 2.1115, + "step": 6813 + }, + { + "epoch": 2.939400474444684, + "grad_norm": 0.19078189134597778, + "learning_rate": 1.543187454312339e-07, + "loss": 2.3414, + "step": 6814 + }, + { + "epoch": 2.9398317877938323, + "grad_norm": 0.1999526023864746, + "learning_rate": 1.5213832689630057e-07, + "loss": 2.0213, + "step": 6815 + }, + { + "epoch": 2.9402631011429805, + "grad_norm": 0.19977650046348572, + "learning_rate": 1.4997340642963762e-07, + "loss": 2.0637, + "step": 6816 + }, + { + "epoch": 2.9406944144921283, + "grad_norm": 0.19836311042308807, + "learning_rate": 1.4782398447953646e-07, + "loss": 2.2932, + "step": 6817 + }, + { + "epoch": 2.9411257278412766, + "grad_norm": 0.19646064937114716, + "learning_rate": 1.4569006149104944e-07, + "loss": 2.0503, + "step": 6818 + }, + { + "epoch": 2.941557041190425, + "grad_norm": 0.1919003129005432, + "learning_rate": 1.4357163790602312e-07, + "loss": 2.2208, + "step": 6819 + }, + { + "epoch": 2.941988354539573, + "grad_norm": 0.1872098445892334, + "learning_rate": 1.4146871416310657e-07, + "loss": 2.069, + "step": 6820 + }, + { + "epoch": 2.9424196678887213, + "grad_norm": 0.21568883955478668, + "learning_rate": 1.3938129069772652e-07, + "loss": 2.1917, + "step": 6821 + }, + { + "epoch": 2.942850981237869, + "grad_norm": 0.2069755494594574, + "learning_rate": 1.3730936794212056e-07, + "loss": 2.0024, + "step": 6822 + }, + { + "epoch": 2.9432822945870174, + "grad_norm": 0.19497476518154144, + "learning_rate": 1.3525294632528716e-07, + "loss": 2.1144, + "step": 6823 + }, + { + "epoch": 2.9437136079361657, + "grad_norm": 0.20551230013370514, + "learning_rate": 1.3321202627304407e-07, + "loss": 2.1161, + "step": 6824 + }, + { + "epoch": 2.944144921285314, + "grad_norm": 0.19996224343776703, + "learning_rate": 1.3118660820797822e-07, + "loss": 2.2457, + "step": 6825 + }, + { + "epoch": 2.944144921285314, + "eval_loss": 2.0869040489196777, + "eval_runtime": 195.4597, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6825 + }, + { + "epoch": 2.9445762346344617, + "grad_norm": 3.4457085132598877, + "learning_rate": 1.2917669254947915e-07, + "loss": 2.2843, + "step": 6826 + }, + { + "epoch": 2.94500754798361, + "grad_norm": 0.19179941713809967, + "learning_rate": 1.2718227971371397e-07, + "loss": 2.1372, + "step": 6827 + }, + { + "epoch": 2.9454388613327582, + "grad_norm": 0.20089589059352875, + "learning_rate": 1.2520337011366066e-07, + "loss": 2.069, + "step": 6828 + }, + { + "epoch": 2.9458701746819065, + "grad_norm": 0.19916307926177979, + "learning_rate": 1.2323996415905814e-07, + "loss": 2.3106, + "step": 6829 + }, + { + "epoch": 2.9463014880310547, + "grad_norm": 0.26394152641296387, + "learning_rate": 1.212920622564645e-07, + "loss": 2.0409, + "step": 6830 + }, + { + "epoch": 2.946732801380203, + "grad_norm": 0.1933462917804718, + "learning_rate": 1.193596648092071e-07, + "loss": 2.2213, + "step": 6831 + }, + { + "epoch": 2.947164114729351, + "grad_norm": 0.19561220705509186, + "learning_rate": 1.1744277221741594e-07, + "loss": 2.1113, + "step": 6832 + }, + { + "epoch": 2.947595428078499, + "grad_norm": 0.2019442617893219, + "learning_rate": 1.1554138487799847e-07, + "loss": 2.0393, + "step": 6833 + }, + { + "epoch": 2.9480267414276473, + "grad_norm": 0.20612679421901703, + "learning_rate": 1.1365550318465644e-07, + "loss": 2.1104, + "step": 6834 + }, + { + "epoch": 2.948458054776795, + "grad_norm": 0.2055290788412094, + "learning_rate": 1.1178512752788582e-07, + "loss": 2.1996, + "step": 6835 + }, + { + "epoch": 2.9488893681259434, + "grad_norm": 0.2121615707874298, + "learning_rate": 1.0993025829497682e-07, + "loss": 2.3635, + "step": 6836 + }, + { + "epoch": 2.9493206814750916, + "grad_norm": 0.19621258974075317, + "learning_rate": 1.0809089586998054e-07, + "loss": 2.1606, + "step": 6837 + }, + { + "epoch": 2.94975199482424, + "grad_norm": 0.2023998647928238, + "learning_rate": 1.0626704063377567e-07, + "loss": 2.3725, + "step": 6838 + }, + { + "epoch": 2.950183308173388, + "grad_norm": 0.20256933569908142, + "learning_rate": 1.0445869296400178e-07, + "loss": 2.1603, + "step": 6839 + }, + { + "epoch": 2.9506146215225364, + "grad_norm": 0.1928139477968216, + "learning_rate": 1.0266585323510102e-07, + "loss": 2.2596, + "step": 6840 + }, + { + "epoch": 2.951045934871684, + "grad_norm": 0.18831153213977814, + "learning_rate": 1.008885218182931e-07, + "loss": 2.1917, + "step": 6841 + }, + { + "epoch": 2.9514772482208325, + "grad_norm": 0.21388104557991028, + "learning_rate": 9.912669908160031e-08, + "loss": 2.1683, + "step": 6842 + }, + { + "epoch": 2.9519085615699807, + "grad_norm": 0.18549636006355286, + "learning_rate": 9.738038538983084e-08, + "loss": 2.1971, + "step": 6843 + }, + { + "epoch": 2.9523398749191285, + "grad_norm": 0.20190787315368652, + "learning_rate": 9.564958110457877e-08, + "loss": 2.184, + "step": 6844 + }, + { + "epoch": 2.952771188268277, + "grad_norm": 0.20602178573608398, + "learning_rate": 9.393428658420743e-08, + "loss": 2.1097, + "step": 6845 + }, + { + "epoch": 2.953202501617425, + "grad_norm": 0.21606449782848358, + "learning_rate": 9.223450218391604e-08, + "loss": 2.136, + "step": 6846 + }, + { + "epoch": 2.9536338149665733, + "grad_norm": 0.21144703030586243, + "learning_rate": 9.055022825563974e-08, + "loss": 2.1398, + "step": 6847 + }, + { + "epoch": 2.9540651283157215, + "grad_norm": 0.22148758172988892, + "learning_rate": 8.888146514813287e-08, + "loss": 1.7514, + "step": 6848 + }, + { + "epoch": 2.95449644166487, + "grad_norm": 0.19042165577411652, + "learning_rate": 8.722821320693573e-08, + "loss": 2.0921, + "step": 6849 + }, + { + "epoch": 2.9549277550140176, + "grad_norm": 0.21668869256973267, + "learning_rate": 8.559047277436614e-08, + "loss": 2.178, + "step": 6850 + }, + { + "epoch": 2.9549277550140176, + "eval_loss": 2.086881160736084, + "eval_runtime": 195.3724, + "eval_samples_per_second": 0.164, + "eval_steps_per_second": 0.164, + "step": 6850 + }, + { + "epoch": 2.955359068363166, + "grad_norm": 0.2040349543094635, + "learning_rate": 8.39682441895445e-08, + "loss": 2.2474, + "step": 6851 + }, + { + "epoch": 2.955790381712314, + "grad_norm": 0.19920016825199127, + "learning_rate": 8.236152778836047e-08, + "loss": 1.9796, + "step": 6852 + }, + { + "epoch": 2.956221695061462, + "grad_norm": 0.19970017671585083, + "learning_rate": 8.077032390351457e-08, + "loss": 2.2889, + "step": 6853 + }, + { + "epoch": 2.95665300841061, + "grad_norm": 0.20651085674762726, + "learning_rate": 7.919463286446825e-08, + "loss": 2.1612, + "step": 6854 + }, + { + "epoch": 2.9570843217597584, + "grad_norm": 0.19311478734016418, + "learning_rate": 7.763445499750221e-08, + "loss": 2.2837, + "step": 6855 + }, + { + "epoch": 2.9575156351089067, + "grad_norm": 0.20735017955303192, + "learning_rate": 7.608979062564968e-08, + "loss": 2.3233, + "step": 6856 + }, + { + "epoch": 2.957946948458055, + "grad_norm": 0.19792737066745758, + "learning_rate": 7.45606400687715e-08, + "loss": 2.1628, + "step": 6857 + }, + { + "epoch": 2.958378261807203, + "grad_norm": 0.20489484071731567, + "learning_rate": 7.304700364347271e-08, + "loss": 2.076, + "step": 6858 + }, + { + "epoch": 2.958809575156351, + "grad_norm": 0.19102559983730316, + "learning_rate": 7.154888166317762e-08, + "loss": 1.9899, + "step": 6859 + }, + { + "epoch": 2.9592408885054993, + "grad_norm": 0.1942603439092636, + "learning_rate": 7.006627443809643e-08, + "loss": 2.2985, + "step": 6860 + }, + { + "epoch": 2.9596722018546475, + "grad_norm": 0.1981513351202011, + "learning_rate": 6.859918227521688e-08, + "loss": 2.1052, + "step": 6861 + }, + { + "epoch": 2.9601035152037953, + "grad_norm": 0.1908348798751831, + "learning_rate": 6.714760547830433e-08, + "loss": 2.1002, + "step": 6862 + }, + { + "epoch": 2.9605348285529436, + "grad_norm": 0.20092721283435822, + "learning_rate": 6.571154434793502e-08, + "loss": 2.2455, + "step": 6863 + }, + { + "epoch": 2.960966141902092, + "grad_norm": 0.2005387842655182, + "learning_rate": 6.429099918146274e-08, + "loss": 2.1606, + "step": 6864 + }, + { + "epoch": 2.96139745525124, + "grad_norm": 0.2052379548549652, + "learning_rate": 6.28859702730189e-08, + "loss": 2.1535, + "step": 6865 + }, + { + "epoch": 2.9618287686003884, + "grad_norm": 0.21680006384849548, + "learning_rate": 6.149645791352908e-08, + "loss": 2.1254, + "step": 6866 + }, + { + "epoch": 2.9622600819495366, + "grad_norm": 0.19733987748622894, + "learning_rate": 6.012246239071317e-08, + "loss": 2.2219, + "step": 6867 + }, + { + "epoch": 2.9626913952986844, + "grad_norm": 0.21556080877780914, + "learning_rate": 5.876398398907689e-08, + "loss": 2.3021, + "step": 6868 + }, + { + "epoch": 2.9631227086478327, + "grad_norm": 0.17450863122940063, + "learning_rate": 5.7421022989886935e-08, + "loss": 1.8516, + "step": 6869 + }, + { + "epoch": 2.963554021996981, + "grad_norm": 0.20078998804092407, + "learning_rate": 5.609357967124584e-08, + "loss": 2.298, + "step": 6870 + }, + { + "epoch": 2.9639853353461287, + "grad_norm": 0.20523545145988464, + "learning_rate": 5.478165430800041e-08, + "loss": 2.1605, + "step": 6871 + }, + { + "epoch": 2.964416648695277, + "grad_norm": 0.24319957196712494, + "learning_rate": 5.3485247171791703e-08, + "loss": 2.306, + "step": 6872 + }, + { + "epoch": 2.9648479620444252, + "grad_norm": 13.429825782775879, + "learning_rate": 5.2204358531071636e-08, + "loss": 2.2616, + "step": 6873 + }, + { + "epoch": 2.9652792753935735, + "grad_norm": 0.20633433759212494, + "learning_rate": 5.093898865105306e-08, + "loss": 2.3296, + "step": 6874 + }, + { + "epoch": 2.9657105887427218, + "grad_norm": 0.1916525661945343, + "learning_rate": 4.9689137793743064e-08, + "loss": 2.1803, + "step": 6875 + }, + { + "epoch": 2.9657105887427218, + "eval_loss": 2.0869922637939453, + "eval_runtime": 212.9874, + "eval_samples_per_second": 0.15, + "eval_steps_per_second": 0.15, + "step": 6875 + }, + { + "epoch": 2.96614190209187, + "grad_norm": 0.1868002563714981, + "learning_rate": 4.845480621794295e-08, + "loss": 2.2291, + "step": 6876 + }, + { + "epoch": 2.966573215441018, + "grad_norm": 0.20137692987918854, + "learning_rate": 4.723599417923163e-08, + "loss": 2.1532, + "step": 6877 + }, + { + "epoch": 2.967004528790166, + "grad_norm": 0.19183698296546936, + "learning_rate": 4.6032701929973884e-08, + "loss": 2.1687, + "step": 6878 + }, + { + "epoch": 2.9674358421393143, + "grad_norm": 0.2428313046693802, + "learning_rate": 4.484492971932874e-08, + "loss": 2.2349, + "step": 6879 + }, + { + "epoch": 2.967867155488462, + "grad_norm": 0.2129974216222763, + "learning_rate": 4.367267779324113e-08, + "loss": 2.2224, + "step": 6880 + }, + { + "epoch": 2.9682984688376104, + "grad_norm": 0.20092099905014038, + "learning_rate": 4.251594639442524e-08, + "loss": 2.0661, + "step": 6881 + }, + { + "epoch": 2.9687297821867586, + "grad_norm": 0.18725912272930145, + "learning_rate": 4.137473576241446e-08, + "loss": 2.0123, + "step": 6882 + }, + { + "epoch": 2.969161095535907, + "grad_norm": 0.20052030682563782, + "learning_rate": 4.024904613348645e-08, + "loss": 2.3143, + "step": 6883 + }, + { + "epoch": 2.969592408885055, + "grad_norm": 0.20144249498844147, + "learning_rate": 3.913887774074642e-08, + "loss": 2.1777, + "step": 6884 + }, + { + "epoch": 2.9700237222342034, + "grad_norm": 0.1862250715494156, + "learning_rate": 3.8044230814060496e-08, + "loss": 2.0669, + "step": 6885 + }, + { + "epoch": 2.9704550355833512, + "grad_norm": 0.19004148244857788, + "learning_rate": 3.696510558008903e-08, + "loss": 2.1808, + "step": 6886 + }, + { + "epoch": 2.9708863489324995, + "grad_norm": 0.19828954339027405, + "learning_rate": 3.5901502262269954e-08, + "loss": 2.2483, + "step": 6887 + }, + { + "epoch": 2.9713176622816477, + "grad_norm": 0.22624792158603668, + "learning_rate": 3.4853421080835444e-08, + "loss": 2.1074, + "step": 6888 + }, + { + "epoch": 2.9717489756307955, + "grad_norm": 0.20023319125175476, + "learning_rate": 3.3820862252811884e-08, + "loss": 2.1283, + "step": 6889 + }, + { + "epoch": 2.972180288979944, + "grad_norm": 0.1807999610900879, + "learning_rate": 3.2803825991994934e-08, + "loss": 2.0663, + "step": 6890 + }, + { + "epoch": 2.972611602329092, + "grad_norm": 0.19238461554050446, + "learning_rate": 3.1802312508966146e-08, + "loss": 1.9663, + "step": 6891 + }, + { + "epoch": 2.9730429156782403, + "grad_norm": 0.1982019543647766, + "learning_rate": 3.081632201111794e-08, + "loss": 2.084, + "step": 6892 + }, + { + "epoch": 2.9734742290273886, + "grad_norm": 0.19389572739601135, + "learning_rate": 2.9845854702587043e-08, + "loss": 2.1794, + "step": 6893 + }, + { + "epoch": 2.973905542376537, + "grad_norm": 0.19226449728012085, + "learning_rate": 2.8890910784337694e-08, + "loss": 1.9868, + "step": 6894 + }, + { + "epoch": 2.9743368557256846, + "grad_norm": 0.20984889566898346, + "learning_rate": 2.7951490454095062e-08, + "loss": 2.0223, + "step": 6895 + }, + { + "epoch": 2.974768169074833, + "grad_norm": 0.42044931650161743, + "learning_rate": 2.7027593906378542e-08, + "loss": 2.2972, + "step": 6896 + }, + { + "epoch": 2.975199482423981, + "grad_norm": 0.2137708216905594, + "learning_rate": 2.611922133248512e-08, + "loss": 2.254, + "step": 6897 + }, + { + "epoch": 2.975630795773129, + "grad_norm": 0.18392716348171234, + "learning_rate": 2.5226372920514326e-08, + "loss": 2.0367, + "step": 6898 + }, + { + "epoch": 2.976062109122277, + "grad_norm": 0.20180554687976837, + "learning_rate": 2.4349048855326625e-08, + "loss": 2.1274, + "step": 6899 + }, + { + "epoch": 2.9764934224714255, + "grad_norm": 0.18473385274410248, + "learning_rate": 2.3487249318593358e-08, + "loss": 1.9986, + "step": 6900 + }, + { + "epoch": 2.9764934224714255, + "eval_loss": 2.086920738220215, + "eval_runtime": 207.6476, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 6900 + }, + { + "epoch": 2.9769247358205737, + "grad_norm": 0.20555371046066284, + "learning_rate": 2.2640974488746798e-08, + "loss": 2.2448, + "step": 6901 + }, + { + "epoch": 2.977356049169722, + "grad_norm": 0.18608687818050385, + "learning_rate": 2.1810224541030096e-08, + "loss": 2.1915, + "step": 6902 + }, + { + "epoch": 2.97778736251887, + "grad_norm": 0.19687053561210632, + "learning_rate": 2.0994999647455656e-08, + "loss": 2.1627, + "step": 6903 + }, + { + "epoch": 2.978218675868018, + "grad_norm": 0.20249271392822266, + "learning_rate": 2.0195299976821787e-08, + "loss": 2.2829, + "step": 6904 + }, + { + "epoch": 2.9786499892171663, + "grad_norm": 0.1894199401140213, + "learning_rate": 1.9411125694712704e-08, + "loss": 2.0305, + "step": 6905 + }, + { + "epoch": 2.9790813025663145, + "grad_norm": 0.20701386034488678, + "learning_rate": 1.864247696350685e-08, + "loss": 2.2719, + "step": 6906 + }, + { + "epoch": 2.9795126159154623, + "grad_norm": 0.18859870731830597, + "learning_rate": 1.788935394236024e-08, + "loss": 2.1539, + "step": 6907 + }, + { + "epoch": 2.9799439292646106, + "grad_norm": 0.2163713276386261, + "learning_rate": 1.7151756787206483e-08, + "loss": 2.0162, + "step": 6908 + }, + { + "epoch": 2.980375242613759, + "grad_norm": 0.21868695318698883, + "learning_rate": 1.642968565079006e-08, + "loss": 1.9967, + "step": 6909 + }, + { + "epoch": 2.980806555962907, + "grad_norm": 0.1910344958305359, + "learning_rate": 1.572314068260805e-08, + "loss": 2.0954, + "step": 6910 + }, + { + "epoch": 2.9812378693120554, + "grad_norm": 0.2031833529472351, + "learning_rate": 1.503212202896009e-08, + "loss": 2.2385, + "step": 6911 + }, + { + "epoch": 2.9816691826612036, + "grad_norm": 0.20792736113071442, + "learning_rate": 1.435662983294006e-08, + "loss": 2.162, + "step": 6912 + }, + { + "epoch": 2.9821004960103514, + "grad_norm": 0.22117304801940918, + "learning_rate": 1.369666423441107e-08, + "loss": 1.8649, + "step": 6913 + }, + { + "epoch": 2.9825318093594997, + "grad_norm": 0.2013591080904007, + "learning_rate": 1.3052225370013825e-08, + "loss": 2.0469, + "step": 6914 + }, + { + "epoch": 2.982963122708648, + "grad_norm": 0.3142205774784088, + "learning_rate": 1.2423313373199906e-08, + "loss": 2.0327, + "step": 6915 + }, + { + "epoch": 2.9833944360577958, + "grad_norm": 0.2305009365081787, + "learning_rate": 1.1809928374198475e-08, + "loss": 2.0315, + "step": 6916 + }, + { + "epoch": 2.983825749406944, + "grad_norm": 0.20702128112316132, + "learning_rate": 1.1212070499999615e-08, + "loss": 2.0369, + "step": 6917 + }, + { + "epoch": 2.9842570627560923, + "grad_norm": 0.18710415065288544, + "learning_rate": 1.0629739874404297e-08, + "loss": 1.9671, + "step": 6918 + }, + { + "epoch": 2.9846883761052405, + "grad_norm": 0.20689338445663452, + "learning_rate": 1.006293661799107e-08, + "loss": 2.1959, + "step": 6919 + }, + { + "epoch": 2.9851196894543888, + "grad_norm": 0.19970650970935822, + "learning_rate": 9.511660848132707e-09, + "loss": 2.1549, + "step": 6920 + }, + { + "epoch": 2.985551002803537, + "grad_norm": 0.19728341698646545, + "learning_rate": 8.975912678954589e-09, + "loss": 1.9665, + "step": 6921 + }, + { + "epoch": 2.985982316152685, + "grad_norm": 0.2005823254585266, + "learning_rate": 8.455692221409627e-09, + "loss": 2.3352, + "step": 6922 + }, + { + "epoch": 2.986413629501833, + "grad_norm": 0.20024988055229187, + "learning_rate": 7.950999583203333e-09, + "loss": 2.3154, + "step": 6923 + }, + { + "epoch": 2.9868449428509813, + "grad_norm": 0.19718816876411438, + "learning_rate": 7.461834868843775e-09, + "loss": 2.1285, + "step": 6924 + }, + { + "epoch": 2.987276256200129, + "grad_norm": 0.20390065014362335, + "learning_rate": 6.988198179616599e-09, + "loss": 2.1669, + "step": 6925 + }, + { + "epoch": 2.987276256200129, + "eval_loss": 2.0870180130004883, + "eval_runtime": 207.3305, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 6925 + }, + { + "epoch": 2.9877075695492774, + "grad_norm": 0.197320818901062, + "learning_rate": 6.530089613593359e-09, + "loss": 2.139, + "step": 6926 + }, + { + "epoch": 2.9881388828984257, + "grad_norm": 0.19850942492485046, + "learning_rate": 6.087509265623181e-09, + "loss": 2.1301, + "step": 6927 + }, + { + "epoch": 2.988570196247574, + "grad_norm": 0.252987802028656, + "learning_rate": 5.660457227357751e-09, + "loss": 2.1739, + "step": 6928 + }, + { + "epoch": 2.989001509596722, + "grad_norm": 0.20021499693393707, + "learning_rate": 5.248933587218007e-09, + "loss": 2.0975, + "step": 6929 + }, + { + "epoch": 2.9894328229458704, + "grad_norm": 0.18422454595565796, + "learning_rate": 4.852938430419118e-09, + "loss": 2.1753, + "step": 6930 + }, + { + "epoch": 2.9898641362950182, + "grad_norm": 0.19496819376945496, + "learning_rate": 4.4724718389538286e-09, + "loss": 2.1651, + "step": 6931 + }, + { + "epoch": 2.9902954496441665, + "grad_norm": 0.18620194494724274, + "learning_rate": 4.107533891592462e-09, + "loss": 1.9124, + "step": 6932 + }, + { + "epoch": 2.9907267629933147, + "grad_norm": 0.19842314720153809, + "learning_rate": 3.758124663916229e-09, + "loss": 2.1233, + "step": 6933 + }, + { + "epoch": 2.9911580763424626, + "grad_norm": 0.19758078455924988, + "learning_rate": 3.4242442282589322e-09, + "loss": 2.0477, + "step": 6934 + }, + { + "epoch": 2.991589389691611, + "grad_norm": 0.2080450803041458, + "learning_rate": 3.105892653765263e-09, + "loss": 2.1694, + "step": 6935 + }, + { + "epoch": 2.992020703040759, + "grad_norm": 0.20617592334747314, + "learning_rate": 2.803070006340835e-09, + "loss": 2.2902, + "step": 6936 + }, + { + "epoch": 2.9924520163899073, + "grad_norm": 0.20559188723564148, + "learning_rate": 2.5157763487021477e-09, + "loss": 1.8361, + "step": 6937 + }, + { + "epoch": 2.9928833297390556, + "grad_norm": 0.20320501923561096, + "learning_rate": 2.2440117403266233e-09, + "loss": 1.8617, + "step": 6938 + }, + { + "epoch": 2.993314643088204, + "grad_norm": 0.18574926257133484, + "learning_rate": 1.987776237494243e-09, + "loss": 2.0917, + "step": 6939 + }, + { + "epoch": 2.9937459564373516, + "grad_norm": 0.1990656554698944, + "learning_rate": 1.747069893262565e-09, + "loss": 2.3291, + "step": 6940 + }, + { + "epoch": 2.9941772697865, + "grad_norm": 0.20082135498523712, + "learning_rate": 1.5218927574583983e-09, + "loss": 2.1523, + "step": 6941 + }, + { + "epoch": 2.994608583135648, + "grad_norm": 0.2077145278453827, + "learning_rate": 1.312244876719437e-09, + "loss": 2.3263, + "step": 6942 + }, + { + "epoch": 2.995039896484796, + "grad_norm": 0.1954571008682251, + "learning_rate": 1.1181262944442993e-09, + "loss": 2.1045, + "step": 6943 + }, + { + "epoch": 2.995471209833944, + "grad_norm": 0.18653079867362976, + "learning_rate": 9.395370508424871e-10, + "loss": 1.9513, + "step": 6944 + }, + { + "epoch": 2.9959025231830925, + "grad_norm": 0.18665404617786407, + "learning_rate": 7.764771828844274e-10, + "loss": 2.1955, + "step": 6945 + }, + { + "epoch": 2.9963338365322407, + "grad_norm": 0.2045997977256775, + "learning_rate": 6.289467243347779e-10, + "loss": 2.2903, + "step": 6946 + }, + { + "epoch": 2.996765149881389, + "grad_norm": 0.18565250933170319, + "learning_rate": 4.969457057441007e-10, + "loss": 2.2809, + "step": 6947 + }, + { + "epoch": 2.9971964632305372, + "grad_norm": 0.21470415592193604, + "learning_rate": 3.8047415443220873e-10, + "loss": 2.1994, + "step": 6948 + }, + { + "epoch": 2.997627776579685, + "grad_norm": 0.20408982038497925, + "learning_rate": 2.795320945297996e-10, + "loss": 2.2236, + "step": 6949 + }, + { + "epoch": 2.9980590899288333, + "grad_norm": 0.20640811324119568, + "learning_rate": 1.9411954693682176e-10, + "loss": 2.0928, + "step": 6950 + }, + { + "epoch": 2.9980590899288333, + "eval_loss": 2.0869898796081543, + "eval_runtime": 207.3656, + "eval_samples_per_second": 0.154, + "eval_steps_per_second": 0.154, + "step": 6950 + }, + { + "epoch": 2.9984904032779816, + "grad_norm": 0.19671790301799774, + "learning_rate": 1.242365293308012e-10, + "loss": 2.1314, + "step": 6951 + }, + { + "epoch": 2.9989217166271294, + "grad_norm": 0.20118442177772522, + "learning_rate": 6.988305619182178e-11, + "loss": 2.0186, + "step": 6952 + }, + { + "epoch": 2.9993530299762776, + "grad_norm": 0.19977568089962006, + "learning_rate": 3.105913876089161e-11, + "loss": 2.1311, + "step": 6953 + }, + { + "epoch": 2.999784343325426, + "grad_norm": 0.2036544382572174, + "learning_rate": 7.764785089903192e-12, + "loss": 2.289, + "step": 6954 + } + ], + "logging_steps": 1, + "max_steps": 6954, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.62625932186227e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}