{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0080277626792657, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006689802232721495, "grad_norm": 1.7879213094711304, "learning_rate": 1.11358574610245e-07, "loss": 3.5222, "step": 1 }, { "epoch": 0.001337960446544299, "grad_norm": 1.4433249235153198, "learning_rate": 2.2271714922049e-07, "loss": 3.6449, "step": 2 }, { "epoch": 0.0020069406698164484, "grad_norm": 1.370274543762207, "learning_rate": 3.34075723830735e-07, "loss": 3.4489, "step": 3 }, { "epoch": 0.002675920893088598, "grad_norm": 1.2992745637893677, "learning_rate": 4.4543429844098e-07, "loss": 3.623, "step": 4 }, { "epoch": 0.0033449011163607474, "grad_norm": 1.18953275680542, "learning_rate": 5.567928730512249e-07, "loss": 3.2437, "step": 5 }, { "epoch": 0.004013881339632897, "grad_norm": 1.7558338642120361, "learning_rate": 6.6815144766147e-07, "loss": 3.8406, "step": 6 }, { "epoch": 0.004682861562905046, "grad_norm": 1.383160948753357, "learning_rate": 7.79510022271715e-07, "loss": 3.4969, "step": 7 }, { "epoch": 0.005351841786177196, "grad_norm": 1.0541542768478394, "learning_rate": 8.9086859688196e-07, "loss": 3.3137, "step": 8 }, { "epoch": 0.006020822009449345, "grad_norm": 1.486899733543396, "learning_rate": 1.002227171492205e-06, "loss": 3.6094, "step": 9 }, { "epoch": 0.006689802232721495, "grad_norm": 1.3455109596252441, "learning_rate": 1.1135857461024499e-06, "loss": 3.3945, "step": 10 }, { "epoch": 0.007358782455993645, "grad_norm": 1.5229488611221313, "learning_rate": 1.224944320712695e-06, "loss": 3.379, "step": 11 }, { "epoch": 0.008027762679265794, "grad_norm": 1.6204516887664795, "learning_rate": 1.33630289532294e-06, "loss": 3.4507, "step": 12 }, { "epoch": 0.008696742902537944, "grad_norm": 1.4001444578170776, "learning_rate": 1.447661469933185e-06, "loss": 3.3992, "step": 13 }, { "epoch": 0.009365723125810093, "grad_norm": 1.6883145570755005, "learning_rate": 1.55902004454343e-06, "loss": 3.6524, "step": 14 }, { "epoch": 0.010034703349082243, "grad_norm": 1.1602007150650024, "learning_rate": 1.670378619153675e-06, "loss": 3.4021, "step": 15 }, { "epoch": 0.010703683572354392, "grad_norm": 1.2480249404907227, "learning_rate": 1.78173719376392e-06, "loss": 3.4262, "step": 16 }, { "epoch": 0.011372663795626542, "grad_norm": 1.0981214046478271, "learning_rate": 1.893095768374165e-06, "loss": 3.3911, "step": 17 }, { "epoch": 0.01204164401889869, "grad_norm": 1.5630245208740234, "learning_rate": 2.00445434298441e-06, "loss": 3.7733, "step": 18 }, { "epoch": 0.012710624242170841, "grad_norm": 1.4163099527359009, "learning_rate": 2.1158129175946547e-06, "loss": 3.544, "step": 19 }, { "epoch": 0.01337960446544299, "grad_norm": 1.4298584461212158, "learning_rate": 2.2271714922048998e-06, "loss": 3.4907, "step": 20 }, { "epoch": 0.01404858468871514, "grad_norm": 1.3225711584091187, "learning_rate": 2.338530066815145e-06, "loss": 3.3728, "step": 21 }, { "epoch": 0.01471756491198729, "grad_norm": 1.684406042098999, "learning_rate": 2.44988864142539e-06, "loss": 3.835, "step": 22 }, { "epoch": 0.015386545135259439, "grad_norm": 1.498872995376587, "learning_rate": 2.561247216035635e-06, "loss": 3.4506, "step": 23 }, { "epoch": 0.016055525358531587, "grad_norm": 1.7191704511642456, "learning_rate": 2.67260579064588e-06, "loss": 3.7095, "step": 24 }, { "epoch": 0.016724505581803738, "grad_norm": 1.579593539237976, "learning_rate": 2.783964365256125e-06, "loss": 3.5933, "step": 25 }, { "epoch": 0.017393485805075888, "grad_norm": 1.3628127574920654, "learning_rate": 2.89532293986637e-06, "loss": 3.199, "step": 26 }, { "epoch": 0.01806246602834804, "grad_norm": 1.6564180850982666, "learning_rate": 3.006681514476615e-06, "loss": 3.725, "step": 27 }, { "epoch": 0.018731446251620185, "grad_norm": 1.3412981033325195, "learning_rate": 3.11804008908686e-06, "loss": 3.5123, "step": 28 }, { "epoch": 0.019400426474892336, "grad_norm": 0.9989301562309265, "learning_rate": 3.229398663697105e-06, "loss": 2.9673, "step": 29 }, { "epoch": 0.020069406698164486, "grad_norm": 1.3338531255722046, "learning_rate": 3.34075723830735e-06, "loss": 3.299, "step": 30 }, { "epoch": 0.020738386921436636, "grad_norm": 1.562309741973877, "learning_rate": 3.4521158129175947e-06, "loss": 3.5201, "step": 31 }, { "epoch": 0.021407367144708783, "grad_norm": 1.2774256467819214, "learning_rate": 3.56347438752784e-06, "loss": 3.2878, "step": 32 }, { "epoch": 0.022076347367980934, "grad_norm": 1.455329418182373, "learning_rate": 3.674832962138085e-06, "loss": 3.4257, "step": 33 }, { "epoch": 0.022745327591253084, "grad_norm": 1.2751973867416382, "learning_rate": 3.78619153674833e-06, "loss": 3.3183, "step": 34 }, { "epoch": 0.023414307814525234, "grad_norm": 1.400235652923584, "learning_rate": 3.897550111358575e-06, "loss": 3.5279, "step": 35 }, { "epoch": 0.02408328803779738, "grad_norm": 1.029577612876892, "learning_rate": 4.00890868596882e-06, "loss": 2.774, "step": 36 }, { "epoch": 0.02475226826106953, "grad_norm": 1.280173897743225, "learning_rate": 4.120267260579064e-06, "loss": 3.4011, "step": 37 }, { "epoch": 0.025421248484341682, "grad_norm": 1.3390440940856934, "learning_rate": 4.231625835189309e-06, "loss": 3.4696, "step": 38 }, { "epoch": 0.026090228707613832, "grad_norm": 1.429360270500183, "learning_rate": 4.3429844097995545e-06, "loss": 3.5177, "step": 39 }, { "epoch": 0.02675920893088598, "grad_norm": 1.375282645225525, "learning_rate": 4.4543429844097995e-06, "loss": 3.5709, "step": 40 }, { "epoch": 0.02742818915415813, "grad_norm": 1.241412878036499, "learning_rate": 4.565701559020045e-06, "loss": 3.2464, "step": 41 }, { "epoch": 0.02809716937743028, "grad_norm": 1.5399926900863647, "learning_rate": 4.67706013363029e-06, "loss": 3.2398, "step": 42 }, { "epoch": 0.02876614960070243, "grad_norm": 1.683517575263977, "learning_rate": 4.788418708240535e-06, "loss": 3.6636, "step": 43 }, { "epoch": 0.02943512982397458, "grad_norm": 1.3960689306259155, "learning_rate": 4.89977728285078e-06, "loss": 3.3246, "step": 44 }, { "epoch": 0.030104110047246727, "grad_norm": 1.0281037092208862, "learning_rate": 5.011135857461025e-06, "loss": 3.2595, "step": 45 }, { "epoch": 0.030773090270518878, "grad_norm": 1.204046607017517, "learning_rate": 5.12249443207127e-06, "loss": 3.1779, "step": 46 }, { "epoch": 0.031442070493791024, "grad_norm": 1.2449531555175781, "learning_rate": 5.233853006681515e-06, "loss": 3.1104, "step": 47 }, { "epoch": 0.032111050717063175, "grad_norm": 1.1790910959243774, "learning_rate": 5.34521158129176e-06, "loss": 3.2203, "step": 48 }, { "epoch": 0.032780030940335325, "grad_norm": 1.266721248626709, "learning_rate": 5.456570155902005e-06, "loss": 3.4279, "step": 49 }, { "epoch": 0.033449011163607476, "grad_norm": 1.5345031023025513, "learning_rate": 5.56792873051225e-06, "loss": 3.4272, "step": 50 }, { "epoch": 0.034117991386879626, "grad_norm": 1.250928521156311, "learning_rate": 5.6792873051224945e-06, "loss": 3.124, "step": 51 }, { "epoch": 0.034786971610151776, "grad_norm": 1.2841166257858276, "learning_rate": 5.79064587973274e-06, "loss": 3.1168, "step": 52 }, { "epoch": 0.03545595183342393, "grad_norm": 1.311871886253357, "learning_rate": 5.902004454342985e-06, "loss": 3.1856, "step": 53 }, { "epoch": 0.03612493205669608, "grad_norm": 1.2422196865081787, "learning_rate": 6.01336302895323e-06, "loss": 3.2101, "step": 54 }, { "epoch": 0.03679391227996822, "grad_norm": 1.347784399986267, "learning_rate": 6.124721603563475e-06, "loss": 3.1658, "step": 55 }, { "epoch": 0.03746289250324037, "grad_norm": 1.3649243116378784, "learning_rate": 6.23608017817372e-06, "loss": 3.0376, "step": 56 }, { "epoch": 0.03813187272651252, "grad_norm": 1.2381495237350464, "learning_rate": 6.347438752783964e-06, "loss": 2.7926, "step": 57 }, { "epoch": 0.03880085294978467, "grad_norm": 1.3895397186279297, "learning_rate": 6.45879732739421e-06, "loss": 3.194, "step": 58 }, { "epoch": 0.03946983317305682, "grad_norm": 1.2116481065750122, "learning_rate": 6.570155902004454e-06, "loss": 3.1445, "step": 59 }, { "epoch": 0.04013881339632897, "grad_norm": 1.3978331089019775, "learning_rate": 6.6815144766147e-06, "loss": 3.218, "step": 60 }, { "epoch": 0.04080779361960112, "grad_norm": 1.3602263927459717, "learning_rate": 6.792873051224944e-06, "loss": 3.2545, "step": 61 }, { "epoch": 0.04147677384287327, "grad_norm": 1.4267244338989258, "learning_rate": 6.9042316258351895e-06, "loss": 3.353, "step": 62 }, { "epoch": 0.04214575406614542, "grad_norm": 1.3509756326675415, "learning_rate": 7.0155902004454345e-06, "loss": 3.1955, "step": 63 }, { "epoch": 0.042814734289417566, "grad_norm": 1.8327754735946655, "learning_rate": 7.12694877505568e-06, "loss": 3.408, "step": 64 }, { "epoch": 0.04348371451268972, "grad_norm": 1.5196959972381592, "learning_rate": 7.238307349665925e-06, "loss": 3.2611, "step": 65 }, { "epoch": 0.04415269473596187, "grad_norm": 1.2716466188430786, "learning_rate": 7.34966592427617e-06, "loss": 3.0723, "step": 66 }, { "epoch": 0.04482167495923402, "grad_norm": 1.5642995834350586, "learning_rate": 7.461024498886416e-06, "loss": 3.2954, "step": 67 }, { "epoch": 0.04549065518250617, "grad_norm": 1.3303098678588867, "learning_rate": 7.57238307349666e-06, "loss": 3.2496, "step": 68 }, { "epoch": 0.04615963540577832, "grad_norm": 1.5286786556243896, "learning_rate": 7.683741648106903e-06, "loss": 3.1971, "step": 69 }, { "epoch": 0.04682861562905047, "grad_norm": 1.5279144048690796, "learning_rate": 7.79510022271715e-06, "loss": 3.0903, "step": 70 }, { "epoch": 0.04749759585232262, "grad_norm": 1.0978755950927734, "learning_rate": 7.906458797327395e-06, "loss": 2.8862, "step": 71 }, { "epoch": 0.04816657607559476, "grad_norm": 1.305240511894226, "learning_rate": 8.01781737193764e-06, "loss": 2.9631, "step": 72 }, { "epoch": 0.04883555629886691, "grad_norm": 1.7665414810180664, "learning_rate": 8.129175946547885e-06, "loss": 3.0723, "step": 73 }, { "epoch": 0.04950453652213906, "grad_norm": 1.4403198957443237, "learning_rate": 8.240534521158129e-06, "loss": 2.921, "step": 74 }, { "epoch": 0.05017351674541121, "grad_norm": 1.5842571258544922, "learning_rate": 8.351893095768375e-06, "loss": 3.2412, "step": 75 }, { "epoch": 0.050842496968683364, "grad_norm": 1.4527440071105957, "learning_rate": 8.463251670378619e-06, "loss": 3.0784, "step": 76 }, { "epoch": 0.051511477191955514, "grad_norm": 1.566912055015564, "learning_rate": 8.574610244988866e-06, "loss": 3.3487, "step": 77 }, { "epoch": 0.052180457415227664, "grad_norm": 1.4142961502075195, "learning_rate": 8.685968819599109e-06, "loss": 3.165, "step": 78 }, { "epoch": 0.052849437638499815, "grad_norm": 1.6616283655166626, "learning_rate": 8.797327394209356e-06, "loss": 2.9588, "step": 79 }, { "epoch": 0.05351841786177196, "grad_norm": 1.506828784942627, "learning_rate": 8.908685968819599e-06, "loss": 3.0466, "step": 80 }, { "epoch": 0.05418739808504411, "grad_norm": 1.2490448951721191, "learning_rate": 9.020044543429844e-06, "loss": 2.8133, "step": 81 }, { "epoch": 0.05485637830831626, "grad_norm": 1.604492425918579, "learning_rate": 9.13140311804009e-06, "loss": 3.0556, "step": 82 }, { "epoch": 0.05552535853158841, "grad_norm": 1.5094462633132935, "learning_rate": 9.242761692650334e-06, "loss": 3.0852, "step": 83 }, { "epoch": 0.05619433875486056, "grad_norm": 1.6594133377075195, "learning_rate": 9.35412026726058e-06, "loss": 3.1974, "step": 84 }, { "epoch": 0.05686331897813271, "grad_norm": 1.156653881072998, "learning_rate": 9.465478841870824e-06, "loss": 2.8347, "step": 85 }, { "epoch": 0.05753229920140486, "grad_norm": 1.2661216259002686, "learning_rate": 9.57683741648107e-06, "loss": 2.8568, "step": 86 }, { "epoch": 0.05820127942467701, "grad_norm": 1.5680220127105713, "learning_rate": 9.688195991091315e-06, "loss": 3.3677, "step": 87 }, { "epoch": 0.05887025964794916, "grad_norm": 1.2300074100494385, "learning_rate": 9.79955456570156e-06, "loss": 2.9659, "step": 88 }, { "epoch": 0.059539239871221304, "grad_norm": 1.4609466791152954, "learning_rate": 9.910913140311805e-06, "loss": 3.2218, "step": 89 }, { "epoch": 0.060208220094493455, "grad_norm": 1.5475412607192993, "learning_rate": 1.002227171492205e-05, "loss": 3.0252, "step": 90 }, { "epoch": 0.060877200317765605, "grad_norm": 1.712816834449768, "learning_rate": 1.0133630289532295e-05, "loss": 3.1354, "step": 91 }, { "epoch": 0.061546180541037755, "grad_norm": 1.9765368700027466, "learning_rate": 1.024498886414254e-05, "loss": 3.2223, "step": 92 }, { "epoch": 0.062215160764309906, "grad_norm": 1.6176987886428833, "learning_rate": 1.0356347438752785e-05, "loss": 3.1179, "step": 93 }, { "epoch": 0.06288414098758205, "grad_norm": 7.536865711212158, "learning_rate": 1.046770601336303e-05, "loss": 3.4484, "step": 94 }, { "epoch": 0.0635531212108542, "grad_norm": 1.5932583808898926, "learning_rate": 1.0579064587973274e-05, "loss": 2.9496, "step": 95 }, { "epoch": 0.06422210143412635, "grad_norm": 1.488038420677185, "learning_rate": 1.069042316258352e-05, "loss": 3.119, "step": 96 }, { "epoch": 0.0648910816573985, "grad_norm": 1.9568957090377808, "learning_rate": 1.0801781737193764e-05, "loss": 3.3372, "step": 97 }, { "epoch": 0.06556006188067065, "grad_norm": 1.7145527601242065, "learning_rate": 1.091314031180401e-05, "loss": 3.0294, "step": 98 }, { "epoch": 0.0662290421039428, "grad_norm": 1.7908884286880493, "learning_rate": 1.1024498886414254e-05, "loss": 3.241, "step": 99 }, { "epoch": 0.06689802232721495, "grad_norm": 1.2711106538772583, "learning_rate": 1.11358574610245e-05, "loss": 2.9724, "step": 100 }, { "epoch": 0.0675670025504871, "grad_norm": 1.7003251314163208, "learning_rate": 1.1247216035634744e-05, "loss": 3.0808, "step": 101 }, { "epoch": 0.06823598277375925, "grad_norm": 1.5021196603775024, "learning_rate": 1.1358574610244989e-05, "loss": 3.0445, "step": 102 }, { "epoch": 0.0689049629970314, "grad_norm": 1.330795168876648, "learning_rate": 1.1469933184855234e-05, "loss": 3.0629, "step": 103 }, { "epoch": 0.06957394322030355, "grad_norm": 1.3143346309661865, "learning_rate": 1.158129175946548e-05, "loss": 3.0155, "step": 104 }, { "epoch": 0.0702429234435757, "grad_norm": 1.519888162612915, "learning_rate": 1.1692650334075724e-05, "loss": 3.2906, "step": 105 }, { "epoch": 0.07091190366684785, "grad_norm": 1.4895938634872437, "learning_rate": 1.180400890868597e-05, "loss": 3.0162, "step": 106 }, { "epoch": 0.07158088389012, "grad_norm": 1.589227318763733, "learning_rate": 1.1915367483296214e-05, "loss": 2.9903, "step": 107 }, { "epoch": 0.07224986411339215, "grad_norm": 1.701695203781128, "learning_rate": 1.202672605790646e-05, "loss": 3.1095, "step": 108 }, { "epoch": 0.0729188443366643, "grad_norm": 1.981400489807129, "learning_rate": 1.2138084632516705e-05, "loss": 3.2036, "step": 109 }, { "epoch": 0.07358782455993644, "grad_norm": 1.8499116897583008, "learning_rate": 1.224944320712695e-05, "loss": 3.0118, "step": 110 }, { "epoch": 0.07425680478320859, "grad_norm": 1.604082465171814, "learning_rate": 1.2360801781737195e-05, "loss": 3.2236, "step": 111 }, { "epoch": 0.07492578500648074, "grad_norm": 2.0322389602661133, "learning_rate": 1.247216035634744e-05, "loss": 3.3046, "step": 112 }, { "epoch": 0.07559476522975289, "grad_norm": 2.7553274631500244, "learning_rate": 1.2583518930957685e-05, "loss": 3.1316, "step": 113 }, { "epoch": 0.07626374545302504, "grad_norm": 1.6986571550369263, "learning_rate": 1.2694877505567928e-05, "loss": 3.182, "step": 114 }, { "epoch": 0.07693272567629719, "grad_norm": 1.5125391483306885, "learning_rate": 1.2806236080178175e-05, "loss": 3.1322, "step": 115 }, { "epoch": 0.07760170589956934, "grad_norm": 1.5707584619522095, "learning_rate": 1.291759465478842e-05, "loss": 3.045, "step": 116 }, { "epoch": 0.07827068612284149, "grad_norm": 1.6684489250183105, "learning_rate": 1.3028953229398663e-05, "loss": 3.0713, "step": 117 }, { "epoch": 0.07893966634611364, "grad_norm": 1.5122387409210205, "learning_rate": 1.3140311804008909e-05, "loss": 3.022, "step": 118 }, { "epoch": 0.0796086465693858, "grad_norm": 2.1538209915161133, "learning_rate": 1.3251670378619155e-05, "loss": 3.2607, "step": 119 }, { "epoch": 0.08027762679265794, "grad_norm": 1.4033334255218506, "learning_rate": 1.33630289532294e-05, "loss": 3.0916, "step": 120 }, { "epoch": 0.0809466070159301, "grad_norm": 1.8540390729904175, "learning_rate": 1.3474387527839644e-05, "loss": 3.1077, "step": 121 }, { "epoch": 0.08161558723920224, "grad_norm": 1.456629753112793, "learning_rate": 1.3585746102449889e-05, "loss": 2.9815, "step": 122 }, { "epoch": 0.0822845674624744, "grad_norm": 1.5381335020065308, "learning_rate": 1.3697104677060136e-05, "loss": 2.9814, "step": 123 }, { "epoch": 0.08295354768574655, "grad_norm": 1.2347851991653442, "learning_rate": 1.3808463251670379e-05, "loss": 2.7295, "step": 124 }, { "epoch": 0.0836225279090187, "grad_norm": 1.2295702695846558, "learning_rate": 1.3919821826280624e-05, "loss": 2.8827, "step": 125 }, { "epoch": 0.08429150813229085, "grad_norm": 1.7871609926223755, "learning_rate": 1.4031180400890869e-05, "loss": 3.0303, "step": 126 }, { "epoch": 0.08496048835556298, "grad_norm": 1.6743766069412231, "learning_rate": 1.4142538975501116e-05, "loss": 2.9467, "step": 127 }, { "epoch": 0.08562946857883513, "grad_norm": 1.4504770040512085, "learning_rate": 1.425389755011136e-05, "loss": 3.3573, "step": 128 }, { "epoch": 0.08629844880210728, "grad_norm": 1.6345608234405518, "learning_rate": 1.4365256124721604e-05, "loss": 2.9953, "step": 129 }, { "epoch": 0.08696742902537943, "grad_norm": 1.3987749814987183, "learning_rate": 1.447661469933185e-05, "loss": 2.7236, "step": 130 }, { "epoch": 0.08763640924865158, "grad_norm": 1.359086275100708, "learning_rate": 1.4587973273942093e-05, "loss": 2.8055, "step": 131 }, { "epoch": 0.08830538947192373, "grad_norm": 1.4174124002456665, "learning_rate": 1.469933184855234e-05, "loss": 2.9624, "step": 132 }, { "epoch": 0.08897436969519588, "grad_norm": 1.8663897514343262, "learning_rate": 1.4810690423162585e-05, "loss": 2.9814, "step": 133 }, { "epoch": 0.08964334991846803, "grad_norm": 2.121668577194214, "learning_rate": 1.4922048997772831e-05, "loss": 3.1841, "step": 134 }, { "epoch": 0.09031233014174019, "grad_norm": 1.5990877151489258, "learning_rate": 1.5033407572383073e-05, "loss": 2.9322, "step": 135 }, { "epoch": 0.09098131036501234, "grad_norm": 2.1767890453338623, "learning_rate": 1.514476614699332e-05, "loss": 3.1197, "step": 136 }, { "epoch": 0.09165029058828449, "grad_norm": 1.5318809747695923, "learning_rate": 1.5256124721603565e-05, "loss": 2.9157, "step": 137 }, { "epoch": 0.09231927081155664, "grad_norm": 2.0057833194732666, "learning_rate": 1.5367483296213807e-05, "loss": 3.0975, "step": 138 }, { "epoch": 0.09298825103482879, "grad_norm": 1.485335350036621, "learning_rate": 1.5478841870824053e-05, "loss": 2.9477, "step": 139 }, { "epoch": 0.09365723125810094, "grad_norm": 1.8702194690704346, "learning_rate": 1.55902004454343e-05, "loss": 3.0644, "step": 140 }, { "epoch": 0.09432621148137309, "grad_norm": 1.8341530561447144, "learning_rate": 1.5701559020044543e-05, "loss": 3.0453, "step": 141 }, { "epoch": 0.09499519170464524, "grad_norm": 1.6797006130218506, "learning_rate": 1.581291759465479e-05, "loss": 3.0143, "step": 142 }, { "epoch": 0.09566417192791737, "grad_norm": 1.846571445465088, "learning_rate": 1.5924276169265034e-05, "loss": 3.18, "step": 143 }, { "epoch": 0.09633315215118952, "grad_norm": 1.3143115043640137, "learning_rate": 1.603563474387528e-05, "loss": 2.6912, "step": 144 }, { "epoch": 0.09700213237446167, "grad_norm": 1.8494148254394531, "learning_rate": 1.6146993318485524e-05, "loss": 3.0117, "step": 145 }, { "epoch": 0.09767111259773383, "grad_norm": 1.6968753337860107, "learning_rate": 1.625835189309577e-05, "loss": 3.0856, "step": 146 }, { "epoch": 0.09834009282100598, "grad_norm": 1.684114933013916, "learning_rate": 1.6369710467706014e-05, "loss": 3.1856, "step": 147 }, { "epoch": 0.09900907304427813, "grad_norm": 1.4782801866531372, "learning_rate": 1.6481069042316257e-05, "loss": 2.9094, "step": 148 }, { "epoch": 0.09967805326755028, "grad_norm": 1.9199340343475342, "learning_rate": 1.6592427616926504e-05, "loss": 2.9601, "step": 149 }, { "epoch": 0.10034703349082243, "grad_norm": 1.7519869804382324, "learning_rate": 1.670378619153675e-05, "loss": 3.0994, "step": 150 }, { "epoch": 0.10101601371409458, "grad_norm": 2.247997760772705, "learning_rate": 1.6815144766146994e-05, "loss": 3.277, "step": 151 }, { "epoch": 0.10168499393736673, "grad_norm": 1.676098108291626, "learning_rate": 1.6926503340757238e-05, "loss": 3.0809, "step": 152 }, { "epoch": 0.10235397416063888, "grad_norm": 1.3067691326141357, "learning_rate": 1.7037861915367484e-05, "loss": 2.8895, "step": 153 }, { "epoch": 0.10302295438391103, "grad_norm": 1.6931229829788208, "learning_rate": 1.714922048997773e-05, "loss": 3.0187, "step": 154 }, { "epoch": 0.10369193460718318, "grad_norm": 2.1117000579833984, "learning_rate": 1.726057906458797e-05, "loss": 3.2156, "step": 155 }, { "epoch": 0.10436091483045533, "grad_norm": 1.5815112590789795, "learning_rate": 1.7371937639198218e-05, "loss": 2.8911, "step": 156 }, { "epoch": 0.10502989505372748, "grad_norm": 1.5297530889511108, "learning_rate": 1.7483296213808465e-05, "loss": 3.0298, "step": 157 }, { "epoch": 0.10569887527699963, "grad_norm": 1.5161750316619873, "learning_rate": 1.759465478841871e-05, "loss": 3.0969, "step": 158 }, { "epoch": 0.10636785550027178, "grad_norm": 1.600009560585022, "learning_rate": 1.7706013363028955e-05, "loss": 2.9769, "step": 159 }, { "epoch": 0.10703683572354392, "grad_norm": 1.8170793056488037, "learning_rate": 1.7817371937639198e-05, "loss": 2.9447, "step": 160 }, { "epoch": 0.10770581594681607, "grad_norm": 1.4646176099777222, "learning_rate": 1.7928730512249445e-05, "loss": 2.9214, "step": 161 }, { "epoch": 0.10837479617008822, "grad_norm": 1.8447264432907104, "learning_rate": 1.804008908685969e-05, "loss": 3.1705, "step": 162 }, { "epoch": 0.10904377639336037, "grad_norm": 2.4299709796905518, "learning_rate": 1.8151447661469935e-05, "loss": 3.016, "step": 163 }, { "epoch": 0.10971275661663252, "grad_norm": 1.999182105064392, "learning_rate": 1.826280623608018e-05, "loss": 2.98, "step": 164 }, { "epoch": 0.11038173683990467, "grad_norm": 2.1972455978393555, "learning_rate": 1.8374164810690425e-05, "loss": 3.1834, "step": 165 }, { "epoch": 0.11105071706317682, "grad_norm": 2.052306652069092, "learning_rate": 1.848552338530067e-05, "loss": 2.9967, "step": 166 }, { "epoch": 0.11171969728644897, "grad_norm": 1.6119320392608643, "learning_rate": 1.8596881959910915e-05, "loss": 2.9699, "step": 167 }, { "epoch": 0.11238867750972112, "grad_norm": 2.123548746109009, "learning_rate": 1.870824053452116e-05, "loss": 3.2142, "step": 168 }, { "epoch": 0.11305765773299327, "grad_norm": 2.0712902545928955, "learning_rate": 1.8819599109131402e-05, "loss": 2.9206, "step": 169 }, { "epoch": 0.11372663795626542, "grad_norm": 1.5530469417572021, "learning_rate": 1.893095768374165e-05, "loss": 2.9556, "step": 170 }, { "epoch": 0.11439561817953757, "grad_norm": 1.4171209335327148, "learning_rate": 1.9042316258351896e-05, "loss": 2.9874, "step": 171 }, { "epoch": 0.11506459840280972, "grad_norm": 1.9125628471374512, "learning_rate": 1.915367483296214e-05, "loss": 3.0581, "step": 172 }, { "epoch": 0.11573357862608187, "grad_norm": 1.7964285612106323, "learning_rate": 1.9265033407572382e-05, "loss": 3.1749, "step": 173 }, { "epoch": 0.11640255884935402, "grad_norm": 1.7148891687393188, "learning_rate": 1.937639198218263e-05, "loss": 2.7974, "step": 174 }, { "epoch": 0.11707153907262617, "grad_norm": 1.8488072156906128, "learning_rate": 1.9487750556792876e-05, "loss": 3.0808, "step": 175 }, { "epoch": 0.11774051929589832, "grad_norm": 1.799553632736206, "learning_rate": 1.959910913140312e-05, "loss": 3.0121, "step": 176 }, { "epoch": 0.11840949951917046, "grad_norm": 1.8497250080108643, "learning_rate": 1.9710467706013363e-05, "loss": 3.0084, "step": 177 }, { "epoch": 0.11907847974244261, "grad_norm": 1.743905782699585, "learning_rate": 1.982182628062361e-05, "loss": 2.9881, "step": 178 }, { "epoch": 0.11974745996571476, "grad_norm": 2.140427827835083, "learning_rate": 1.9933184855233856e-05, "loss": 2.9783, "step": 179 }, { "epoch": 0.12041644018898691, "grad_norm": 1.3036153316497803, "learning_rate": 2.00445434298441e-05, "loss": 2.8561, "step": 180 }, { "epoch": 0.12108542041225906, "grad_norm": 1.707446813583374, "learning_rate": 2.0155902004454343e-05, "loss": 3.0634, "step": 181 }, { "epoch": 0.12175440063553121, "grad_norm": 1.7313611507415771, "learning_rate": 2.026726057906459e-05, "loss": 3.0231, "step": 182 }, { "epoch": 0.12242338085880336, "grad_norm": 1.5043818950653076, "learning_rate": 2.0378619153674833e-05, "loss": 2.7388, "step": 183 }, { "epoch": 0.12309236108207551, "grad_norm": 1.794662356376648, "learning_rate": 2.048997772828508e-05, "loss": 3.0349, "step": 184 }, { "epoch": 0.12376134130534766, "grad_norm": 1.933425784111023, "learning_rate": 2.0601336302895323e-05, "loss": 3.107, "step": 185 }, { "epoch": 0.12443032152861981, "grad_norm": 2.4808244705200195, "learning_rate": 2.071269487750557e-05, "loss": 3.0081, "step": 186 }, { "epoch": 0.12509930175189196, "grad_norm": 1.7459521293640137, "learning_rate": 2.0824053452115813e-05, "loss": 2.9181, "step": 187 }, { "epoch": 0.1257682819751641, "grad_norm": 2.0204484462738037, "learning_rate": 2.093541202672606e-05, "loss": 3.2941, "step": 188 }, { "epoch": 0.12643726219843626, "grad_norm": 1.8456350564956665, "learning_rate": 2.1046770601336304e-05, "loss": 3.0197, "step": 189 }, { "epoch": 0.1271062424217084, "grad_norm": 1.5815969705581665, "learning_rate": 2.1158129175946547e-05, "loss": 2.949, "step": 190 }, { "epoch": 0.12777522264498056, "grad_norm": 2.190483570098877, "learning_rate": 2.1269487750556794e-05, "loss": 3.0315, "step": 191 }, { "epoch": 0.1284442028682527, "grad_norm": 1.8807663917541504, "learning_rate": 2.138084632516704e-05, "loss": 2.8794, "step": 192 }, { "epoch": 0.12911318309152486, "grad_norm": 1.9881584644317627, "learning_rate": 2.1492204899777284e-05, "loss": 2.9104, "step": 193 }, { "epoch": 0.129782163314797, "grad_norm": 2.114427328109741, "learning_rate": 2.1603563474387527e-05, "loss": 2.8785, "step": 194 }, { "epoch": 0.13045114353806916, "grad_norm": 2.0160298347473145, "learning_rate": 2.1714922048997774e-05, "loss": 3.0206, "step": 195 }, { "epoch": 0.1311201237613413, "grad_norm": 1.7591657638549805, "learning_rate": 2.182628062360802e-05, "loss": 3.0766, "step": 196 }, { "epoch": 0.13178910398461346, "grad_norm": 2.251678705215454, "learning_rate": 2.1937639198218264e-05, "loss": 3.0232, "step": 197 }, { "epoch": 0.1324580842078856, "grad_norm": 2.139174699783325, "learning_rate": 2.2048997772828508e-05, "loss": 2.9806, "step": 198 }, { "epoch": 0.13312706443115777, "grad_norm": 2.0383951663970947, "learning_rate": 2.2160356347438754e-05, "loss": 3.3346, "step": 199 }, { "epoch": 0.1337960446544299, "grad_norm": 2.2171547412872314, "learning_rate": 2.2271714922049e-05, "loss": 2.8831, "step": 200 }, { "epoch": 0.13446502487770207, "grad_norm": 2.0654296875, "learning_rate": 2.2383073496659245e-05, "loss": 2.7744, "step": 201 }, { "epoch": 0.1351340051009742, "grad_norm": 1.9895446300506592, "learning_rate": 2.2494432071269488e-05, "loss": 3.0192, "step": 202 }, { "epoch": 0.13580298532424634, "grad_norm": 2.3785881996154785, "learning_rate": 2.2605790645879735e-05, "loss": 3.1008, "step": 203 }, { "epoch": 0.1364719655475185, "grad_norm": 2.2556653022766113, "learning_rate": 2.2717149220489978e-05, "loss": 3.2238, "step": 204 }, { "epoch": 0.13714094577079064, "grad_norm": 2.219691038131714, "learning_rate": 2.2828507795100225e-05, "loss": 2.8803, "step": 205 }, { "epoch": 0.1378099259940628, "grad_norm": 1.679511308670044, "learning_rate": 2.2939866369710468e-05, "loss": 2.7962, "step": 206 }, { "epoch": 0.13847890621733494, "grad_norm": 1.844044804573059, "learning_rate": 2.3051224944320715e-05, "loss": 3.0686, "step": 207 }, { "epoch": 0.1391478864406071, "grad_norm": 1.6532880067825317, "learning_rate": 2.316258351893096e-05, "loss": 3.0698, "step": 208 }, { "epoch": 0.13981686666387924, "grad_norm": 2.2440009117126465, "learning_rate": 2.3273942093541205e-05, "loss": 3.0008, "step": 209 }, { "epoch": 0.1404858468871514, "grad_norm": 1.1852771043777466, "learning_rate": 2.338530066815145e-05, "loss": 2.6482, "step": 210 }, { "epoch": 0.14115482711042354, "grad_norm": 1.9627621173858643, "learning_rate": 2.3496659242761692e-05, "loss": 2.9175, "step": 211 }, { "epoch": 0.1418238073336957, "grad_norm": 1.5972543954849243, "learning_rate": 2.360801781737194e-05, "loss": 2.874, "step": 212 }, { "epoch": 0.14249278755696784, "grad_norm": 2.07195782661438, "learning_rate": 2.3719376391982185e-05, "loss": 2.8547, "step": 213 }, { "epoch": 0.14316176778024, "grad_norm": 2.280430555343628, "learning_rate": 2.383073496659243e-05, "loss": 2.9573, "step": 214 }, { "epoch": 0.14383074800351214, "grad_norm": 1.7816762924194336, "learning_rate": 2.3942093541202672e-05, "loss": 3.0404, "step": 215 }, { "epoch": 0.1444997282267843, "grad_norm": 1.383423089981079, "learning_rate": 2.405345211581292e-05, "loss": 2.8613, "step": 216 }, { "epoch": 0.14516870845005644, "grad_norm": 1.8071893453598022, "learning_rate": 2.4164810690423166e-05, "loss": 3.0848, "step": 217 }, { "epoch": 0.1458376886733286, "grad_norm": 1.5270397663116455, "learning_rate": 2.427616926503341e-05, "loss": 2.8631, "step": 218 }, { "epoch": 0.14650666889660074, "grad_norm": 1.8604637384414673, "learning_rate": 2.4387527839643652e-05, "loss": 2.9685, "step": 219 }, { "epoch": 0.14717564911987288, "grad_norm": 1.9972350597381592, "learning_rate": 2.44988864142539e-05, "loss": 2.7886, "step": 220 }, { "epoch": 0.14784462934314505, "grad_norm": 1.5495383739471436, "learning_rate": 2.4610244988864146e-05, "loss": 2.8556, "step": 221 }, { "epoch": 0.14851360956641718, "grad_norm": 1.7119460105895996, "learning_rate": 2.472160356347439e-05, "loss": 2.971, "step": 222 }, { "epoch": 0.14918258978968935, "grad_norm": 1.7634843587875366, "learning_rate": 2.4832962138084633e-05, "loss": 2.791, "step": 223 }, { "epoch": 0.14985157001296148, "grad_norm": 1.7723678350448608, "learning_rate": 2.494432071269488e-05, "loss": 2.8888, "step": 224 }, { "epoch": 0.15052055023623365, "grad_norm": 2.0935842990875244, "learning_rate": 2.5055679287305123e-05, "loss": 3.0744, "step": 225 }, { "epoch": 0.15118953045950578, "grad_norm": 2.2163760662078857, "learning_rate": 2.516703786191537e-05, "loss": 3.0321, "step": 226 }, { "epoch": 0.15185851068277795, "grad_norm": 2.2750024795532227, "learning_rate": 2.5278396436525613e-05, "loss": 3.055, "step": 227 }, { "epoch": 0.15252749090605008, "grad_norm": 2.189126968383789, "learning_rate": 2.5389755011135856e-05, "loss": 3.0768, "step": 228 }, { "epoch": 0.15319647112932225, "grad_norm": 2.0731709003448486, "learning_rate": 2.5501113585746107e-05, "loss": 2.8946, "step": 229 }, { "epoch": 0.15386545135259438, "grad_norm": 2.6770660877227783, "learning_rate": 2.561247216035635e-05, "loss": 3.2448, "step": 230 }, { "epoch": 0.15453443157586655, "grad_norm": 2.252547264099121, "learning_rate": 2.5723830734966593e-05, "loss": 3.1992, "step": 231 }, { "epoch": 0.15520341179913869, "grad_norm": 2.1851205825805664, "learning_rate": 2.583518930957684e-05, "loss": 2.9443, "step": 232 }, { "epoch": 0.15587239202241085, "grad_norm": 1.8990890979766846, "learning_rate": 2.5946547884187083e-05, "loss": 2.7778, "step": 233 }, { "epoch": 0.15654137224568299, "grad_norm": 1.4797788858413696, "learning_rate": 2.6057906458797327e-05, "loss": 2.9676, "step": 234 }, { "epoch": 0.15721035246895515, "grad_norm": 2.029318332672119, "learning_rate": 2.6169265033407574e-05, "loss": 3.1086, "step": 235 }, { "epoch": 0.1578793326922273, "grad_norm": 2.179896354675293, "learning_rate": 2.6280623608017817e-05, "loss": 2.9045, "step": 236 }, { "epoch": 0.15854831291549942, "grad_norm": 1.391133189201355, "learning_rate": 2.639198218262806e-05, "loss": 2.694, "step": 237 }, { "epoch": 0.1592172931387716, "grad_norm": 2.4943625926971436, "learning_rate": 2.650334075723831e-05, "loss": 3.1102, "step": 238 }, { "epoch": 0.15988627336204372, "grad_norm": 2.750905752182007, "learning_rate": 2.6614699331848554e-05, "loss": 3.0298, "step": 239 }, { "epoch": 0.1605552535853159, "grad_norm": 1.7482820749282837, "learning_rate": 2.67260579064588e-05, "loss": 2.83, "step": 240 }, { "epoch": 0.16122423380858802, "grad_norm": 2.7642734050750732, "learning_rate": 2.6837416481069044e-05, "loss": 3.1332, "step": 241 }, { "epoch": 0.1618932140318602, "grad_norm": 2.404057025909424, "learning_rate": 2.6948775055679287e-05, "loss": 3.1092, "step": 242 }, { "epoch": 0.16256219425513233, "grad_norm": 1.8724385499954224, "learning_rate": 2.7060133630289534e-05, "loss": 3.0315, "step": 243 }, { "epoch": 0.1632311744784045, "grad_norm": 1.7511506080627441, "learning_rate": 2.7171492204899778e-05, "loss": 2.7904, "step": 244 }, { "epoch": 0.16390015470167663, "grad_norm": 1.8246021270751953, "learning_rate": 2.728285077951002e-05, "loss": 3.0182, "step": 245 }, { "epoch": 0.1645691349249488, "grad_norm": 1.8768839836120605, "learning_rate": 2.739420935412027e-05, "loss": 2.9269, "step": 246 }, { "epoch": 0.16523811514822093, "grad_norm": 2.0913779735565186, "learning_rate": 2.7505567928730515e-05, "loss": 2.8438, "step": 247 }, { "epoch": 0.1659070953714931, "grad_norm": 2.4028401374816895, "learning_rate": 2.7616926503340758e-05, "loss": 2.8489, "step": 248 }, { "epoch": 0.16657607559476523, "grad_norm": 1.8068379163742065, "learning_rate": 2.7728285077951005e-05, "loss": 2.9883, "step": 249 }, { "epoch": 0.1672450558180374, "grad_norm": 2.4506468772888184, "learning_rate": 2.7839643652561248e-05, "loss": 2.8825, "step": 250 }, { "epoch": 0.16791403604130953, "grad_norm": 1.8266322612762451, "learning_rate": 2.795100222717149e-05, "loss": 3.1515, "step": 251 }, { "epoch": 0.1685830162645817, "grad_norm": 3.492474317550659, "learning_rate": 2.8062360801781738e-05, "loss": 3.055, "step": 252 }, { "epoch": 0.16925199648785383, "grad_norm": 2.0248913764953613, "learning_rate": 2.817371937639198e-05, "loss": 2.8643, "step": 253 }, { "epoch": 0.16992097671112597, "grad_norm": 2.751221179962158, "learning_rate": 2.8285077951002232e-05, "loss": 2.9571, "step": 254 }, { "epoch": 0.17058995693439813, "grad_norm": 1.8173011541366577, "learning_rate": 2.8396436525612475e-05, "loss": 2.98, "step": 255 }, { "epoch": 0.17125893715767027, "grad_norm": 2.3821542263031006, "learning_rate": 2.850779510022272e-05, "loss": 3.0517, "step": 256 }, { "epoch": 0.17192791738094243, "grad_norm": 2.088780403137207, "learning_rate": 2.8619153674832965e-05, "loss": 2.9371, "step": 257 }, { "epoch": 0.17259689760421457, "grad_norm": 1.652597188949585, "learning_rate": 2.873051224944321e-05, "loss": 3.0313, "step": 258 }, { "epoch": 0.17326587782748673, "grad_norm": 1.7598010301589966, "learning_rate": 2.8841870824053452e-05, "loss": 2.8893, "step": 259 }, { "epoch": 0.17393485805075887, "grad_norm": 1.3597828149795532, "learning_rate": 2.89532293986637e-05, "loss": 2.6477, "step": 260 }, { "epoch": 0.17460383827403103, "grad_norm": 1.9741261005401611, "learning_rate": 2.9064587973273942e-05, "loss": 3.0815, "step": 261 }, { "epoch": 0.17527281849730317, "grad_norm": 2.220665693283081, "learning_rate": 2.9175946547884186e-05, "loss": 3.0102, "step": 262 }, { "epoch": 0.17594179872057533, "grad_norm": 1.7396165132522583, "learning_rate": 2.9287305122494436e-05, "loss": 3.0626, "step": 263 }, { "epoch": 0.17661077894384747, "grad_norm": 1.7730218172073364, "learning_rate": 2.939866369710468e-05, "loss": 2.9754, "step": 264 }, { "epoch": 0.17727975916711963, "grad_norm": 2.6833832263946533, "learning_rate": 2.9510022271714922e-05, "loss": 3.2264, "step": 265 }, { "epoch": 0.17794873939039177, "grad_norm": 1.9124982357025146, "learning_rate": 2.962138084632517e-05, "loss": 2.9205, "step": 266 }, { "epoch": 0.17861771961366393, "grad_norm": 2.111903190612793, "learning_rate": 2.9732739420935413e-05, "loss": 2.981, "step": 267 }, { "epoch": 0.17928669983693607, "grad_norm": 2.5708305835723877, "learning_rate": 2.9844097995545663e-05, "loss": 3.0503, "step": 268 }, { "epoch": 0.1799556800602082, "grad_norm": 2.538877010345459, "learning_rate": 2.9955456570155903e-05, "loss": 3.1285, "step": 269 }, { "epoch": 0.18062466028348037, "grad_norm": 2.1339826583862305, "learning_rate": 3.0066815144766146e-05, "loss": 2.9144, "step": 270 }, { "epoch": 0.1812936405067525, "grad_norm": 1.9419770240783691, "learning_rate": 3.0178173719376396e-05, "loss": 2.8248, "step": 271 }, { "epoch": 0.18196262073002467, "grad_norm": 1.6631495952606201, "learning_rate": 3.028953229398664e-05, "loss": 2.8192, "step": 272 }, { "epoch": 0.1826316009532968, "grad_norm": 2.0016918182373047, "learning_rate": 3.0400890868596883e-05, "loss": 2.7806, "step": 273 }, { "epoch": 0.18330058117656897, "grad_norm": 2.2896358966827393, "learning_rate": 3.051224944320713e-05, "loss": 3.047, "step": 274 }, { "epoch": 0.1839695613998411, "grad_norm": 2.4204511642456055, "learning_rate": 3.0623608017817377e-05, "loss": 3.0408, "step": 275 }, { "epoch": 0.18463854162311327, "grad_norm": 2.3816628456115723, "learning_rate": 3.073496659242761e-05, "loss": 2.9486, "step": 276 }, { "epoch": 0.1853075218463854, "grad_norm": 1.8626177310943604, "learning_rate": 3.084632516703786e-05, "loss": 3.0183, "step": 277 }, { "epoch": 0.18597650206965757, "grad_norm": 2.2854461669921875, "learning_rate": 3.095768374164811e-05, "loss": 3.0818, "step": 278 }, { "epoch": 0.1866454822929297, "grad_norm": 2.1244637966156006, "learning_rate": 3.106904231625835e-05, "loss": 3.0196, "step": 279 }, { "epoch": 0.18731446251620187, "grad_norm": 1.7232093811035156, "learning_rate": 3.11804008908686e-05, "loss": 2.8402, "step": 280 }, { "epoch": 0.187983442739474, "grad_norm": 1.849948763847351, "learning_rate": 3.1291759465478844e-05, "loss": 2.9892, "step": 281 }, { "epoch": 0.18865242296274617, "grad_norm": 1.760595440864563, "learning_rate": 3.140311804008909e-05, "loss": 2.6975, "step": 282 }, { "epoch": 0.1893214031860183, "grad_norm": 2.229926824569702, "learning_rate": 3.151447661469934e-05, "loss": 3.0247, "step": 283 }, { "epoch": 0.18999038340929048, "grad_norm": 2.4357099533081055, "learning_rate": 3.162583518930958e-05, "loss": 2.8133, "step": 284 }, { "epoch": 0.1906593636325626, "grad_norm": 1.6331199407577515, "learning_rate": 3.1737193763919824e-05, "loss": 2.8615, "step": 285 }, { "epoch": 0.19132834385583475, "grad_norm": 2.13657546043396, "learning_rate": 3.184855233853007e-05, "loss": 3.0742, "step": 286 }, { "epoch": 0.1919973240791069, "grad_norm": 2.2347702980041504, "learning_rate": 3.195991091314031e-05, "loss": 2.9647, "step": 287 }, { "epoch": 0.19266630430237905, "grad_norm": 2.1436219215393066, "learning_rate": 3.207126948775056e-05, "loss": 2.9529, "step": 288 }, { "epoch": 0.1933352845256512, "grad_norm": 1.7989563941955566, "learning_rate": 3.2182628062360804e-05, "loss": 2.9052, "step": 289 }, { "epoch": 0.19400426474892335, "grad_norm": 2.0161685943603516, "learning_rate": 3.229398663697105e-05, "loss": 2.9663, "step": 290 }, { "epoch": 0.1946732449721955, "grad_norm": 1.6002366542816162, "learning_rate": 3.24053452115813e-05, "loss": 2.929, "step": 291 }, { "epoch": 0.19534222519546765, "grad_norm": 1.668652057647705, "learning_rate": 3.251670378619154e-05, "loss": 2.8479, "step": 292 }, { "epoch": 0.19601120541873981, "grad_norm": 1.8546702861785889, "learning_rate": 3.262806236080178e-05, "loss": 2.9774, "step": 293 }, { "epoch": 0.19668018564201195, "grad_norm": 2.1030890941619873, "learning_rate": 3.273942093541203e-05, "loss": 2.8901, "step": 294 }, { "epoch": 0.19734916586528412, "grad_norm": 1.4736385345458984, "learning_rate": 3.285077951002227e-05, "loss": 2.963, "step": 295 }, { "epoch": 0.19801814608855625, "grad_norm": 1.8257207870483398, "learning_rate": 3.2962138084632515e-05, "loss": 2.98, "step": 296 }, { "epoch": 0.19868712631182842, "grad_norm": 2.160999298095703, "learning_rate": 3.3073496659242765e-05, "loss": 2.7556, "step": 297 }, { "epoch": 0.19935610653510055, "grad_norm": 2.267620801925659, "learning_rate": 3.318485523385301e-05, "loss": 2.798, "step": 298 }, { "epoch": 0.20002508675837272, "grad_norm": 2.425196409225464, "learning_rate": 3.329621380846326e-05, "loss": 3.0942, "step": 299 }, { "epoch": 0.20069406698164485, "grad_norm": 2.4903581142425537, "learning_rate": 3.34075723830735e-05, "loss": 2.9861, "step": 300 }, { "epoch": 0.20136304720491702, "grad_norm": 1.8119566440582275, "learning_rate": 3.3518930957683745e-05, "loss": 2.8985, "step": 301 }, { "epoch": 0.20203202742818915, "grad_norm": 3.619180202484131, "learning_rate": 3.363028953229399e-05, "loss": 3.0096, "step": 302 }, { "epoch": 0.2027010076514613, "grad_norm": 2.1625945568084717, "learning_rate": 3.374164810690423e-05, "loss": 2.871, "step": 303 }, { "epoch": 0.20336998787473345, "grad_norm": 2.3259880542755127, "learning_rate": 3.3853006681514475e-05, "loss": 3.1425, "step": 304 }, { "epoch": 0.2040389680980056, "grad_norm": 2.2823097705841064, "learning_rate": 3.3964365256124725e-05, "loss": 2.9197, "step": 305 }, { "epoch": 0.20470794832127776, "grad_norm": 2.460858106613159, "learning_rate": 3.407572383073497e-05, "loss": 2.9594, "step": 306 }, { "epoch": 0.2053769285445499, "grad_norm": 2.1677207946777344, "learning_rate": 3.418708240534521e-05, "loss": 3.0082, "step": 307 }, { "epoch": 0.20604590876782206, "grad_norm": 2.5266077518463135, "learning_rate": 3.429844097995546e-05, "loss": 3.2755, "step": 308 }, { "epoch": 0.2067148889910942, "grad_norm": 2.598078727722168, "learning_rate": 3.4409799554565706e-05, "loss": 3.0439, "step": 309 }, { "epoch": 0.20738386921436636, "grad_norm": 2.6901001930236816, "learning_rate": 3.452115812917594e-05, "loss": 3.2016, "step": 310 }, { "epoch": 0.2080528494376385, "grad_norm": 2.208012342453003, "learning_rate": 3.463251670378619e-05, "loss": 3.0823, "step": 311 }, { "epoch": 0.20872182966091066, "grad_norm": 2.5381200313568115, "learning_rate": 3.4743875278396436e-05, "loss": 2.8945, "step": 312 }, { "epoch": 0.2093908098841828, "grad_norm": 2.7676663398742676, "learning_rate": 3.4855233853006686e-05, "loss": 3.2512, "step": 313 }, { "epoch": 0.21005979010745496, "grad_norm": 1.880428433418274, "learning_rate": 3.496659242761693e-05, "loss": 2.8545, "step": 314 }, { "epoch": 0.2107287703307271, "grad_norm": 1.7739355564117432, "learning_rate": 3.507795100222717e-05, "loss": 2.8965, "step": 315 }, { "epoch": 0.21139775055399926, "grad_norm": 2.162645101547241, "learning_rate": 3.518930957683742e-05, "loss": 3.005, "step": 316 }, { "epoch": 0.2120667307772714, "grad_norm": 1.821035385131836, "learning_rate": 3.5300668151447666e-05, "loss": 3.0064, "step": 317 }, { "epoch": 0.21273571100054356, "grad_norm": 3.1734619140625, "learning_rate": 3.541202672605791e-05, "loss": 3.1068, "step": 318 }, { "epoch": 0.2134046912238157, "grad_norm": 3.1917405128479004, "learning_rate": 3.552338530066815e-05, "loss": 3.0866, "step": 319 }, { "epoch": 0.21407367144708783, "grad_norm": 2.3374900817871094, "learning_rate": 3.5634743875278396e-05, "loss": 2.9519, "step": 320 }, { "epoch": 0.21474265167036, "grad_norm": 2.6081695556640625, "learning_rate": 3.574610244988864e-05, "loss": 3.0563, "step": 321 }, { "epoch": 0.21541163189363213, "grad_norm": 2.195274591445923, "learning_rate": 3.585746102449889e-05, "loss": 2.9423, "step": 322 }, { "epoch": 0.2160806121169043, "grad_norm": 3.0954089164733887, "learning_rate": 3.596881959910913e-05, "loss": 3.0645, "step": 323 }, { "epoch": 0.21674959234017643, "grad_norm": 2.7243683338165283, "learning_rate": 3.608017817371938e-05, "loss": 3.0141, "step": 324 }, { "epoch": 0.2174185725634486, "grad_norm": 2.720536947250366, "learning_rate": 3.619153674832963e-05, "loss": 3.0356, "step": 325 }, { "epoch": 0.21808755278672073, "grad_norm": 2.3073344230651855, "learning_rate": 3.630289532293987e-05, "loss": 2.8578, "step": 326 }, { "epoch": 0.2187565330099929, "grad_norm": 3.312396287918091, "learning_rate": 3.6414253897550114e-05, "loss": 3.1004, "step": 327 }, { "epoch": 0.21942551323326503, "grad_norm": 2.946176528930664, "learning_rate": 3.652561247216036e-05, "loss": 2.9902, "step": 328 }, { "epoch": 0.2200944934565372, "grad_norm": 2.6662216186523438, "learning_rate": 3.66369710467706e-05, "loss": 2.9431, "step": 329 }, { "epoch": 0.22076347367980934, "grad_norm": 2.286525249481201, "learning_rate": 3.674832962138085e-05, "loss": 2.9168, "step": 330 }, { "epoch": 0.2214324539030815, "grad_norm": 2.14633846282959, "learning_rate": 3.6859688195991094e-05, "loss": 2.9081, "step": 331 }, { "epoch": 0.22210143412635364, "grad_norm": 2.405035972595215, "learning_rate": 3.697104677060134e-05, "loss": 3.1149, "step": 332 }, { "epoch": 0.2227704143496258, "grad_norm": 2.870598554611206, "learning_rate": 3.708240534521159e-05, "loss": 2.9908, "step": 333 }, { "epoch": 0.22343939457289794, "grad_norm": 3.6809451580047607, "learning_rate": 3.719376391982183e-05, "loss": 2.8389, "step": 334 }, { "epoch": 0.2241083747961701, "grad_norm": 4.133260726928711, "learning_rate": 3.7305122494432074e-05, "loss": 2.9818, "step": 335 }, { "epoch": 0.22477735501944224, "grad_norm": 2.9487850666046143, "learning_rate": 3.741648106904232e-05, "loss": 3.086, "step": 336 }, { "epoch": 0.22544633524271437, "grad_norm": 2.474928617477417, "learning_rate": 3.752783964365256e-05, "loss": 2.9965, "step": 337 }, { "epoch": 0.22611531546598654, "grad_norm": 2.069495439529419, "learning_rate": 3.7639198218262804e-05, "loss": 2.8583, "step": 338 }, { "epoch": 0.22678429568925867, "grad_norm": 2.0604686737060547, "learning_rate": 3.7750556792873054e-05, "loss": 2.7425, "step": 339 }, { "epoch": 0.22745327591253084, "grad_norm": 2.515392541885376, "learning_rate": 3.78619153674833e-05, "loss": 3.0675, "step": 340 }, { "epoch": 0.22812225613580298, "grad_norm": 2.4841203689575195, "learning_rate": 3.797327394209355e-05, "loss": 3.0104, "step": 341 }, { "epoch": 0.22879123635907514, "grad_norm": 3.003702163696289, "learning_rate": 3.808463251670379e-05, "loss": 3.0008, "step": 342 }, { "epoch": 0.22946021658234728, "grad_norm": 2.7392759323120117, "learning_rate": 3.8195991091314035e-05, "loss": 2.8839, "step": 343 }, { "epoch": 0.23012919680561944, "grad_norm": 2.9119346141815186, "learning_rate": 3.830734966592428e-05, "loss": 2.6593, "step": 344 }, { "epoch": 0.23079817702889158, "grad_norm": 3.402265787124634, "learning_rate": 3.841870824053452e-05, "loss": 2.978, "step": 345 }, { "epoch": 0.23146715725216374, "grad_norm": 2.0154426097869873, "learning_rate": 3.8530066815144765e-05, "loss": 2.8954, "step": 346 }, { "epoch": 0.23213613747543588, "grad_norm": 2.149036169052124, "learning_rate": 3.8641425389755015e-05, "loss": 2.8539, "step": 347 }, { "epoch": 0.23280511769870804, "grad_norm": 2.9926230907440186, "learning_rate": 3.875278396436526e-05, "loss": 3.0115, "step": 348 }, { "epoch": 0.23347409792198018, "grad_norm": 2.1109957695007324, "learning_rate": 3.88641425389755e-05, "loss": 2.7915, "step": 349 }, { "epoch": 0.23414307814525234, "grad_norm": 1.8408353328704834, "learning_rate": 3.897550111358575e-05, "loss": 2.6129, "step": 350 }, { "epoch": 0.23481205836852448, "grad_norm": 4.683795928955078, "learning_rate": 3.9086859688195995e-05, "loss": 2.9513, "step": 351 }, { "epoch": 0.23548103859179664, "grad_norm": 2.911135673522949, "learning_rate": 3.919821826280624e-05, "loss": 2.9199, "step": 352 }, { "epoch": 0.23615001881506878, "grad_norm": 3.0055723190307617, "learning_rate": 3.930957683741648e-05, "loss": 2.7776, "step": 353 }, { "epoch": 0.23681899903834092, "grad_norm": 2.214090347290039, "learning_rate": 3.9420935412026726e-05, "loss": 2.8616, "step": 354 }, { "epoch": 0.23748797926161308, "grad_norm": 4.9794135093688965, "learning_rate": 3.9532293986636976e-05, "loss": 2.9461, "step": 355 }, { "epoch": 0.23815695948488522, "grad_norm": 2.91196870803833, "learning_rate": 3.964365256124722e-05, "loss": 3.1177, "step": 356 }, { "epoch": 0.23882593970815738, "grad_norm": 3.165623664855957, "learning_rate": 3.975501113585746e-05, "loss": 2.8003, "step": 357 }, { "epoch": 0.23949491993142952, "grad_norm": 2.178732395172119, "learning_rate": 3.986636971046771e-05, "loss": 3.1017, "step": 358 }, { "epoch": 0.24016390015470168, "grad_norm": 2.2613189220428467, "learning_rate": 3.9977728285077956e-05, "loss": 2.9624, "step": 359 }, { "epoch": 0.24083288037797382, "grad_norm": 3.2467665672302246, "learning_rate": 4.00890868596882e-05, "loss": 2.7477, "step": 360 }, { "epoch": 0.24150186060124598, "grad_norm": 4.1236467361450195, "learning_rate": 4.020044543429844e-05, "loss": 2.8327, "step": 361 }, { "epoch": 0.24217084082451812, "grad_norm": 6.381893157958984, "learning_rate": 4.0311804008908686e-05, "loss": 2.9811, "step": 362 }, { "epoch": 0.24283982104779028, "grad_norm": 4.767080783843994, "learning_rate": 4.042316258351893e-05, "loss": 2.7984, "step": 363 }, { "epoch": 0.24350880127106242, "grad_norm": 3.955322504043579, "learning_rate": 4.053452115812918e-05, "loss": 3.0451, "step": 364 }, { "epoch": 0.24417778149433458, "grad_norm": 2.52062726020813, "learning_rate": 4.064587973273942e-05, "loss": 3.0299, "step": 365 }, { "epoch": 0.24484676171760672, "grad_norm": 2.5843591690063477, "learning_rate": 4.0757238307349666e-05, "loss": 2.9065, "step": 366 }, { "epoch": 0.24551574194087888, "grad_norm": 2.6871824264526367, "learning_rate": 4.0868596881959917e-05, "loss": 2.9752, "step": 367 }, { "epoch": 0.24618472216415102, "grad_norm": 2.8572964668273926, "learning_rate": 4.097995545657016e-05, "loss": 3.0047, "step": 368 }, { "epoch": 0.24685370238742319, "grad_norm": 3.1573598384857178, "learning_rate": 4.10913140311804e-05, "loss": 2.8892, "step": 369 }, { "epoch": 0.24752268261069532, "grad_norm": 4.083068370819092, "learning_rate": 4.120267260579065e-05, "loss": 2.5709, "step": 370 }, { "epoch": 0.24819166283396746, "grad_norm": 4.6753387451171875, "learning_rate": 4.131403118040089e-05, "loss": 3.1801, "step": 371 }, { "epoch": 0.24886064305723962, "grad_norm": 3.2260594367980957, "learning_rate": 4.142538975501114e-05, "loss": 3.1373, "step": 372 }, { "epoch": 0.24952962328051176, "grad_norm": 3.0378241539001465, "learning_rate": 4.1536748329621384e-05, "loss": 3.0976, "step": 373 }, { "epoch": 0.2501986035037839, "grad_norm": 3.077693462371826, "learning_rate": 4.164810690423163e-05, "loss": 3.0883, "step": 374 }, { "epoch": 0.2508675837270561, "grad_norm": 4.166459083557129, "learning_rate": 4.175946547884188e-05, "loss": 3.178, "step": 375 }, { "epoch": 0.2515365639503282, "grad_norm": 3.4417877197265625, "learning_rate": 4.187082405345212e-05, "loss": 3.028, "step": 376 }, { "epoch": 0.25220554417360036, "grad_norm": 3.2813799381256104, "learning_rate": 4.1982182628062364e-05, "loss": 3.0955, "step": 377 }, { "epoch": 0.2528745243968725, "grad_norm": 3.2578415870666504, "learning_rate": 4.209354120267261e-05, "loss": 3.0973, "step": 378 }, { "epoch": 0.2535435046201447, "grad_norm": 7.314671516418457, "learning_rate": 4.220489977728285e-05, "loss": 3.1364, "step": 379 }, { "epoch": 0.2542124848434168, "grad_norm": 4.096634387969971, "learning_rate": 4.2316258351893094e-05, "loss": 2.9937, "step": 380 }, { "epoch": 0.25488146506668896, "grad_norm": 8.772887229919434, "learning_rate": 4.2427616926503344e-05, "loss": 2.9784, "step": 381 }, { "epoch": 0.2555504452899611, "grad_norm": 3.3955204486846924, "learning_rate": 4.253897550111359e-05, "loss": 2.9533, "step": 382 }, { "epoch": 0.2562194255132333, "grad_norm": 2.985086679458618, "learning_rate": 4.265033407572383e-05, "loss": 3.0271, "step": 383 }, { "epoch": 0.2568884057365054, "grad_norm": 3.7264626026153564, "learning_rate": 4.276169265033408e-05, "loss": 3.0873, "step": 384 }, { "epoch": 0.25755738595977756, "grad_norm": 3.565138101577759, "learning_rate": 4.2873051224944324e-05, "loss": 2.9025, "step": 385 }, { "epoch": 0.2582263661830497, "grad_norm": 2.661137342453003, "learning_rate": 4.298440979955457e-05, "loss": 3.1241, "step": 386 }, { "epoch": 0.25889534640632184, "grad_norm": 2.084932804107666, "learning_rate": 4.309576837416481e-05, "loss": 2.911, "step": 387 }, { "epoch": 0.259564326629594, "grad_norm": 2.9260430335998535, "learning_rate": 4.3207126948775055e-05, "loss": 3.0158, "step": 388 }, { "epoch": 0.26023330685286616, "grad_norm": 2.6525022983551025, "learning_rate": 4.3318485523385305e-05, "loss": 2.8838, "step": 389 }, { "epoch": 0.26090228707613833, "grad_norm": 2.613525629043579, "learning_rate": 4.342984409799555e-05, "loss": 2.9201, "step": 390 }, { "epoch": 0.26157126729941044, "grad_norm": 3.924321413040161, "learning_rate": 4.354120267260579e-05, "loss": 2.9793, "step": 391 }, { "epoch": 0.2622402475226826, "grad_norm": 5.627641201019287, "learning_rate": 4.365256124721604e-05, "loss": 3.081, "step": 392 }, { "epoch": 0.26290922774595477, "grad_norm": 3.294105052947998, "learning_rate": 4.3763919821826285e-05, "loss": 2.8867, "step": 393 }, { "epoch": 0.26357820796922693, "grad_norm": 2.795708179473877, "learning_rate": 4.387527839643653e-05, "loss": 2.8974, "step": 394 }, { "epoch": 0.26424718819249904, "grad_norm": 5.553576946258545, "learning_rate": 4.398663697104677e-05, "loss": 3.1988, "step": 395 }, { "epoch": 0.2649161684157712, "grad_norm": 3.2928688526153564, "learning_rate": 4.4097995545657015e-05, "loss": 2.8335, "step": 396 }, { "epoch": 0.26558514863904337, "grad_norm": 5.719513893127441, "learning_rate": 4.420935412026726e-05, "loss": 3.0254, "step": 397 }, { "epoch": 0.26625412886231553, "grad_norm": 3.231088161468506, "learning_rate": 4.432071269487751e-05, "loss": 3.1652, "step": 398 }, { "epoch": 0.26692310908558764, "grad_norm": 4.428014278411865, "learning_rate": 4.443207126948775e-05, "loss": 3.1403, "step": 399 }, { "epoch": 0.2675920893088598, "grad_norm": 2.7273247241973877, "learning_rate": 4.4543429844098e-05, "loss": 2.9933, "step": 400 }, { "epoch": 0.26826106953213197, "grad_norm": 3.1555709838867188, "learning_rate": 4.4654788418708246e-05, "loss": 2.7692, "step": 401 }, { "epoch": 0.26893004975540413, "grad_norm": 4.41472053527832, "learning_rate": 4.476614699331849e-05, "loss": 3.1536, "step": 402 }, { "epoch": 0.26959902997867624, "grad_norm": 2.652088165283203, "learning_rate": 4.487750556792873e-05, "loss": 2.7263, "step": 403 }, { "epoch": 0.2702680102019484, "grad_norm": 2.2178761959075928, "learning_rate": 4.4988864142538976e-05, "loss": 2.7703, "step": 404 }, { "epoch": 0.27093699042522057, "grad_norm": 3.2548203468322754, "learning_rate": 4.510022271714922e-05, "loss": 3.0788, "step": 405 }, { "epoch": 0.2716059706484927, "grad_norm": 3.69232177734375, "learning_rate": 4.521158129175947e-05, "loss": 3.1247, "step": 406 }, { "epoch": 0.27227495087176484, "grad_norm": 2.2741658687591553, "learning_rate": 4.532293986636971e-05, "loss": 2.9171, "step": 407 }, { "epoch": 0.272943931095037, "grad_norm": 3.9409332275390625, "learning_rate": 4.5434298440979956e-05, "loss": 3.1141, "step": 408 }, { "epoch": 0.27361291131830917, "grad_norm": 4.8423004150390625, "learning_rate": 4.5545657015590206e-05, "loss": 3.0009, "step": 409 }, { "epoch": 0.2742818915415813, "grad_norm": 4.420136451721191, "learning_rate": 4.565701559020045e-05, "loss": 3.2071, "step": 410 }, { "epoch": 0.27495087176485344, "grad_norm": 2.833836317062378, "learning_rate": 4.576837416481069e-05, "loss": 2.8665, "step": 411 }, { "epoch": 0.2756198519881256, "grad_norm": 4.68223237991333, "learning_rate": 4.5879732739420936e-05, "loss": 3.0481, "step": 412 }, { "epoch": 0.2762888322113978, "grad_norm": 1.9087761640548706, "learning_rate": 4.599109131403118e-05, "loss": 2.8015, "step": 413 }, { "epoch": 0.2769578124346699, "grad_norm": 5.1107177734375, "learning_rate": 4.610244988864143e-05, "loss": 2.9523, "step": 414 }, { "epoch": 0.27762679265794205, "grad_norm": 5.10582160949707, "learning_rate": 4.621380846325167e-05, "loss": 2.6587, "step": 415 }, { "epoch": 0.2782957728812142, "grad_norm": 5.031997203826904, "learning_rate": 4.632516703786192e-05, "loss": 2.7179, "step": 416 }, { "epoch": 0.2789647531044864, "grad_norm": 3.6552696228027344, "learning_rate": 4.643652561247217e-05, "loss": 2.8385, "step": 417 }, { "epoch": 0.2796337333277585, "grad_norm": 4.179525852203369, "learning_rate": 4.654788418708241e-05, "loss": 3.0226, "step": 418 }, { "epoch": 0.28030271355103065, "grad_norm": 5.424846649169922, "learning_rate": 4.6659242761692654e-05, "loss": 3.0232, "step": 419 }, { "epoch": 0.2809716937743028, "grad_norm": 3.7056894302368164, "learning_rate": 4.67706013363029e-05, "loss": 2.9397, "step": 420 }, { "epoch": 0.2816406739975749, "grad_norm": 3.52085542678833, "learning_rate": 4.688195991091314e-05, "loss": 3.1203, "step": 421 }, { "epoch": 0.2823096542208471, "grad_norm": 4.416035175323486, "learning_rate": 4.6993318485523384e-05, "loss": 3.1685, "step": 422 }, { "epoch": 0.28297863444411925, "grad_norm": 4.974792957305908, "learning_rate": 4.7104677060133634e-05, "loss": 2.9907, "step": 423 }, { "epoch": 0.2836476146673914, "grad_norm": 2.7833263874053955, "learning_rate": 4.721603563474388e-05, "loss": 2.9569, "step": 424 }, { "epoch": 0.2843165948906635, "grad_norm": 4.347465991973877, "learning_rate": 4.732739420935412e-05, "loss": 2.9085, "step": 425 }, { "epoch": 0.2849855751139357, "grad_norm": 4.685590744018555, "learning_rate": 4.743875278396437e-05, "loss": 2.945, "step": 426 }, { "epoch": 0.28565455533720785, "grad_norm": 2.592012643814087, "learning_rate": 4.7550111358574614e-05, "loss": 3.0512, "step": 427 }, { "epoch": 0.28632353556048, "grad_norm": 2.9887845516204834, "learning_rate": 4.766146993318486e-05, "loss": 3.0683, "step": 428 }, { "epoch": 0.2869925157837521, "grad_norm": 2.813981294631958, "learning_rate": 4.77728285077951e-05, "loss": 2.9379, "step": 429 }, { "epoch": 0.2876614960070243, "grad_norm": 4.186840057373047, "learning_rate": 4.7884187082405344e-05, "loss": 2.8466, "step": 430 }, { "epoch": 0.28833047623029645, "grad_norm": 2.8538403511047363, "learning_rate": 4.7995545657015594e-05, "loss": 2.9927, "step": 431 }, { "epoch": 0.2889994564535686, "grad_norm": 2.1021909713745117, "learning_rate": 4.810690423162584e-05, "loss": 2.7849, "step": 432 }, { "epoch": 0.2896684366768407, "grad_norm": 1.9388480186462402, "learning_rate": 4.821826280623608e-05, "loss": 3.2563, "step": 433 }, { "epoch": 0.2903374169001129, "grad_norm": 4.667293548583984, "learning_rate": 4.832962138084633e-05, "loss": 2.9185, "step": 434 }, { "epoch": 0.29100639712338505, "grad_norm": 3.824524164199829, "learning_rate": 4.8440979955456575e-05, "loss": 2.9848, "step": 435 }, { "epoch": 0.2916753773466572, "grad_norm": 2.4440345764160156, "learning_rate": 4.855233853006682e-05, "loss": 2.8454, "step": 436 }, { "epoch": 0.2923443575699293, "grad_norm": 7.678300380706787, "learning_rate": 4.866369710467706e-05, "loss": 2.9164, "step": 437 }, { "epoch": 0.2930133377932015, "grad_norm": 3.7858011722564697, "learning_rate": 4.8775055679287305e-05, "loss": 2.9595, "step": 438 }, { "epoch": 0.29368231801647365, "grad_norm": 4.288517951965332, "learning_rate": 4.888641425389755e-05, "loss": 2.8055, "step": 439 }, { "epoch": 0.29435129823974576, "grad_norm": 5.512247085571289, "learning_rate": 4.89977728285078e-05, "loss": 3.0377, "step": 440 }, { "epoch": 0.2950202784630179, "grad_norm": 4.056970596313477, "learning_rate": 4.910913140311804e-05, "loss": 3.2016, "step": 441 }, { "epoch": 0.2956892586862901, "grad_norm": 3.7022035121917725, "learning_rate": 4.922048997772829e-05, "loss": 3.1512, "step": 442 }, { "epoch": 0.29635823890956225, "grad_norm": 2.9726853370666504, "learning_rate": 4.9331848552338535e-05, "loss": 3.1247, "step": 443 }, { "epoch": 0.29702721913283436, "grad_norm": 2.665365219116211, "learning_rate": 4.944320712694878e-05, "loss": 2.871, "step": 444 }, { "epoch": 0.29769619935610653, "grad_norm": 2.9526169300079346, "learning_rate": 4.955456570155902e-05, "loss": 2.8726, "step": 445 }, { "epoch": 0.2983651795793787, "grad_norm": 3.251380681991577, "learning_rate": 4.9665924276169265e-05, "loss": 3.0167, "step": 446 }, { "epoch": 0.29903415980265086, "grad_norm": 3.6996798515319824, "learning_rate": 4.977728285077951e-05, "loss": 3.0624, "step": 447 }, { "epoch": 0.29970314002592296, "grad_norm": 3.7724545001983643, "learning_rate": 4.988864142538976e-05, "loss": 2.8873, "step": 448 }, { "epoch": 0.30037212024919513, "grad_norm": 3.4690496921539307, "learning_rate": 5e-05, "loss": 3.0608, "step": 449 }, { "epoch": 0.3010411004724673, "grad_norm": 2.773916482925415, "learning_rate": 4.999999241504004e-05, "loss": 2.9176, "step": 450 }, { "epoch": 0.30171008069573946, "grad_norm": 5.8442816734313965, "learning_rate": 4.999996966016478e-05, "loss": 3.1301, "step": 451 }, { "epoch": 0.30237906091901157, "grad_norm": 5.579975605010986, "learning_rate": 4.9999931735387995e-05, "loss": 3.0301, "step": 452 }, { "epoch": 0.30304804114228373, "grad_norm": 5.85215425491333, "learning_rate": 4.999987864073273e-05, "loss": 3.0296, "step": 453 }, { "epoch": 0.3037170213655559, "grad_norm": 3.585082530975342, "learning_rate": 4.999981037623118e-05, "loss": 2.9487, "step": 454 }, { "epoch": 0.304386001588828, "grad_norm": 3.605626344680786, "learning_rate": 4.999972694192479e-05, "loss": 3.0024, "step": 455 }, { "epoch": 0.30505498181210017, "grad_norm": 6.199455738067627, "learning_rate": 4.999962833786417e-05, "loss": 3.354, "step": 456 }, { "epoch": 0.30572396203537233, "grad_norm": 4.334702014923096, "learning_rate": 4.999951456410915e-05, "loss": 3.1339, "step": 457 }, { "epoch": 0.3063929422586445, "grad_norm": 4.66254186630249, "learning_rate": 4.9999385620728776e-05, "loss": 3.1656, "step": 458 }, { "epoch": 0.3070619224819166, "grad_norm": 3.108375072479248, "learning_rate": 4.99992415078013e-05, "loss": 2.8593, "step": 459 }, { "epoch": 0.30773090270518877, "grad_norm": 4.193397521972656, "learning_rate": 4.9999082225414154e-05, "loss": 3.0536, "step": 460 }, { "epoch": 0.30839988292846093, "grad_norm": 3.317269802093506, "learning_rate": 4.9998907773663996e-05, "loss": 3.0542, "step": 461 }, { "epoch": 0.3090688631517331, "grad_norm": 3.166783332824707, "learning_rate": 4.9998718152656684e-05, "loss": 2.902, "step": 462 }, { "epoch": 0.3097378433750052, "grad_norm": 2.3117363452911377, "learning_rate": 4.9998513362507274e-05, "loss": 2.7614, "step": 463 }, { "epoch": 0.31040682359827737, "grad_norm": 4.291511058807373, "learning_rate": 4.999829340334003e-05, "loss": 3.2065, "step": 464 }, { "epoch": 0.31107580382154953, "grad_norm": 4.51165246963501, "learning_rate": 4.9998058275288435e-05, "loss": 2.9523, "step": 465 }, { "epoch": 0.3117447840448217, "grad_norm": 3.7631454467773438, "learning_rate": 4.9997807978495154e-05, "loss": 2.9064, "step": 466 }, { "epoch": 0.3124137642680938, "grad_norm": 4.295648574829102, "learning_rate": 4.999754251311207e-05, "loss": 2.8078, "step": 467 }, { "epoch": 0.31308274449136597, "grad_norm": 4.3560004234313965, "learning_rate": 4.9997261879300264e-05, "loss": 3.2831, "step": 468 }, { "epoch": 0.31375172471463814, "grad_norm": 4.275026798248291, "learning_rate": 4.999696607723003e-05, "loss": 2.8916, "step": 469 }, { "epoch": 0.3144207049379103, "grad_norm": 2.104069232940674, "learning_rate": 4.999665510708085e-05, "loss": 2.8333, "step": 470 }, { "epoch": 0.3150896851611824, "grad_norm": 3.4207301139831543, "learning_rate": 4.999632896904143e-05, "loss": 2.9654, "step": 471 }, { "epoch": 0.3157586653844546, "grad_norm": 4.362611293792725, "learning_rate": 4.9995987663309656e-05, "loss": 2.9717, "step": 472 }, { "epoch": 0.31642764560772674, "grad_norm": 3.473769426345825, "learning_rate": 4.999563119009264e-05, "loss": 3.0075, "step": 473 }, { "epoch": 0.31709662583099885, "grad_norm": 2.608717441558838, "learning_rate": 4.99952595496067e-05, "loss": 2.7839, "step": 474 }, { "epoch": 0.317765606054271, "grad_norm": 3.1809604167938232, "learning_rate": 4.9994872742077327e-05, "loss": 2.9022, "step": 475 }, { "epoch": 0.3184345862775432, "grad_norm": 3.7251923084259033, "learning_rate": 4.999447076773924e-05, "loss": 3.1133, "step": 476 }, { "epoch": 0.31910356650081534, "grad_norm": 2.886922597885132, "learning_rate": 4.999405362683636e-05, "loss": 3.0508, "step": 477 }, { "epoch": 0.31977254672408745, "grad_norm": 6.421143054962158, "learning_rate": 4.9993621319621804e-05, "loss": 3.0336, "step": 478 }, { "epoch": 0.3204415269473596, "grad_norm": 5.60941219329834, "learning_rate": 4.9993173846357896e-05, "loss": 3.0946, "step": 479 }, { "epoch": 0.3211105071706318, "grad_norm": 1.9423307180404663, "learning_rate": 4.9992711207316156e-05, "loss": 2.6631, "step": 480 }, { "epoch": 0.32177948739390394, "grad_norm": 3.3870861530303955, "learning_rate": 4.999223340277732e-05, "loss": 2.754, "step": 481 }, { "epoch": 0.32244846761717605, "grad_norm": 2.7448318004608154, "learning_rate": 4.999174043303132e-05, "loss": 2.9059, "step": 482 }, { "epoch": 0.3231174478404482, "grad_norm": 3.669349431991577, "learning_rate": 4.999123229837728e-05, "loss": 3.1572, "step": 483 }, { "epoch": 0.3237864280637204, "grad_norm": 5.015190601348877, "learning_rate": 4.999070899912353e-05, "loss": 3.1667, "step": 484 }, { "epoch": 0.32445540828699254, "grad_norm": 4.793431282043457, "learning_rate": 4.999017053558762e-05, "loss": 3.0626, "step": 485 }, { "epoch": 0.32512438851026465, "grad_norm": 2.3480074405670166, "learning_rate": 4.998961690809628e-05, "loss": 3.1293, "step": 486 }, { "epoch": 0.3257933687335368, "grad_norm": 3.8708577156066895, "learning_rate": 4.998904811698545e-05, "loss": 3.2092, "step": 487 }, { "epoch": 0.326462348956809, "grad_norm": 3.8278703689575195, "learning_rate": 4.998846416260028e-05, "loss": 2.8935, "step": 488 }, { "epoch": 0.3271313291800811, "grad_norm": 4.190570831298828, "learning_rate": 4.99878650452951e-05, "loss": 3.0671, "step": 489 }, { "epoch": 0.32780030940335325, "grad_norm": 3.4385035037994385, "learning_rate": 4.998725076543345e-05, "loss": 2.8071, "step": 490 }, { "epoch": 0.3284692896266254, "grad_norm": 3.1834115982055664, "learning_rate": 4.998662132338808e-05, "loss": 2.9962, "step": 491 }, { "epoch": 0.3291382698498976, "grad_norm": 4.123552322387695, "learning_rate": 4.9985976719540936e-05, "loss": 2.932, "step": 492 }, { "epoch": 0.3298072500731697, "grad_norm": 3.3126766681671143, "learning_rate": 4.998531695428316e-05, "loss": 2.968, "step": 493 }, { "epoch": 0.33047623029644185, "grad_norm": 4.15322732925415, "learning_rate": 4.99846420280151e-05, "loss": 3.1475, "step": 494 }, { "epoch": 0.331145210519714, "grad_norm": 3.0752909183502197, "learning_rate": 4.998395194114628e-05, "loss": 2.9903, "step": 495 }, { "epoch": 0.3318141907429862, "grad_norm": 4.031482696533203, "learning_rate": 4.9983246694095455e-05, "loss": 3.0917, "step": 496 }, { "epoch": 0.3324831709662583, "grad_norm": 2.93766450881958, "learning_rate": 4.998252628729058e-05, "loss": 2.8263, "step": 497 }, { "epoch": 0.33315215118953045, "grad_norm": 2.928539752960205, "learning_rate": 4.9981790721168767e-05, "loss": 3.0096, "step": 498 }, { "epoch": 0.3338211314128026, "grad_norm": 4.941032409667969, "learning_rate": 4.9981039996176375e-05, "loss": 2.9287, "step": 499 }, { "epoch": 0.3344901116360748, "grad_norm": 4.886608600616455, "learning_rate": 4.998027411276894e-05, "loss": 3.0736, "step": 500 }, { "epoch": 0.3351590918593469, "grad_norm": 4.183567047119141, "learning_rate": 4.997949307141119e-05, "loss": 2.8601, "step": 501 }, { "epoch": 0.33582807208261906, "grad_norm": 4.0794243812561035, "learning_rate": 4.997869687257707e-05, "loss": 3.002, "step": 502 }, { "epoch": 0.3364970523058912, "grad_norm": 6.189320087432861, "learning_rate": 4.99778855167497e-05, "loss": 2.928, "step": 503 }, { "epoch": 0.3371660325291634, "grad_norm": 3.896519184112549, "learning_rate": 4.997705900442141e-05, "loss": 3.0234, "step": 504 }, { "epoch": 0.3378350127524355, "grad_norm": 8.249415397644043, "learning_rate": 4.9976217336093726e-05, "loss": 2.8567, "step": 505 }, { "epoch": 0.33850399297570766, "grad_norm": 5.5367937088012695, "learning_rate": 4.997536051227738e-05, "loss": 2.9441, "step": 506 }, { "epoch": 0.3391729731989798, "grad_norm": 5.139379978179932, "learning_rate": 4.997448853349227e-05, "loss": 3.1405, "step": 507 }, { "epoch": 0.33984195342225193, "grad_norm": 4.751509189605713, "learning_rate": 4.997360140026752e-05, "loss": 2.8483, "step": 508 }, { "epoch": 0.3405109336455241, "grad_norm": 5.006605625152588, "learning_rate": 4.997269911314145e-05, "loss": 2.9661, "step": 509 }, { "epoch": 0.34117991386879626, "grad_norm": 3.2533388137817383, "learning_rate": 4.997178167266155e-05, "loss": 2.891, "step": 510 }, { "epoch": 0.3418488940920684, "grad_norm": 5.7418670654296875, "learning_rate": 4.9970849079384524e-05, "loss": 2.8222, "step": 511 }, { "epoch": 0.34251787431534053, "grad_norm": 5.2950005531311035, "learning_rate": 4.9969901333876264e-05, "loss": 2.9353, "step": 512 }, { "epoch": 0.3431868545386127, "grad_norm": 2.9090206623077393, "learning_rate": 4.996893843671187e-05, "loss": 2.7708, "step": 513 }, { "epoch": 0.34385583476188486, "grad_norm": 5.055557727813721, "learning_rate": 4.996796038847561e-05, "loss": 2.897, "step": 514 }, { "epoch": 0.344524814985157, "grad_norm": 5.402209281921387, "learning_rate": 4.996696718976098e-05, "loss": 3.1266, "step": 515 }, { "epoch": 0.34519379520842913, "grad_norm": 3.475196599960327, "learning_rate": 4.9965958841170626e-05, "loss": 2.911, "step": 516 }, { "epoch": 0.3458627754317013, "grad_norm": 5.288271427154541, "learning_rate": 4.9964935343316435e-05, "loss": 2.9801, "step": 517 }, { "epoch": 0.34653175565497346, "grad_norm": 5.355285167694092, "learning_rate": 4.9963896696819433e-05, "loss": 3.0884, "step": 518 }, { "epoch": 0.3472007358782456, "grad_norm": 6.395686149597168, "learning_rate": 4.99628429023099e-05, "loss": 3.1425, "step": 519 }, { "epoch": 0.34786971610151773, "grad_norm": 5.087702751159668, "learning_rate": 4.9961773960427246e-05, "loss": 2.9261, "step": 520 }, { "epoch": 0.3485386963247899, "grad_norm": 2.206958532333374, "learning_rate": 4.996068987182012e-05, "loss": 2.6969, "step": 521 }, { "epoch": 0.34920767654806206, "grad_norm": 3.5213186740875244, "learning_rate": 4.995959063714634e-05, "loss": 2.6573, "step": 522 }, { "epoch": 0.34987665677133417, "grad_norm": 5.5672807693481445, "learning_rate": 4.9958476257072914e-05, "loss": 3.1532, "step": 523 }, { "epoch": 0.35054563699460634, "grad_norm": 3.522601842880249, "learning_rate": 4.995734673227605e-05, "loss": 2.826, "step": 524 }, { "epoch": 0.3512146172178785, "grad_norm": 9.747411727905273, "learning_rate": 4.9956202063441135e-05, "loss": 3.0387, "step": 525 }, { "epoch": 0.35188359744115066, "grad_norm": 4.434753894805908, "learning_rate": 4.995504225126275e-05, "loss": 3.0938, "step": 526 }, { "epoch": 0.3525525776644228, "grad_norm": 4.533684253692627, "learning_rate": 4.9953867296444665e-05, "loss": 2.9057, "step": 527 }, { "epoch": 0.35322155788769494, "grad_norm": 3.0737051963806152, "learning_rate": 4.9952677199699846e-05, "loss": 2.9397, "step": 528 }, { "epoch": 0.3538905381109671, "grad_norm": 4.464580535888672, "learning_rate": 4.995147196175044e-05, "loss": 2.8025, "step": 529 }, { "epoch": 0.35455951833423927, "grad_norm": 7.038727760314941, "learning_rate": 4.9950251583327767e-05, "loss": 3.2705, "step": 530 }, { "epoch": 0.3552284985575114, "grad_norm": 3.4099175930023193, "learning_rate": 4.994901606517236e-05, "loss": 2.9136, "step": 531 }, { "epoch": 0.35589747878078354, "grad_norm": 5.16508674621582, "learning_rate": 4.994776540803393e-05, "loss": 2.9786, "step": 532 }, { "epoch": 0.3565664590040557, "grad_norm": 3.2725794315338135, "learning_rate": 4.994649961267136e-05, "loss": 2.8186, "step": 533 }, { "epoch": 0.35723543922732787, "grad_norm": 2.896885633468628, "learning_rate": 4.994521867985275e-05, "loss": 3.0125, "step": 534 }, { "epoch": 0.3579044194506, "grad_norm": 6.460264682769775, "learning_rate": 4.994392261035534e-05, "loss": 3.0882, "step": 535 }, { "epoch": 0.35857339967387214, "grad_norm": 3.8949625492095947, "learning_rate": 4.994261140496561e-05, "loss": 2.8898, "step": 536 }, { "epoch": 0.3592423798971443, "grad_norm": 4.002209186553955, "learning_rate": 4.9941285064479165e-05, "loss": 3.2028, "step": 537 }, { "epoch": 0.3599113601204164, "grad_norm": 3.564258575439453, "learning_rate": 4.9939943589700845e-05, "loss": 3.1001, "step": 538 }, { "epoch": 0.3605803403436886, "grad_norm": 2.9036061763763428, "learning_rate": 4.9938586981444647e-05, "loss": 2.9339, "step": 539 }, { "epoch": 0.36124932056696074, "grad_norm": 5.254669666290283, "learning_rate": 4.9937215240533757e-05, "loss": 2.8677, "step": 540 }, { "epoch": 0.3619183007902329, "grad_norm": 3.252579927444458, "learning_rate": 4.9935828367800544e-05, "loss": 3.0247, "step": 541 }, { "epoch": 0.362587281013505, "grad_norm": 4.90103006362915, "learning_rate": 4.9934426364086554e-05, "loss": 3.0836, "step": 542 }, { "epoch": 0.3632562612367772, "grad_norm": 4.381466865539551, "learning_rate": 4.9933009230242524e-05, "loss": 3.1326, "step": 543 }, { "epoch": 0.36392524146004934, "grad_norm": 2.9965758323669434, "learning_rate": 4.993157696712836e-05, "loss": 2.8662, "step": 544 }, { "epoch": 0.3645942216833215, "grad_norm": 3.2848966121673584, "learning_rate": 4.9930129575613156e-05, "loss": 2.8297, "step": 545 }, { "epoch": 0.3652632019065936, "grad_norm": 5.529489994049072, "learning_rate": 4.9928667056575185e-05, "loss": 2.9997, "step": 546 }, { "epoch": 0.3659321821298658, "grad_norm": 4.317118167877197, "learning_rate": 4.9927189410901905e-05, "loss": 3.0713, "step": 547 }, { "epoch": 0.36660116235313794, "grad_norm": 3.1297154426574707, "learning_rate": 4.992569663948994e-05, "loss": 2.6562, "step": 548 }, { "epoch": 0.3672701425764101, "grad_norm": 3.4097821712493896, "learning_rate": 4.992418874324509e-05, "loss": 2.7812, "step": 549 }, { "epoch": 0.3679391227996822, "grad_norm": 3.653111696243286, "learning_rate": 4.992266572308237e-05, "loss": 2.9268, "step": 550 }, { "epoch": 0.3686081030229544, "grad_norm": 4.445350170135498, "learning_rate": 4.992112757992591e-05, "loss": 3.0955, "step": 551 }, { "epoch": 0.36927708324622655, "grad_norm": 5.325762748718262, "learning_rate": 4.991957431470908e-05, "loss": 2.9877, "step": 552 }, { "epoch": 0.3699460634694987, "grad_norm": 3.6300137042999268, "learning_rate": 4.991800592837438e-05, "loss": 2.9609, "step": 553 }, { "epoch": 0.3706150436927708, "grad_norm": 3.8826160430908203, "learning_rate": 4.9916422421873496e-05, "loss": 2.9752, "step": 554 }, { "epoch": 0.371284023916043, "grad_norm": 3.5641696453094482, "learning_rate": 4.991482379616731e-05, "loss": 2.7834, "step": 555 }, { "epoch": 0.37195300413931515, "grad_norm": 3.369718074798584, "learning_rate": 4.991321005222585e-05, "loss": 2.9023, "step": 556 }, { "epoch": 0.37262198436258726, "grad_norm": 3.8720474243164062, "learning_rate": 4.991158119102834e-05, "loss": 3.0678, "step": 557 }, { "epoch": 0.3732909645858594, "grad_norm": 3.0319697856903076, "learning_rate": 4.9909937213563165e-05, "loss": 2.8119, "step": 558 }, { "epoch": 0.3739599448091316, "grad_norm": 5.15587043762207, "learning_rate": 4.9908278120827886e-05, "loss": 2.9111, "step": 559 }, { "epoch": 0.37462892503240375, "grad_norm": 3.232497215270996, "learning_rate": 4.990660391382923e-05, "loss": 2.9465, "step": 560 }, { "epoch": 0.37529790525567586, "grad_norm": 4.08522891998291, "learning_rate": 4.990491459358311e-05, "loss": 3.0208, "step": 561 }, { "epoch": 0.375966885478948, "grad_norm": 3.3792853355407715, "learning_rate": 4.990321016111459e-05, "loss": 2.92, "step": 562 }, { "epoch": 0.3766358657022202, "grad_norm": 3.6902658939361572, "learning_rate": 4.990149061745791e-05, "loss": 3.0366, "step": 563 }, { "epoch": 0.37730484592549235, "grad_norm": 4.229647636413574, "learning_rate": 4.9899755963656506e-05, "loss": 3.0959, "step": 564 }, { "epoch": 0.37797382614876446, "grad_norm": 5.087015151977539, "learning_rate": 4.989800620076295e-05, "loss": 2.8492, "step": 565 }, { "epoch": 0.3786428063720366, "grad_norm": 6.532529354095459, "learning_rate": 4.989624132983898e-05, "loss": 2.9678, "step": 566 }, { "epoch": 0.3793117865953088, "grad_norm": 2.5950136184692383, "learning_rate": 4.989446135195553e-05, "loss": 2.9949, "step": 567 }, { "epoch": 0.37998076681858095, "grad_norm": 4.946874618530273, "learning_rate": 4.989266626819266e-05, "loss": 2.8929, "step": 568 }, { "epoch": 0.38064974704185306, "grad_norm": 3.0101711750030518, "learning_rate": 4.989085607963965e-05, "loss": 2.8305, "step": 569 }, { "epoch": 0.3813187272651252, "grad_norm": 3.235175848007202, "learning_rate": 4.988903078739491e-05, "loss": 2.9382, "step": 570 }, { "epoch": 0.3819877074883974, "grad_norm": 5.849483966827393, "learning_rate": 4.988719039256601e-05, "loss": 3.1833, "step": 571 }, { "epoch": 0.3826566877116695, "grad_norm": 3.72119140625, "learning_rate": 4.9885334896269707e-05, "loss": 2.8637, "step": 572 }, { "epoch": 0.38332566793494166, "grad_norm": 5.516154766082764, "learning_rate": 4.988346429963191e-05, "loss": 3.1849, "step": 573 }, { "epoch": 0.3839946481582138, "grad_norm": 4.963644981384277, "learning_rate": 4.9881578603787684e-05, "loss": 3.0707, "step": 574 }, { "epoch": 0.384663628381486, "grad_norm": 4.417355060577393, "learning_rate": 4.987967780988126e-05, "loss": 3.0208, "step": 575 }, { "epoch": 0.3853326086047581, "grad_norm": 5.166300296783447, "learning_rate": 4.9877761919066044e-05, "loss": 3.1407, "step": 576 }, { "epoch": 0.38600158882803026, "grad_norm": 5.213954925537109, "learning_rate": 4.98758309325046e-05, "loss": 3.0561, "step": 577 }, { "epoch": 0.3866705690513024, "grad_norm": 4.354947566986084, "learning_rate": 4.987388485136861e-05, "loss": 2.8773, "step": 578 }, { "epoch": 0.3873395492745746, "grad_norm": 3.1227710247039795, "learning_rate": 4.9871923676838985e-05, "loss": 2.8522, "step": 579 }, { "epoch": 0.3880085294978467, "grad_norm": 4.111440658569336, "learning_rate": 4.986994741010576e-05, "loss": 3.0179, "step": 580 }, { "epoch": 0.38867750972111886, "grad_norm": 6.039926052093506, "learning_rate": 4.9867956052368094e-05, "loss": 3.0055, "step": 581 }, { "epoch": 0.389346489944391, "grad_norm": 3.7227399349212646, "learning_rate": 4.986594960483436e-05, "loss": 3.043, "step": 582 }, { "epoch": 0.3900154701676632, "grad_norm": 4.943892002105713, "learning_rate": 4.9863928068722065e-05, "loss": 2.8667, "step": 583 }, { "epoch": 0.3906844503909353, "grad_norm": 4.038351535797119, "learning_rate": 4.986189144525787e-05, "loss": 2.9695, "step": 584 }, { "epoch": 0.39135343061420746, "grad_norm": 4.159579277038574, "learning_rate": 4.9859839735677585e-05, "loss": 3.0379, "step": 585 }, { "epoch": 0.39202241083747963, "grad_norm": 3.6894772052764893, "learning_rate": 4.9857772941226174e-05, "loss": 2.8926, "step": 586 }, { "epoch": 0.3926913910607518, "grad_norm": 3.6522088050842285, "learning_rate": 4.9855691063157785e-05, "loss": 3.0332, "step": 587 }, { "epoch": 0.3933603712840239, "grad_norm": 5.894712448120117, "learning_rate": 4.9853594102735674e-05, "loss": 3.0254, "step": 588 }, { "epoch": 0.39402935150729607, "grad_norm": 4.821498870849609, "learning_rate": 4.985148206123228e-05, "loss": 3.232, "step": 589 }, { "epoch": 0.39469833173056823, "grad_norm": 4.994937419891357, "learning_rate": 4.9849354939929177e-05, "loss": 3.1466, "step": 590 }, { "epoch": 0.39536731195384034, "grad_norm": 4.068504333496094, "learning_rate": 4.98472127401171e-05, "loss": 2.9745, "step": 591 }, { "epoch": 0.3960362921771125, "grad_norm": 2.8930795192718506, "learning_rate": 4.984505546309592e-05, "loss": 3.0703, "step": 592 }, { "epoch": 0.39670527240038467, "grad_norm": 4.69880485534668, "learning_rate": 4.984288311017469e-05, "loss": 2.9232, "step": 593 }, { "epoch": 0.39737425262365683, "grad_norm": 2.9983294010162354, "learning_rate": 4.9840695682671555e-05, "loss": 2.9689, "step": 594 }, { "epoch": 0.39804323284692894, "grad_norm": 2.860313653945923, "learning_rate": 4.983849318191386e-05, "loss": 2.9432, "step": 595 }, { "epoch": 0.3987122130702011, "grad_norm": 2.924636125564575, "learning_rate": 4.983627560923807e-05, "loss": 2.8894, "step": 596 }, { "epoch": 0.39938119329347327, "grad_norm": 4.200955867767334, "learning_rate": 4.983404296598979e-05, "loss": 2.9354, "step": 597 }, { "epoch": 0.40005017351674543, "grad_norm": 3.3520474433898926, "learning_rate": 4.9831795253523804e-05, "loss": 2.8986, "step": 598 }, { "epoch": 0.40071915374001754, "grad_norm": 3.76237416267395, "learning_rate": 4.9829532473204e-05, "loss": 2.9264, "step": 599 }, { "epoch": 0.4013881339632897, "grad_norm": 4.730940818786621, "learning_rate": 4.9827254626403433e-05, "loss": 3.2129, "step": 600 }, { "epoch": 0.40205711418656187, "grad_norm": 3.7802793979644775, "learning_rate": 4.982496171450428e-05, "loss": 2.8529, "step": 601 }, { "epoch": 0.40272609440983403, "grad_norm": 3.456836462020874, "learning_rate": 4.982265373889789e-05, "loss": 2.9196, "step": 602 }, { "epoch": 0.40339507463310614, "grad_norm": 3.481623411178589, "learning_rate": 4.982033070098472e-05, "loss": 2.7904, "step": 603 }, { "epoch": 0.4040640548563783, "grad_norm": 3.897667169570923, "learning_rate": 4.981799260217439e-05, "loss": 3.1093, "step": 604 }, { "epoch": 0.40473303507965047, "grad_norm": 3.733328104019165, "learning_rate": 4.9815639443885656e-05, "loss": 3.0469, "step": 605 }, { "epoch": 0.4054020153029226, "grad_norm": 2.8006529808044434, "learning_rate": 4.9813271227546396e-05, "loss": 2.7381, "step": 606 }, { "epoch": 0.40607099552619474, "grad_norm": 3.6003527641296387, "learning_rate": 4.9810887954593655e-05, "loss": 2.8827, "step": 607 }, { "epoch": 0.4067399757494669, "grad_norm": 4.127907752990723, "learning_rate": 4.980848962647356e-05, "loss": 2.5829, "step": 608 }, { "epoch": 0.4074089559727391, "grad_norm": 5.597202301025391, "learning_rate": 4.980607624464145e-05, "loss": 2.97, "step": 609 }, { "epoch": 0.4080779361960112, "grad_norm": 3.340439558029175, "learning_rate": 4.9803647810561735e-05, "loss": 3.037, "step": 610 }, { "epoch": 0.40874691641928335, "grad_norm": 3.1540212631225586, "learning_rate": 4.980120432570797e-05, "loss": 2.9054, "step": 611 }, { "epoch": 0.4094158966425555, "grad_norm": 4.93973445892334, "learning_rate": 4.9798745791562885e-05, "loss": 2.9211, "step": 612 }, { "epoch": 0.4100848768658277, "grad_norm": 2.2099862098693848, "learning_rate": 4.9796272209618286e-05, "loss": 2.6368, "step": 613 }, { "epoch": 0.4107538570890998, "grad_norm": 4.601409912109375, "learning_rate": 4.979378358137514e-05, "loss": 2.8993, "step": 614 }, { "epoch": 0.41142283731237195, "grad_norm": 5.1786675453186035, "learning_rate": 4.979127990834354e-05, "loss": 3.0772, "step": 615 }, { "epoch": 0.4120918175356441, "grad_norm": 3.350377321243286, "learning_rate": 4.978876119204271e-05, "loss": 2.8451, "step": 616 }, { "epoch": 0.4127607977589163, "grad_norm": 5.280319690704346, "learning_rate": 4.9786227434000995e-05, "loss": 3.1549, "step": 617 }, { "epoch": 0.4134297779821884, "grad_norm": 4.424160003662109, "learning_rate": 4.978367863575587e-05, "loss": 2.9093, "step": 618 }, { "epoch": 0.41409875820546055, "grad_norm": 7.253220558166504, "learning_rate": 4.9781114798853945e-05, "loss": 3.2784, "step": 619 }, { "epoch": 0.4147677384287327, "grad_norm": 2.733386278152466, "learning_rate": 4.977853592485094e-05, "loss": 2.7615, "step": 620 }, { "epoch": 0.4154367186520049, "grad_norm": 5.424954414367676, "learning_rate": 4.9775942015311713e-05, "loss": 3.0018, "step": 621 }, { "epoch": 0.416105698875277, "grad_norm": 2.8420839309692383, "learning_rate": 4.9773333071810244e-05, "loss": 3.0407, "step": 622 }, { "epoch": 0.41677467909854915, "grad_norm": 3.767392635345459, "learning_rate": 4.977070909592961e-05, "loss": 2.8276, "step": 623 }, { "epoch": 0.4174436593218213, "grad_norm": 5.107291221618652, "learning_rate": 4.976807008926206e-05, "loss": 3.0319, "step": 624 }, { "epoch": 0.4181126395450934, "grad_norm": 3.7375893592834473, "learning_rate": 4.976541605340892e-05, "loss": 2.961, "step": 625 }, { "epoch": 0.4187816197683656, "grad_norm": 8.229865074157715, "learning_rate": 4.976274698998065e-05, "loss": 2.9438, "step": 626 }, { "epoch": 0.41945059999163775, "grad_norm": 4.140076160430908, "learning_rate": 4.9760062900596825e-05, "loss": 3.0025, "step": 627 }, { "epoch": 0.4201195802149099, "grad_norm": 4.103034496307373, "learning_rate": 4.9757363786886145e-05, "loss": 2.864, "step": 628 }, { "epoch": 0.420788560438182, "grad_norm": 3.0549144744873047, "learning_rate": 4.975464965048644e-05, "loss": 2.9363, "step": 629 }, { "epoch": 0.4214575406614542, "grad_norm": 5.202071189880371, "learning_rate": 4.975192049304461e-05, "loss": 2.9154, "step": 630 }, { "epoch": 0.42212652088472635, "grad_norm": 3.9111521244049072, "learning_rate": 4.974917631621673e-05, "loss": 2.7795, "step": 631 }, { "epoch": 0.4227955011079985, "grad_norm": 5.800964832305908, "learning_rate": 4.974641712166793e-05, "loss": 2.85, "step": 632 }, { "epoch": 0.4234644813312706, "grad_norm": 5.897298336029053, "learning_rate": 4.97436429110725e-05, "loss": 2.5939, "step": 633 }, { "epoch": 0.4241334615545428, "grad_norm": 3.8999950885772705, "learning_rate": 4.974085368611381e-05, "loss": 2.8274, "step": 634 }, { "epoch": 0.42480244177781495, "grad_norm": 3.5932559967041016, "learning_rate": 4.973804944848437e-05, "loss": 2.7909, "step": 635 }, { "epoch": 0.4254714220010871, "grad_norm": 5.648451328277588, "learning_rate": 4.973523019988575e-05, "loss": 3.0426, "step": 636 }, { "epoch": 0.4261404022243592, "grad_norm": 4.077558517456055, "learning_rate": 4.973239594202869e-05, "loss": 2.8152, "step": 637 }, { "epoch": 0.4268093824476314, "grad_norm": 3.34771728515625, "learning_rate": 4.9729546676633e-05, "loss": 2.7797, "step": 638 }, { "epoch": 0.42747836267090356, "grad_norm": 4.4564690589904785, "learning_rate": 4.972668240542761e-05, "loss": 2.7314, "step": 639 }, { "epoch": 0.42814734289417566, "grad_norm": 4.631012916564941, "learning_rate": 4.972380313015054e-05, "loss": 3.1524, "step": 640 }, { "epoch": 0.42881632311744783, "grad_norm": 10.560107231140137, "learning_rate": 4.972090885254893e-05, "loss": 3.1603, "step": 641 }, { "epoch": 0.42948530334072, "grad_norm": 3.9643378257751465, "learning_rate": 4.9717999574379016e-05, "loss": 2.7712, "step": 642 }, { "epoch": 0.43015428356399216, "grad_norm": 3.4705636501312256, "learning_rate": 4.971507529740614e-05, "loss": 2.8451, "step": 643 }, { "epoch": 0.43082326378726427, "grad_norm": 4.629807949066162, "learning_rate": 4.971213602340475e-05, "loss": 2.7529, "step": 644 }, { "epoch": 0.43149224401053643, "grad_norm": 4.530109405517578, "learning_rate": 4.970918175415838e-05, "loss": 2.9651, "step": 645 }, { "epoch": 0.4321612242338086, "grad_norm": 4.763259410858154, "learning_rate": 4.9706212491459685e-05, "loss": 2.8411, "step": 646 }, { "epoch": 0.43283020445708076, "grad_norm": 3.4377224445343018, "learning_rate": 4.9703228237110386e-05, "loss": 2.9072, "step": 647 }, { "epoch": 0.43349918468035287, "grad_norm": 9.420544624328613, "learning_rate": 4.970022899292133e-05, "loss": 3.0431, "step": 648 }, { "epoch": 0.43416816490362503, "grad_norm": 5.205924034118652, "learning_rate": 4.969721476071244e-05, "loss": 2.9704, "step": 649 }, { "epoch": 0.4348371451268972, "grad_norm": 3.097839832305908, "learning_rate": 4.9694185542312745e-05, "loss": 2.8079, "step": 650 }, { "epoch": 0.43550612535016936, "grad_norm": 4.139601707458496, "learning_rate": 4.969114133956037e-05, "loss": 2.8865, "step": 651 }, { "epoch": 0.43617510557344147, "grad_norm": 6.326453685760498, "learning_rate": 4.968808215430253e-05, "loss": 2.8658, "step": 652 }, { "epoch": 0.43684408579671363, "grad_norm": 3.709042549133301, "learning_rate": 4.968500798839552e-05, "loss": 2.9125, "step": 653 }, { "epoch": 0.4375130660199858, "grad_norm": 4.608620643615723, "learning_rate": 4.968191884370474e-05, "loss": 2.859, "step": 654 }, { "epoch": 0.43818204624325796, "grad_norm": 5.535547733306885, "learning_rate": 4.967881472210467e-05, "loss": 2.9336, "step": 655 }, { "epoch": 0.43885102646653007, "grad_norm": 4.740244388580322, "learning_rate": 4.967569562547888e-05, "loss": 3.0713, "step": 656 }, { "epoch": 0.43952000668980223, "grad_norm": 4.269866466522217, "learning_rate": 4.967256155572003e-05, "loss": 3.065, "step": 657 }, { "epoch": 0.4401889869130744, "grad_norm": 4.712810516357422, "learning_rate": 4.966941251472986e-05, "loss": 3.1094, "step": 658 }, { "epoch": 0.4408579671363465, "grad_norm": 4.342670917510986, "learning_rate": 4.966624850441921e-05, "loss": 3.1203, "step": 659 }, { "epoch": 0.44152694735961867, "grad_norm": 3.0770020484924316, "learning_rate": 4.966306952670798e-05, "loss": 2.9101, "step": 660 }, { "epoch": 0.44219592758289084, "grad_norm": 4.738711357116699, "learning_rate": 4.9659875583525165e-05, "loss": 2.8037, "step": 661 }, { "epoch": 0.442864907806163, "grad_norm": 4.434926986694336, "learning_rate": 4.9656666676808844e-05, "loss": 2.6684, "step": 662 }, { "epoch": 0.4435338880294351, "grad_norm": 2.662122964859009, "learning_rate": 4.9653442808506165e-05, "loss": 2.8781, "step": 663 }, { "epoch": 0.4442028682527073, "grad_norm": 4.463600158691406, "learning_rate": 4.965020398057336e-05, "loss": 3.0432, "step": 664 }, { "epoch": 0.44487184847597944, "grad_norm": 5.810666084289551, "learning_rate": 4.964695019497575e-05, "loss": 2.9434, "step": 665 }, { "epoch": 0.4455408286992516, "grad_norm": 4.265758037567139, "learning_rate": 4.964368145368772e-05, "loss": 2.8055, "step": 666 }, { "epoch": 0.4462098089225237, "grad_norm": 3.7653000354766846, "learning_rate": 4.9640397758692715e-05, "loss": 2.7077, "step": 667 }, { "epoch": 0.4468787891457959, "grad_norm": 5.089723110198975, "learning_rate": 4.963709911198329e-05, "loss": 3.1039, "step": 668 }, { "epoch": 0.44754776936906804, "grad_norm": 3.1420540809631348, "learning_rate": 4.9633785515561035e-05, "loss": 2.822, "step": 669 }, { "epoch": 0.4482167495923402, "grad_norm": 7.582685470581055, "learning_rate": 4.9630456971436654e-05, "loss": 2.915, "step": 670 }, { "epoch": 0.4488857298156123, "grad_norm": 4.884276866912842, "learning_rate": 4.962711348162987e-05, "loss": 2.8379, "step": 671 }, { "epoch": 0.4495547100388845, "grad_norm": 4.963716983795166, "learning_rate": 4.962375504816953e-05, "loss": 2.8788, "step": 672 }, { "epoch": 0.45022369026215664, "grad_norm": 6.727871417999268, "learning_rate": 4.9620381673093496e-05, "loss": 2.9308, "step": 673 }, { "epoch": 0.45089267048542875, "grad_norm": 3.4183032512664795, "learning_rate": 4.9616993358448734e-05, "loss": 2.7673, "step": 674 }, { "epoch": 0.4515616507087009, "grad_norm": 5.130782604217529, "learning_rate": 4.9613590106291266e-05, "loss": 2.746, "step": 675 }, { "epoch": 0.4522306309319731, "grad_norm": 5.305598735809326, "learning_rate": 4.9610171918686157e-05, "loss": 3.1816, "step": 676 }, { "epoch": 0.45289961115524524, "grad_norm": 5.105145454406738, "learning_rate": 4.960673879770757e-05, "loss": 2.9757, "step": 677 }, { "epoch": 0.45356859137851735, "grad_norm": 4.758883953094482, "learning_rate": 4.9603290745438703e-05, "loss": 2.7821, "step": 678 }, { "epoch": 0.4542375716017895, "grad_norm": 4.063161373138428, "learning_rate": 4.9599827763971825e-05, "loss": 3.003, "step": 679 }, { "epoch": 0.4549065518250617, "grad_norm": 4.086067199707031, "learning_rate": 4.9596349855408266e-05, "loss": 3.0175, "step": 680 }, { "epoch": 0.45557553204833384, "grad_norm": 3.486377477645874, "learning_rate": 4.9592857021858397e-05, "loss": 2.9731, "step": 681 }, { "epoch": 0.45624451227160595, "grad_norm": 4.757167816162109, "learning_rate": 4.958934926544168e-05, "loss": 3.0288, "step": 682 }, { "epoch": 0.4569134924948781, "grad_norm": 4.748201370239258, "learning_rate": 4.958582658828659e-05, "loss": 2.8637, "step": 683 }, { "epoch": 0.4575824727181503, "grad_norm": 5.692895889282227, "learning_rate": 4.958228899253068e-05, "loss": 3.1108, "step": 684 }, { "epoch": 0.45825145294142244, "grad_norm": 4.517188549041748, "learning_rate": 4.957873648032056e-05, "loss": 2.9381, "step": 685 }, { "epoch": 0.45892043316469455, "grad_norm": 7.702240467071533, "learning_rate": 4.9575169053811876e-05, "loss": 3.1256, "step": 686 }, { "epoch": 0.4595894133879667, "grad_norm": 6.084390640258789, "learning_rate": 4.957158671516934e-05, "loss": 2.9819, "step": 687 }, { "epoch": 0.4602583936112389, "grad_norm": 5.609198570251465, "learning_rate": 4.9567989466566686e-05, "loss": 3.296, "step": 688 }, { "epoch": 0.46092737383451104, "grad_norm": 4.34131383895874, "learning_rate": 4.956437731018674e-05, "loss": 2.8, "step": 689 }, { "epoch": 0.46159635405778315, "grad_norm": 4.470595359802246, "learning_rate": 4.956075024822132e-05, "loss": 2.9731, "step": 690 }, { "epoch": 0.4622653342810553, "grad_norm": 4.090426445007324, "learning_rate": 4.955710828287133e-05, "loss": 2.7561, "step": 691 }, { "epoch": 0.4629343145043275, "grad_norm": 4.911933422088623, "learning_rate": 4.95534514163467e-05, "loss": 2.9921, "step": 692 }, { "epoch": 0.4636032947275996, "grad_norm": 3.661410331726074, "learning_rate": 4.95497796508664e-05, "loss": 2.9448, "step": 693 }, { "epoch": 0.46427227495087176, "grad_norm": 3.1984846591949463, "learning_rate": 4.9546092988658457e-05, "loss": 2.7455, "step": 694 }, { "epoch": 0.4649412551741439, "grad_norm": 3.988065242767334, "learning_rate": 4.954239143195991e-05, "loss": 2.7055, "step": 695 }, { "epoch": 0.4656102353974161, "grad_norm": 4.223339080810547, "learning_rate": 4.9538674983016866e-05, "loss": 2.7932, "step": 696 }, { "epoch": 0.4662792156206882, "grad_norm": 2.925285816192627, "learning_rate": 4.953494364408445e-05, "loss": 3.1801, "step": 697 }, { "epoch": 0.46694819584396036, "grad_norm": 3.728097915649414, "learning_rate": 4.9531197417426825e-05, "loss": 3.066, "step": 698 }, { "epoch": 0.4676171760672325, "grad_norm": 3.7567453384399414, "learning_rate": 4.9527436305317195e-05, "loss": 2.7462, "step": 699 }, { "epoch": 0.4682861562905047, "grad_norm": 4.21150541305542, "learning_rate": 4.952366031003778e-05, "loss": 2.9276, "step": 700 }, { "epoch": 0.4689551365137768, "grad_norm": 4.783014297485352, "learning_rate": 4.951986943387984e-05, "loss": 3.1362, "step": 701 }, { "epoch": 0.46962411673704896, "grad_norm": 3.026461601257324, "learning_rate": 4.951606367914369e-05, "loss": 3.0478, "step": 702 }, { "epoch": 0.4702930969603211, "grad_norm": 5.113108158111572, "learning_rate": 4.951224304813862e-05, "loss": 3.2145, "step": 703 }, { "epoch": 0.4709620771835933, "grad_norm": 3.4587903022766113, "learning_rate": 4.950840754318299e-05, "loss": 2.9438, "step": 704 }, { "epoch": 0.4716310574068654, "grad_norm": 3.0687296390533447, "learning_rate": 4.950455716660418e-05, "loss": 2.8577, "step": 705 }, { "epoch": 0.47230003763013756, "grad_norm": 3.2031595706939697, "learning_rate": 4.950069192073857e-05, "loss": 3.0442, "step": 706 }, { "epoch": 0.4729690178534097, "grad_norm": 3.09106183052063, "learning_rate": 4.9496811807931596e-05, "loss": 3.0287, "step": 707 }, { "epoch": 0.47363799807668183, "grad_norm": 3.3463358879089355, "learning_rate": 4.949291683053769e-05, "loss": 2.9155, "step": 708 }, { "epoch": 0.474306978299954, "grad_norm": 3.321120023727417, "learning_rate": 4.948900699092031e-05, "loss": 2.8729, "step": 709 }, { "epoch": 0.47497595852322616, "grad_norm": 4.3413286209106445, "learning_rate": 4.948508229145194e-05, "loss": 2.813, "step": 710 }, { "epoch": 0.4756449387464983, "grad_norm": 2.997468948364258, "learning_rate": 4.948114273451405e-05, "loss": 2.9919, "step": 711 }, { "epoch": 0.47631391896977043, "grad_norm": 4.761056900024414, "learning_rate": 4.947718832249719e-05, "loss": 3.0345, "step": 712 }, { "epoch": 0.4769828991930426, "grad_norm": 6.336904048919678, "learning_rate": 4.9473219057800855e-05, "loss": 3.2108, "step": 713 }, { "epoch": 0.47765187941631476, "grad_norm": 2.752504348754883, "learning_rate": 4.94692349428336e-05, "loss": 2.7977, "step": 714 }, { "epoch": 0.4783208596395869, "grad_norm": 4.846428394317627, "learning_rate": 4.9465235980012964e-05, "loss": 2.8033, "step": 715 }, { "epoch": 0.47898983986285903, "grad_norm": 5.727593421936035, "learning_rate": 4.946122217176551e-05, "loss": 2.9128, "step": 716 }, { "epoch": 0.4796588200861312, "grad_norm": 4.22546911239624, "learning_rate": 4.945719352052679e-05, "loss": 2.9922, "step": 717 }, { "epoch": 0.48032780030940336, "grad_norm": 3.6872398853302, "learning_rate": 4.94531500287414e-05, "loss": 2.7136, "step": 718 }, { "epoch": 0.4809967805326755, "grad_norm": 5.122034072875977, "learning_rate": 4.94490916988629e-05, "loss": 3.047, "step": 719 }, { "epoch": 0.48166576075594764, "grad_norm": 3.345089912414551, "learning_rate": 4.944501853335387e-05, "loss": 2.8755, "step": 720 }, { "epoch": 0.4823347409792198, "grad_norm": 3.690261125564575, "learning_rate": 4.9440930534685914e-05, "loss": 2.9244, "step": 721 }, { "epoch": 0.48300372120249196, "grad_norm": 3.8499228954315186, "learning_rate": 4.9436827705339597e-05, "loss": 2.9331, "step": 722 }, { "epoch": 0.4836727014257641, "grad_norm": 5.345366477966309, "learning_rate": 4.94327100478045e-05, "loss": 2.984, "step": 723 }, { "epoch": 0.48434168164903624, "grad_norm": 3.7239580154418945, "learning_rate": 4.9428577564579227e-05, "loss": 3.0778, "step": 724 }, { "epoch": 0.4850106618723084, "grad_norm": 4.34011173248291, "learning_rate": 4.942443025817133e-05, "loss": 3.0008, "step": 725 }, { "epoch": 0.48567964209558057, "grad_norm": 5.08994197845459, "learning_rate": 4.94202681310974e-05, "loss": 3.0557, "step": 726 }, { "epoch": 0.4863486223188527, "grad_norm": 5.373446464538574, "learning_rate": 4.9416091185883e-05, "loss": 3.2981, "step": 727 }, { "epoch": 0.48701760254212484, "grad_norm": 3.0065951347351074, "learning_rate": 4.9411899425062665e-05, "loss": 2.9907, "step": 728 }, { "epoch": 0.487686582765397, "grad_norm": 3.025444269180298, "learning_rate": 4.9407692851179976e-05, "loss": 3.0356, "step": 729 }, { "epoch": 0.48835556298866917, "grad_norm": 2.8892788887023926, "learning_rate": 4.9403471466787446e-05, "loss": 2.8412, "step": 730 }, { "epoch": 0.4890245432119413, "grad_norm": 3.6930127143859863, "learning_rate": 4.93992352744466e-05, "loss": 2.8324, "step": 731 }, { "epoch": 0.48969352343521344, "grad_norm": 5.3692193031311035, "learning_rate": 4.9394984276727954e-05, "loss": 3.2483, "step": 732 }, { "epoch": 0.4903625036584856, "grad_norm": 3.029644727706909, "learning_rate": 4.9390718476210994e-05, "loss": 2.7399, "step": 733 }, { "epoch": 0.49103148388175777, "grad_norm": 4.040469646453857, "learning_rate": 4.9386437875484194e-05, "loss": 2.9284, "step": 734 }, { "epoch": 0.4917004641050299, "grad_norm": 5.720209121704102, "learning_rate": 4.938214247714501e-05, "loss": 2.7779, "step": 735 }, { "epoch": 0.49236944432830204, "grad_norm": 3.9115052223205566, "learning_rate": 4.937783228379988e-05, "loss": 2.7891, "step": 736 }, { "epoch": 0.4930384245515742, "grad_norm": 5.581562042236328, "learning_rate": 4.937350729806421e-05, "loss": 3.0304, "step": 737 }, { "epoch": 0.49370740477484637, "grad_norm": 3.564924955368042, "learning_rate": 4.9369167522562385e-05, "loss": 2.9908, "step": 738 }, { "epoch": 0.4943763849981185, "grad_norm": 6.215388774871826, "learning_rate": 4.9364812959927773e-05, "loss": 3.0139, "step": 739 }, { "epoch": 0.49504536522139064, "grad_norm": 3.3125667572021484, "learning_rate": 4.936044361280271e-05, "loss": 2.7297, "step": 740 }, { "epoch": 0.4957143454446628, "grad_norm": 3.6967480182647705, "learning_rate": 4.9356059483838495e-05, "loss": 3.0106, "step": 741 }, { "epoch": 0.4963833256679349, "grad_norm": 3.230792760848999, "learning_rate": 4.935166057569541e-05, "loss": 2.9077, "step": 742 }, { "epoch": 0.4970523058912071, "grad_norm": 4.57341194152832, "learning_rate": 4.9347246891042685e-05, "loss": 3.0735, "step": 743 }, { "epoch": 0.49772128611447924, "grad_norm": 2.9976770877838135, "learning_rate": 4.934281843255855e-05, "loss": 2.9161, "step": 744 }, { "epoch": 0.4983902663377514, "grad_norm": 6.950145244598389, "learning_rate": 4.933837520293017e-05, "loss": 2.9473, "step": 745 }, { "epoch": 0.4990592465610235, "grad_norm": 3.205528497695923, "learning_rate": 4.933391720485368e-05, "loss": 3.0744, "step": 746 }, { "epoch": 0.4997282267842957, "grad_norm": 5.989559173583984, "learning_rate": 4.932944444103418e-05, "loss": 3.0021, "step": 747 }, { "epoch": 0.5003972070075678, "grad_norm": 9.719673156738281, "learning_rate": 4.9324956914185725e-05, "loss": 3.17, "step": 748 }, { "epoch": 0.50106618723084, "grad_norm": 3.216552972793579, "learning_rate": 4.932045462703134e-05, "loss": 2.7118, "step": 749 }, { "epoch": 0.5017351674541122, "grad_norm": 3.494694471359253, "learning_rate": 4.9315937582303e-05, "loss": 2.9572, "step": 750 }, { "epoch": 0.5024041476773843, "grad_norm": 3.0318126678466797, "learning_rate": 4.931140578274162e-05, "loss": 3.0735, "step": 751 }, { "epoch": 0.5030731279006564, "grad_norm": 5.392656326293945, "learning_rate": 4.930685923109709e-05, "loss": 2.7952, "step": 752 }, { "epoch": 0.5037421081239286, "grad_norm": 2.4600045680999756, "learning_rate": 4.930229793012825e-05, "loss": 2.7525, "step": 753 }, { "epoch": 0.5044110883472007, "grad_norm": 3.9268031120300293, "learning_rate": 4.929772188260287e-05, "loss": 2.872, "step": 754 }, { "epoch": 0.5050800685704729, "grad_norm": 4.539796829223633, "learning_rate": 4.9293131091297686e-05, "loss": 2.8895, "step": 755 }, { "epoch": 0.505749048793745, "grad_norm": 2.849412202835083, "learning_rate": 4.928852555899838e-05, "loss": 2.8359, "step": 756 }, { "epoch": 0.5064180290170172, "grad_norm": 5.098194122314453, "learning_rate": 4.928390528849957e-05, "loss": 2.9828, "step": 757 }, { "epoch": 0.5070870092402894, "grad_norm": 6.503052711486816, "learning_rate": 4.927927028260482e-05, "loss": 2.9446, "step": 758 }, { "epoch": 0.5077559894635615, "grad_norm": 4.589889049530029, "learning_rate": 4.9274620544126625e-05, "loss": 2.7252, "step": 759 }, { "epoch": 0.5084249696868336, "grad_norm": 6.710662841796875, "learning_rate": 4.926995607588646e-05, "loss": 3.0542, "step": 760 }, { "epoch": 0.5090939499101058, "grad_norm": 3.628431558609009, "learning_rate": 4.9265276880714696e-05, "loss": 2.9673, "step": 761 }, { "epoch": 0.5097629301333779, "grad_norm": 4.540585517883301, "learning_rate": 4.9260582961450644e-05, "loss": 2.9251, "step": 762 }, { "epoch": 0.51043191035665, "grad_norm": 5.215747833251953, "learning_rate": 4.9255874320942565e-05, "loss": 2.8115, "step": 763 }, { "epoch": 0.5111008905799223, "grad_norm": 4.2205424308776855, "learning_rate": 4.925115096204765e-05, "loss": 3.0809, "step": 764 }, { "epoch": 0.5117698708031944, "grad_norm": 4.905090808868408, "learning_rate": 4.924641288763202e-05, "loss": 2.8316, "step": 765 }, { "epoch": 0.5124388510264666, "grad_norm": 4.075945854187012, "learning_rate": 4.924166010057072e-05, "loss": 3.1006, "step": 766 }, { "epoch": 0.5131078312497387, "grad_norm": 6.7424116134643555, "learning_rate": 4.9236892603747725e-05, "loss": 3.0465, "step": 767 }, { "epoch": 0.5137768114730108, "grad_norm": 3.8477931022644043, "learning_rate": 4.9232110400055944e-05, "loss": 2.8483, "step": 768 }, { "epoch": 0.514445791696283, "grad_norm": 4.381292819976807, "learning_rate": 4.9227313492397184e-05, "loss": 3.0262, "step": 769 }, { "epoch": 0.5151147719195551, "grad_norm": 3.9413158893585205, "learning_rate": 4.9222501883682214e-05, "loss": 2.927, "step": 770 }, { "epoch": 0.5157837521428272, "grad_norm": 3.5212087631225586, "learning_rate": 4.921767557683069e-05, "loss": 2.7958, "step": 771 }, { "epoch": 0.5164527323660995, "grad_norm": 4.910702228546143, "learning_rate": 4.921283457477121e-05, "loss": 3.0121, "step": 772 }, { "epoch": 0.5171217125893716, "grad_norm": 3.43432354927063, "learning_rate": 4.9207978880441275e-05, "loss": 2.9094, "step": 773 }, { "epoch": 0.5177906928126437, "grad_norm": 4.514383792877197, "learning_rate": 4.9203108496787295e-05, "loss": 2.9786, "step": 774 }, { "epoch": 0.5184596730359159, "grad_norm": 3.666508913040161, "learning_rate": 4.919822342676461e-05, "loss": 2.9302, "step": 775 }, { "epoch": 0.519128653259188, "grad_norm": 3.301198720932007, "learning_rate": 4.9193323673337476e-05, "loss": 3.1581, "step": 776 }, { "epoch": 0.5197976334824602, "grad_norm": 5.317511558532715, "learning_rate": 4.9188409239479026e-05, "loss": 3.1275, "step": 777 }, { "epoch": 0.5204666137057323, "grad_norm": 4.472630023956299, "learning_rate": 4.9183480128171345e-05, "loss": 3.1681, "step": 778 }, { "epoch": 0.5211355939290044, "grad_norm": 3.865774393081665, "learning_rate": 4.917853634240538e-05, "loss": 2.9445, "step": 779 }, { "epoch": 0.5218045741522767, "grad_norm": 2.5969860553741455, "learning_rate": 4.9173577885181024e-05, "loss": 2.7801, "step": 780 }, { "epoch": 0.5224735543755488, "grad_norm": 3.0315206050872803, "learning_rate": 4.916860475950704e-05, "loss": 2.7403, "step": 781 }, { "epoch": 0.5231425345988209, "grad_norm": 4.092448711395264, "learning_rate": 4.91636169684011e-05, "loss": 2.8117, "step": 782 }, { "epoch": 0.5238115148220931, "grad_norm": 4.825198173522949, "learning_rate": 4.9158614514889806e-05, "loss": 2.9907, "step": 783 }, { "epoch": 0.5244804950453652, "grad_norm": 4.413206577301025, "learning_rate": 4.915359740200861e-05, "loss": 3.1183, "step": 784 }, { "epoch": 0.5251494752686374, "grad_norm": 5.487285614013672, "learning_rate": 4.914856563280187e-05, "loss": 3.0364, "step": 785 }, { "epoch": 0.5258184554919095, "grad_norm": 3.3987667560577393, "learning_rate": 4.9143519210322875e-05, "loss": 2.7744, "step": 786 }, { "epoch": 0.5264874357151816, "grad_norm": 3.7231040000915527, "learning_rate": 4.9138458137633756e-05, "loss": 3.0891, "step": 787 }, { "epoch": 0.5271564159384539, "grad_norm": 10.888092041015625, "learning_rate": 4.913338241780557e-05, "loss": 2.7896, "step": 788 }, { "epoch": 0.527825396161726, "grad_norm": 4.869259357452393, "learning_rate": 4.9128292053918235e-05, "loss": 2.8969, "step": 789 }, { "epoch": 0.5284943763849981, "grad_norm": 5.367812156677246, "learning_rate": 4.9123187049060584e-05, "loss": 3.0896, "step": 790 }, { "epoch": 0.5291633566082703, "grad_norm": 2.8876943588256836, "learning_rate": 4.911806740633029e-05, "loss": 2.9287, "step": 791 }, { "epoch": 0.5298323368315424, "grad_norm": 4.382396697998047, "learning_rate": 4.9112933128833974e-05, "loss": 2.9172, "step": 792 }, { "epoch": 0.5305013170548145, "grad_norm": 5.258941173553467, "learning_rate": 4.9107784219687055e-05, "loss": 2.9639, "step": 793 }, { "epoch": 0.5311702972780867, "grad_norm": 4.112799167633057, "learning_rate": 4.9102620682013915e-05, "loss": 3.0852, "step": 794 }, { "epoch": 0.5318392775013588, "grad_norm": 3.10772442817688, "learning_rate": 4.909744251894775e-05, "loss": 2.8169, "step": 795 }, { "epoch": 0.5325082577246311, "grad_norm": 5.637611389160156, "learning_rate": 4.9092249733630656e-05, "loss": 3.0063, "step": 796 }, { "epoch": 0.5331772379479032, "grad_norm": 4.501651287078857, "learning_rate": 4.9087042329213606e-05, "loss": 2.6273, "step": 797 }, { "epoch": 0.5338462181711753, "grad_norm": 5.757697582244873, "learning_rate": 4.9081820308856425e-05, "loss": 3.2429, "step": 798 }, { "epoch": 0.5345151983944475, "grad_norm": 3.935551404953003, "learning_rate": 4.907658367572783e-05, "loss": 2.7728, "step": 799 }, { "epoch": 0.5351841786177196, "grad_norm": 3.3114700317382812, "learning_rate": 4.907133243300538e-05, "loss": 2.8407, "step": 800 }, { "epoch": 0.5358531588409917, "grad_norm": 9.505463600158691, "learning_rate": 4.906606658387551e-05, "loss": 3.1491, "step": 801 }, { "epoch": 0.5365221390642639, "grad_norm": 8.680765151977539, "learning_rate": 4.906078613153354e-05, "loss": 3.1871, "step": 802 }, { "epoch": 0.537191119287536, "grad_norm": 5.7888031005859375, "learning_rate": 4.905549107918362e-05, "loss": 3.0384, "step": 803 }, { "epoch": 0.5378600995108083, "grad_norm": 6.471538066864014, "learning_rate": 4.905018143003878e-05, "loss": 2.8996, "step": 804 }, { "epoch": 0.5385290797340804, "grad_norm": 3.8376762866973877, "learning_rate": 4.904485718732088e-05, "loss": 3.2006, "step": 805 }, { "epoch": 0.5391980599573525, "grad_norm": 7.126554012298584, "learning_rate": 4.9039518354260674e-05, "loss": 3.0624, "step": 806 }, { "epoch": 0.5398670401806247, "grad_norm": 3.2664778232574463, "learning_rate": 4.903416493409772e-05, "loss": 2.6461, "step": 807 }, { "epoch": 0.5405360204038968, "grad_norm": 4.890481948852539, "learning_rate": 4.902879693008049e-05, "loss": 2.9073, "step": 808 }, { "epoch": 0.5412050006271689, "grad_norm": 4.069634437561035, "learning_rate": 4.902341434546626e-05, "loss": 2.806, "step": 809 }, { "epoch": 0.5418739808504411, "grad_norm": 3.5017058849334717, "learning_rate": 4.901801718352115e-05, "loss": 2.8788, "step": 810 }, { "epoch": 0.5425429610737132, "grad_norm": 7.153815269470215, "learning_rate": 4.901260544752015e-05, "loss": 3.0846, "step": 811 }, { "epoch": 0.5432119412969854, "grad_norm": 5.80309534072876, "learning_rate": 4.90071791407471e-05, "loss": 3.0031, "step": 812 }, { "epoch": 0.5438809215202576, "grad_norm": 5.59669828414917, "learning_rate": 4.900173826649464e-05, "loss": 3.1324, "step": 813 }, { "epoch": 0.5445499017435297, "grad_norm": 5.2630486488342285, "learning_rate": 4.899628282806428e-05, "loss": 3.0937, "step": 814 }, { "epoch": 0.5452188819668019, "grad_norm": 7.03258752822876, "learning_rate": 4.8990812828766375e-05, "loss": 2.9336, "step": 815 }, { "epoch": 0.545887862190074, "grad_norm": 5.068579196929932, "learning_rate": 4.8985328271920104e-05, "loss": 3.3006, "step": 816 }, { "epoch": 0.5465568424133461, "grad_norm": 4.967462062835693, "learning_rate": 4.897982916085346e-05, "loss": 2.7347, "step": 817 }, { "epoch": 0.5472258226366183, "grad_norm": 4.353154182434082, "learning_rate": 4.897431549890331e-05, "loss": 2.9929, "step": 818 }, { "epoch": 0.5478948028598905, "grad_norm": 4.147519588470459, "learning_rate": 4.896878728941531e-05, "loss": 2.8716, "step": 819 }, { "epoch": 0.5485637830831626, "grad_norm": 3.507474422454834, "learning_rate": 4.8963244535743954e-05, "loss": 2.7667, "step": 820 }, { "epoch": 0.5492327633064348, "grad_norm": 3.999964952468872, "learning_rate": 4.895768724125259e-05, "loss": 2.8255, "step": 821 }, { "epoch": 0.5499017435297069, "grad_norm": 4.473382949829102, "learning_rate": 4.895211540931335e-05, "loss": 3.0781, "step": 822 }, { "epoch": 0.5505707237529791, "grad_norm": 4.881600379943848, "learning_rate": 4.894652904330721e-05, "loss": 2.8138, "step": 823 }, { "epoch": 0.5512397039762512, "grad_norm": 4.237582206726074, "learning_rate": 4.894092814662395e-05, "loss": 2.9103, "step": 824 }, { "epoch": 0.5519086841995233, "grad_norm": 18.73539924621582, "learning_rate": 4.893531272266218e-05, "loss": 2.9088, "step": 825 }, { "epoch": 0.5525776644227955, "grad_norm": 5.194465637207031, "learning_rate": 4.8929682774829336e-05, "loss": 3.065, "step": 826 }, { "epoch": 0.5532466446460677, "grad_norm": 4.28487491607666, "learning_rate": 4.892403830654163e-05, "loss": 3.0793, "step": 827 }, { "epoch": 0.5539156248693398, "grad_norm": 3.6678833961486816, "learning_rate": 4.891837932122412e-05, "loss": 2.8483, "step": 828 }, { "epoch": 0.554584605092612, "grad_norm": 4.224690914154053, "learning_rate": 4.8912705822310655e-05, "loss": 2.8658, "step": 829 }, { "epoch": 0.5552535853158841, "grad_norm": 4.132622241973877, "learning_rate": 4.89070178132439e-05, "loss": 2.7489, "step": 830 }, { "epoch": 0.5559225655391562, "grad_norm": 4.749293327331543, "learning_rate": 4.8901315297475315e-05, "loss": 2.8354, "step": 831 }, { "epoch": 0.5565915457624284, "grad_norm": 3.7333221435546875, "learning_rate": 4.889559827846518e-05, "loss": 2.8272, "step": 832 }, { "epoch": 0.5572605259857005, "grad_norm": 3.7024402618408203, "learning_rate": 4.8889866759682554e-05, "loss": 2.7571, "step": 833 }, { "epoch": 0.5579295062089727, "grad_norm": 3.952960968017578, "learning_rate": 4.88841207446053e-05, "loss": 2.8217, "step": 834 }, { "epoch": 0.5585984864322449, "grad_norm": 3.772315502166748, "learning_rate": 4.88783602367201e-05, "loss": 3.0521, "step": 835 }, { "epoch": 0.559267466655517, "grad_norm": 6.524354457855225, "learning_rate": 4.887258523952239e-05, "loss": 3.124, "step": 836 }, { "epoch": 0.5599364468787892, "grad_norm": 4.3125457763671875, "learning_rate": 4.886679575651643e-05, "loss": 3.1228, "step": 837 }, { "epoch": 0.5606054271020613, "grad_norm": 4.666600227355957, "learning_rate": 4.886099179121526e-05, "loss": 2.9512, "step": 838 }, { "epoch": 0.5612744073253334, "grad_norm": 4.160144329071045, "learning_rate": 4.885517334714072e-05, "loss": 3.0934, "step": 839 }, { "epoch": 0.5619433875486056, "grad_norm": 3.479520320892334, "learning_rate": 4.884934042782339e-05, "loss": 2.7539, "step": 840 }, { "epoch": 0.5626123677718777, "grad_norm": 3.340153694152832, "learning_rate": 4.8843493036802696e-05, "loss": 2.7626, "step": 841 }, { "epoch": 0.5632813479951498, "grad_norm": 3.464170217514038, "learning_rate": 4.8837631177626807e-05, "loss": 2.9802, "step": 842 }, { "epoch": 0.5639503282184221, "grad_norm": 4.128434181213379, "learning_rate": 4.883175485385268e-05, "loss": 3.3324, "step": 843 }, { "epoch": 0.5646193084416942, "grad_norm": 3.9891414642333984, "learning_rate": 4.8825864069046044e-05, "loss": 2.9105, "step": 844 }, { "epoch": 0.5652882886649664, "grad_norm": 5.570315837860107, "learning_rate": 4.881995882678142e-05, "loss": 2.8328, "step": 845 }, { "epoch": 0.5659572688882385, "grad_norm": 5.166423797607422, "learning_rate": 4.881403913064208e-05, "loss": 2.973, "step": 846 }, { "epoch": 0.5666262491115106, "grad_norm": 3.9031732082366943, "learning_rate": 4.880810498422009e-05, "loss": 3.0236, "step": 847 }, { "epoch": 0.5672952293347828, "grad_norm": 4.1895012855529785, "learning_rate": 4.880215639111626e-05, "loss": 2.9534, "step": 848 }, { "epoch": 0.5679642095580549, "grad_norm": 8.483187675476074, "learning_rate": 4.879619335494017e-05, "loss": 2.8898, "step": 849 }, { "epoch": 0.568633189781327, "grad_norm": 4.321814060211182, "learning_rate": 4.879021587931019e-05, "loss": 3.1692, "step": 850 }, { "epoch": 0.5693021700045993, "grad_norm": 3.7368454933166504, "learning_rate": 4.878422396785342e-05, "loss": 2.8991, "step": 851 }, { "epoch": 0.5699711502278714, "grad_norm": 7.553808212280273, "learning_rate": 4.877821762420574e-05, "loss": 3.1544, "step": 852 }, { "epoch": 0.5706401304511436, "grad_norm": 4.322549819946289, "learning_rate": 4.877219685201176e-05, "loss": 2.9816, "step": 853 }, { "epoch": 0.5713091106744157, "grad_norm": 3.573345184326172, "learning_rate": 4.87661616549249e-05, "loss": 2.8022, "step": 854 }, { "epoch": 0.5719780908976878, "grad_norm": 6.072822093963623, "learning_rate": 4.876011203660727e-05, "loss": 3.0581, "step": 855 }, { "epoch": 0.57264707112096, "grad_norm": 3.929838180541992, "learning_rate": 4.875404800072977e-05, "loss": 2.9111, "step": 856 }, { "epoch": 0.5733160513442321, "grad_norm": 4.304371356964111, "learning_rate": 4.874796955097204e-05, "loss": 3.0608, "step": 857 }, { "epoch": 0.5739850315675042, "grad_norm": 4.513864994049072, "learning_rate": 4.874187669102246e-05, "loss": 3.066, "step": 858 }, { "epoch": 0.5746540117907765, "grad_norm": 3.6599647998809814, "learning_rate": 4.873576942457815e-05, "loss": 3.075, "step": 859 }, { "epoch": 0.5753229920140486, "grad_norm": 3.387683391571045, "learning_rate": 4.8729647755344995e-05, "loss": 2.7858, "step": 860 }, { "epoch": 0.5759919722373207, "grad_norm": 4.5267181396484375, "learning_rate": 4.872351168703759e-05, "loss": 2.924, "step": 861 }, { "epoch": 0.5766609524605929, "grad_norm": 4.898892879486084, "learning_rate": 4.87173612233793e-05, "loss": 3.2621, "step": 862 }, { "epoch": 0.577329932683865, "grad_norm": 5.0480732917785645, "learning_rate": 4.871119636810219e-05, "loss": 3.143, "step": 863 }, { "epoch": 0.5779989129071372, "grad_norm": 4.6042304039001465, "learning_rate": 4.870501712494708e-05, "loss": 2.876, "step": 864 }, { "epoch": 0.5786678931304093, "grad_norm": 3.421410083770752, "learning_rate": 4.8698823497663513e-05, "loss": 2.947, "step": 865 }, { "epoch": 0.5793368733536814, "grad_norm": 4.7167534828186035, "learning_rate": 4.869261549000976e-05, "loss": 2.9044, "step": 866 }, { "epoch": 0.5800058535769537, "grad_norm": 2.3363020420074463, "learning_rate": 4.868639310575283e-05, "loss": 2.8712, "step": 867 }, { "epoch": 0.5806748338002258, "grad_norm": 5.71278715133667, "learning_rate": 4.8680156348668436e-05, "loss": 2.9753, "step": 868 }, { "epoch": 0.5813438140234979, "grad_norm": 4.479608058929443, "learning_rate": 4.867390522254103e-05, "loss": 2.9986, "step": 869 }, { "epoch": 0.5820127942467701, "grad_norm": 4.775486469268799, "learning_rate": 4.8667639731163775e-05, "loss": 2.8507, "step": 870 }, { "epoch": 0.5826817744700422, "grad_norm": 5.254409313201904, "learning_rate": 4.866135987833854e-05, "loss": 2.9328, "step": 871 }, { "epoch": 0.5833507546933144, "grad_norm": 5.818423748016357, "learning_rate": 4.865506566787593e-05, "loss": 2.7693, "step": 872 }, { "epoch": 0.5840197349165865, "grad_norm": 3.793675422668457, "learning_rate": 4.864875710359524e-05, "loss": 2.8004, "step": 873 }, { "epoch": 0.5846887151398587, "grad_norm": 3.916386365890503, "learning_rate": 4.864243418932451e-05, "loss": 2.9722, "step": 874 }, { "epoch": 0.5853576953631309, "grad_norm": 8.37555980682373, "learning_rate": 4.8636096928900446e-05, "loss": 2.7889, "step": 875 }, { "epoch": 0.586026675586403, "grad_norm": 4.431842803955078, "learning_rate": 4.862974532616848e-05, "loss": 2.683, "step": 876 }, { "epoch": 0.5866956558096751, "grad_norm": 4.644500732421875, "learning_rate": 4.862337938498274e-05, "loss": 2.8752, "step": 877 }, { "epoch": 0.5873646360329473, "grad_norm": 4.099252700805664, "learning_rate": 4.8616999109206063e-05, "loss": 2.9416, "step": 878 }, { "epoch": 0.5880336162562194, "grad_norm": 5.259937286376953, "learning_rate": 4.8610604502709984e-05, "loss": 3.0663, "step": 879 }, { "epoch": 0.5887025964794915, "grad_norm": 3.8882627487182617, "learning_rate": 4.8604195569374725e-05, "loss": 2.8836, "step": 880 }, { "epoch": 0.5893715767027637, "grad_norm": 5.224497318267822, "learning_rate": 4.859777231308921e-05, "loss": 3.2656, "step": 881 }, { "epoch": 0.5900405569260359, "grad_norm": 4.912106513977051, "learning_rate": 4.859133473775105e-05, "loss": 3.0812, "step": 882 }, { "epoch": 0.5907095371493081, "grad_norm": 3.9396631717681885, "learning_rate": 4.858488284726654e-05, "loss": 2.7061, "step": 883 }, { "epoch": 0.5913785173725802, "grad_norm": 3.4575467109680176, "learning_rate": 4.857841664555067e-05, "loss": 3.0659, "step": 884 }, { "epoch": 0.5920474975958523, "grad_norm": 4.488811016082764, "learning_rate": 4.857193613652711e-05, "loss": 2.8396, "step": 885 }, { "epoch": 0.5927164778191245, "grad_norm": 6.178808212280273, "learning_rate": 4.856544132412821e-05, "loss": 2.9394, "step": 886 }, { "epoch": 0.5933854580423966, "grad_norm": 4.329935073852539, "learning_rate": 4.8558932212295006e-05, "loss": 3.1392, "step": 887 }, { "epoch": 0.5940544382656687, "grad_norm": 4.575646877288818, "learning_rate": 4.85524088049772e-05, "loss": 2.9275, "step": 888 }, { "epoch": 0.594723418488941, "grad_norm": 4.456279277801514, "learning_rate": 4.854587110613318e-05, "loss": 3.0891, "step": 889 }, { "epoch": 0.5953923987122131, "grad_norm": 4.95554780960083, "learning_rate": 4.853931911973e-05, "loss": 2.9593, "step": 890 }, { "epoch": 0.5960613789354852, "grad_norm": 4.824105262756348, "learning_rate": 4.8532752849743384e-05, "loss": 3.1116, "step": 891 }, { "epoch": 0.5967303591587574, "grad_norm": 3.0384416580200195, "learning_rate": 4.8526172300157726e-05, "loss": 2.9218, "step": 892 }, { "epoch": 0.5973993393820295, "grad_norm": 5.79701566696167, "learning_rate": 4.8519577474966074e-05, "loss": 3.0881, "step": 893 }, { "epoch": 0.5980683196053017, "grad_norm": 3.513871431350708, "learning_rate": 4.851296837817015e-05, "loss": 2.7299, "step": 894 }, { "epoch": 0.5987372998285738, "grad_norm": 3.8924996852874756, "learning_rate": 4.850634501378034e-05, "loss": 2.9338, "step": 895 }, { "epoch": 0.5994062800518459, "grad_norm": 5.226293087005615, "learning_rate": 4.849970738581568e-05, "loss": 3.0033, "step": 896 }, { "epoch": 0.6000752602751181, "grad_norm": 4.557132244110107, "learning_rate": 4.8493055498303854e-05, "loss": 2.8244, "step": 897 }, { "epoch": 0.6007442404983903, "grad_norm": 5.492852687835693, "learning_rate": 4.84863893552812e-05, "loss": 2.8643, "step": 898 }, { "epoch": 0.6014132207216624, "grad_norm": 6.306492805480957, "learning_rate": 4.847970896079272e-05, "loss": 2.9746, "step": 899 }, { "epoch": 0.6020822009449346, "grad_norm": 3.952476739883423, "learning_rate": 4.8473014318892075e-05, "loss": 3.0116, "step": 900 }, { "epoch": 0.6027511811682067, "grad_norm": 4.202814102172852, "learning_rate": 4.846630543364152e-05, "loss": 3.0347, "step": 901 }, { "epoch": 0.6034201613914789, "grad_norm": 5.378247261047363, "learning_rate": 4.8459582309112e-05, "loss": 3.2602, "step": 902 }, { "epoch": 0.604089141614751, "grad_norm": 4.968826770782471, "learning_rate": 4.8452844949383094e-05, "loss": 2.8037, "step": 903 }, { "epoch": 0.6047581218380231, "grad_norm": 5.133361339569092, "learning_rate": 4.8446093358542986e-05, "loss": 2.8434, "step": 904 }, { "epoch": 0.6054271020612954, "grad_norm": 4.359605312347412, "learning_rate": 4.843932754068854e-05, "loss": 2.7, "step": 905 }, { "epoch": 0.6060960822845675, "grad_norm": 4.303665637969971, "learning_rate": 4.843254749992523e-05, "loss": 2.6906, "step": 906 }, { "epoch": 0.6067650625078396, "grad_norm": 4.343417644500732, "learning_rate": 4.8425753240367165e-05, "loss": 2.8728, "step": 907 }, { "epoch": 0.6074340427311118, "grad_norm": 4.789746284484863, "learning_rate": 4.841894476613707e-05, "loss": 2.9626, "step": 908 }, { "epoch": 0.6081030229543839, "grad_norm": 5.736357688903809, "learning_rate": 4.841212208136631e-05, "loss": 2.8692, "step": 909 }, { "epoch": 0.608772003177656, "grad_norm": 3.2540862560272217, "learning_rate": 4.840528519019487e-05, "loss": 2.7127, "step": 910 }, { "epoch": 0.6094409834009282, "grad_norm": 5.809683322906494, "learning_rate": 4.839843409677135e-05, "loss": 3.1236, "step": 911 }, { "epoch": 0.6101099636242003, "grad_norm": 6.905038356781006, "learning_rate": 4.839156880525297e-05, "loss": 2.9768, "step": 912 }, { "epoch": 0.6107789438474726, "grad_norm": 5.895445346832275, "learning_rate": 4.8384689319805584e-05, "loss": 2.9639, "step": 913 }, { "epoch": 0.6114479240707447, "grad_norm": 5.081824779510498, "learning_rate": 4.8377795644603615e-05, "loss": 3.1095, "step": 914 }, { "epoch": 0.6121169042940168, "grad_norm": 2.9040255546569824, "learning_rate": 4.837088778383015e-05, "loss": 2.6119, "step": 915 }, { "epoch": 0.612785884517289, "grad_norm": 4.30228328704834, "learning_rate": 4.836396574167684e-05, "loss": 2.7013, "step": 916 }, { "epoch": 0.6134548647405611, "grad_norm": 3.297487735748291, "learning_rate": 4.835702952234395e-05, "loss": 2.6564, "step": 917 }, { "epoch": 0.6141238449638332, "grad_norm": 5.532101154327393, "learning_rate": 4.835007913004038e-05, "loss": 2.9093, "step": 918 }, { "epoch": 0.6147928251871054, "grad_norm": 8.50019359588623, "learning_rate": 4.8343114568983594e-05, "loss": 2.9708, "step": 919 }, { "epoch": 0.6154618054103775, "grad_norm": 6.267725944519043, "learning_rate": 4.833613584339965e-05, "loss": 2.9291, "step": 920 }, { "epoch": 0.6161307856336498, "grad_norm": 6.294321537017822, "learning_rate": 4.8329142957523245e-05, "loss": 2.545, "step": 921 }, { "epoch": 0.6167997658569219, "grad_norm": 3.558936834335327, "learning_rate": 4.832213591559762e-05, "loss": 2.8664, "step": 922 }, { "epoch": 0.617468746080194, "grad_norm": 5.082416534423828, "learning_rate": 4.831511472187463e-05, "loss": 2.9146, "step": 923 }, { "epoch": 0.6181377263034662, "grad_norm": 5.50706672668457, "learning_rate": 4.830807938061471e-05, "loss": 3.0, "step": 924 }, { "epoch": 0.6188067065267383, "grad_norm": 3.523247003555298, "learning_rate": 4.83010298960869e-05, "loss": 2.9033, "step": 925 }, { "epoch": 0.6194756867500104, "grad_norm": 5.927065849304199, "learning_rate": 4.829396627256878e-05, "loss": 2.9559, "step": 926 }, { "epoch": 0.6201446669732826, "grad_norm": 6.5479865074157715, "learning_rate": 4.828688851434655e-05, "loss": 2.9306, "step": 927 }, { "epoch": 0.6208136471965547, "grad_norm": 5.179655075073242, "learning_rate": 4.8279796625714955e-05, "loss": 2.7334, "step": 928 }, { "epoch": 0.6214826274198268, "grad_norm": 5.061421871185303, "learning_rate": 4.8272690610977356e-05, "loss": 2.8804, "step": 929 }, { "epoch": 0.6221516076430991, "grad_norm": 3.9310529232025146, "learning_rate": 4.8265570474445636e-05, "loss": 3.0256, "step": 930 }, { "epoch": 0.6228205878663712, "grad_norm": 5.7879228591918945, "learning_rate": 4.825843622044028e-05, "loss": 3.0422, "step": 931 }, { "epoch": 0.6234895680896434, "grad_norm": 4.052291393280029, "learning_rate": 4.825128785329034e-05, "loss": 2.8619, "step": 932 }, { "epoch": 0.6241585483129155, "grad_norm": 3.4358327388763428, "learning_rate": 4.824412537733341e-05, "loss": 2.8506, "step": 933 }, { "epoch": 0.6248275285361876, "grad_norm": 3.948198080062866, "learning_rate": 4.823694879691565e-05, "loss": 2.8697, "step": 934 }, { "epoch": 0.6254965087594598, "grad_norm": 4.091851234436035, "learning_rate": 4.822975811639181e-05, "loss": 3.051, "step": 935 }, { "epoch": 0.6261654889827319, "grad_norm": 4.4150896072387695, "learning_rate": 4.822255334012515e-05, "loss": 3.0754, "step": 936 }, { "epoch": 0.626834469206004, "grad_norm": 4.241657257080078, "learning_rate": 4.821533447248752e-05, "loss": 2.9059, "step": 937 }, { "epoch": 0.6275034494292763, "grad_norm": 7.57624626159668, "learning_rate": 4.8208101517859294e-05, "loss": 2.7324, "step": 938 }, { "epoch": 0.6281724296525484, "grad_norm": 5.501160621643066, "learning_rate": 4.820085448062942e-05, "loss": 2.9821, "step": 939 }, { "epoch": 0.6288414098758206, "grad_norm": 6.796340465545654, "learning_rate": 4.819359336519536e-05, "loss": 2.9937, "step": 940 }, { "epoch": 0.6295103900990927, "grad_norm": 5.409112453460693, "learning_rate": 4.8186318175963145e-05, "loss": 2.7989, "step": 941 }, { "epoch": 0.6301793703223648, "grad_norm": 6.310184955596924, "learning_rate": 4.817902891734734e-05, "loss": 3.2403, "step": 942 }, { "epoch": 0.630848350545637, "grad_norm": 4.645697593688965, "learning_rate": 4.817172559377103e-05, "loss": 2.8919, "step": 943 }, { "epoch": 0.6315173307689091, "grad_norm": 8.987037658691406, "learning_rate": 4.816440820966587e-05, "loss": 2.862, "step": 944 }, { "epoch": 0.6321863109921813, "grad_norm": 4.836058616638184, "learning_rate": 4.8157076769472e-05, "loss": 2.7711, "step": 945 }, { "epoch": 0.6328552912154535, "grad_norm": 6.288712024688721, "learning_rate": 4.814973127763813e-05, "loss": 2.9659, "step": 946 }, { "epoch": 0.6335242714387256, "grad_norm": 3.072624683380127, "learning_rate": 4.814237173862148e-05, "loss": 2.8166, "step": 947 }, { "epoch": 0.6341932516619977, "grad_norm": 5.474343299865723, "learning_rate": 4.81349981568878e-05, "loss": 2.9033, "step": 948 }, { "epoch": 0.6348622318852699, "grad_norm": 3.749812602996826, "learning_rate": 4.812761053691134e-05, "loss": 2.9709, "step": 949 }, { "epoch": 0.635531212108542, "grad_norm": 3.7517666816711426, "learning_rate": 4.81202088831749e-05, "loss": 2.9877, "step": 950 }, { "epoch": 0.6362001923318142, "grad_norm": 4.512857913970947, "learning_rate": 4.811279320016976e-05, "loss": 3.0786, "step": 951 }, { "epoch": 0.6368691725550863, "grad_norm": 4.326724529266357, "learning_rate": 4.810536349239576e-05, "loss": 2.8043, "step": 952 }, { "epoch": 0.6375381527783585, "grad_norm": 3.670686960220337, "learning_rate": 4.8097919764361194e-05, "loss": 3.0172, "step": 953 }, { "epoch": 0.6382071330016307, "grad_norm": 3.1971354484558105, "learning_rate": 4.809046202058291e-05, "loss": 2.9217, "step": 954 }, { "epoch": 0.6388761132249028, "grad_norm": 4.29288387298584, "learning_rate": 4.8082990265586245e-05, "loss": 2.9419, "step": 955 }, { "epoch": 0.6395450934481749, "grad_norm": 6.09660005569458, "learning_rate": 4.8075504503905025e-05, "loss": 2.7751, "step": 956 }, { "epoch": 0.6402140736714471, "grad_norm": 3.5380003452301025, "learning_rate": 4.80680047400816e-05, "loss": 2.9738, "step": 957 }, { "epoch": 0.6408830538947192, "grad_norm": 5.08881139755249, "learning_rate": 4.8060490978666784e-05, "loss": 2.9587, "step": 958 }, { "epoch": 0.6415520341179913, "grad_norm": 4.936773300170898, "learning_rate": 4.8052963224219915e-05, "loss": 3.017, "step": 959 }, { "epoch": 0.6422210143412636, "grad_norm": 5.949796199798584, "learning_rate": 4.804542148130881e-05, "loss": 3.1539, "step": 960 }, { "epoch": 0.6428899945645357, "grad_norm": 5.088866233825684, "learning_rate": 4.803786575450978e-05, "loss": 2.9623, "step": 961 }, { "epoch": 0.6435589747878079, "grad_norm": 4.024708271026611, "learning_rate": 4.8030296048407596e-05, "loss": 3.0045, "step": 962 }, { "epoch": 0.64422795501108, "grad_norm": 6.3203630447387695, "learning_rate": 4.802271236759556e-05, "loss": 3.103, "step": 963 }, { "epoch": 0.6448969352343521, "grad_norm": 4.5583367347717285, "learning_rate": 4.8015114716675395e-05, "loss": 2.9665, "step": 964 }, { "epoch": 0.6455659154576243, "grad_norm": 4.08353853225708, "learning_rate": 4.800750310025735e-05, "loss": 2.9864, "step": 965 }, { "epoch": 0.6462348956808964, "grad_norm": 2.692695140838623, "learning_rate": 4.799987752296013e-05, "loss": 2.8868, "step": 966 }, { "epoch": 0.6469038759041685, "grad_norm": 6.414584159851074, "learning_rate": 4.7992237989410904e-05, "loss": 3.0365, "step": 967 }, { "epoch": 0.6475728561274408, "grad_norm": 4.764004707336426, "learning_rate": 4.7984584504245325e-05, "loss": 3.1698, "step": 968 }, { "epoch": 0.6482418363507129, "grad_norm": 3.422572612762451, "learning_rate": 4.7976917072107486e-05, "loss": 2.7567, "step": 969 }, { "epoch": 0.6489108165739851, "grad_norm": 4.826651096343994, "learning_rate": 4.796923569764998e-05, "loss": 2.7321, "step": 970 }, { "epoch": 0.6495797967972572, "grad_norm": 5.226298809051514, "learning_rate": 4.796154038553382e-05, "loss": 3.1136, "step": 971 }, { "epoch": 0.6502487770205293, "grad_norm": 3.5223143100738525, "learning_rate": 4.795383114042852e-05, "loss": 2.74, "step": 972 }, { "epoch": 0.6509177572438015, "grad_norm": 5.651745319366455, "learning_rate": 4.794610796701201e-05, "loss": 3.0947, "step": 973 }, { "epoch": 0.6515867374670736, "grad_norm": 4.480429172515869, "learning_rate": 4.7938370869970694e-05, "loss": 2.9169, "step": 974 }, { "epoch": 0.6522557176903457, "grad_norm": 5.427626132965088, "learning_rate": 4.793061985399942e-05, "loss": 2.9253, "step": 975 }, { "epoch": 0.652924697913618, "grad_norm": 3.702354669570923, "learning_rate": 4.7922854923801457e-05, "loss": 2.8818, "step": 976 }, { "epoch": 0.6535936781368901, "grad_norm": 9.913829803466797, "learning_rate": 4.7915076084088565e-05, "loss": 3.3618, "step": 977 }, { "epoch": 0.6542626583601622, "grad_norm": 3.803285837173462, "learning_rate": 4.790728333958091e-05, "loss": 2.9537, "step": 978 }, { "epoch": 0.6549316385834344, "grad_norm": 5.172455787658691, "learning_rate": 4.789947669500711e-05, "loss": 2.9666, "step": 979 }, { "epoch": 0.6556006188067065, "grad_norm": 4.230083465576172, "learning_rate": 4.78916561551042e-05, "loss": 2.9084, "step": 980 }, { "epoch": 0.6562695990299787, "grad_norm": 5.195817470550537, "learning_rate": 4.7883821724617674e-05, "loss": 3.0398, "step": 981 }, { "epoch": 0.6569385792532508, "grad_norm": 4.758103370666504, "learning_rate": 4.7875973408301424e-05, "loss": 2.8131, "step": 982 }, { "epoch": 0.6576075594765229, "grad_norm": 7.711447715759277, "learning_rate": 4.786811121091779e-05, "loss": 2.9391, "step": 983 }, { "epoch": 0.6582765396997952, "grad_norm": 4.11110258102417, "learning_rate": 4.786023513723753e-05, "loss": 2.6406, "step": 984 }, { "epoch": 0.6589455199230673, "grad_norm": 8.119991302490234, "learning_rate": 4.785234519203982e-05, "loss": 3.0358, "step": 985 }, { "epoch": 0.6596145001463394, "grad_norm": 4.686762809753418, "learning_rate": 4.7844441380112247e-05, "loss": 3.0016, "step": 986 }, { "epoch": 0.6602834803696116, "grad_norm": 5.753101348876953, "learning_rate": 4.7836523706250825e-05, "loss": 2.9268, "step": 987 }, { "epoch": 0.6609524605928837, "grad_norm": 7.028621196746826, "learning_rate": 4.7828592175259976e-05, "loss": 3.234, "step": 988 }, { "epoch": 0.6616214408161559, "grad_norm": 5.211911678314209, "learning_rate": 4.782064679195253e-05, "loss": 3.0338, "step": 989 }, { "epoch": 0.662290421039428, "grad_norm": 5.439329147338867, "learning_rate": 4.78126875611497e-05, "loss": 2.9931, "step": 990 }, { "epoch": 0.6629594012627001, "grad_norm": 5.949470043182373, "learning_rate": 4.780471448768115e-05, "loss": 2.9316, "step": 991 }, { "epoch": 0.6636283814859724, "grad_norm": 4.6272783279418945, "learning_rate": 4.7796727576384884e-05, "loss": 3.0215, "step": 992 }, { "epoch": 0.6642973617092445, "grad_norm": 4.166497230529785, "learning_rate": 4.778872683210736e-05, "loss": 2.7412, "step": 993 }, { "epoch": 0.6649663419325166, "grad_norm": 5.167282581329346, "learning_rate": 4.77807122597034e-05, "loss": 2.8083, "step": 994 }, { "epoch": 0.6656353221557888, "grad_norm": 4.663416385650635, "learning_rate": 4.77726838640362e-05, "loss": 2.8814, "step": 995 }, { "epoch": 0.6663043023790609, "grad_norm": 4.006105422973633, "learning_rate": 4.776464164997739e-05, "loss": 3.0979, "step": 996 }, { "epoch": 0.666973282602333, "grad_norm": 7.690087795257568, "learning_rate": 4.775658562240696e-05, "loss": 2.8149, "step": 997 }, { "epoch": 0.6676422628256052, "grad_norm": 4.8195295333862305, "learning_rate": 4.7748515786213264e-05, "loss": 2.9307, "step": 998 }, { "epoch": 0.6683112430488773, "grad_norm": 3.997514009475708, "learning_rate": 4.7740432146293055e-05, "loss": 2.9505, "step": 999 }, { "epoch": 0.6689802232721496, "grad_norm": 6.172044277191162, "learning_rate": 4.773233470755147e-05, "loss": 2.9291, "step": 1000 }, { "epoch": 0.6696492034954217, "grad_norm": 6.186550140380859, "learning_rate": 4.7724223474902014e-05, "loss": 3.0619, "step": 1001 }, { "epoch": 0.6703181837186938, "grad_norm": 5.554472923278809, "learning_rate": 4.771609845326654e-05, "loss": 2.8572, "step": 1002 }, { "epoch": 0.670987163941966, "grad_norm": 3.5486297607421875, "learning_rate": 4.7707959647575295e-05, "loss": 2.793, "step": 1003 }, { "epoch": 0.6716561441652381, "grad_norm": 5.987931728363037, "learning_rate": 4.7699807062766876e-05, "loss": 2.8616, "step": 1004 }, { "epoch": 0.6723251243885102, "grad_norm": 5.39992618560791, "learning_rate": 4.769164070378824e-05, "loss": 2.9884, "step": 1005 }, { "epoch": 0.6729941046117824, "grad_norm": 4.178929805755615, "learning_rate": 4.768346057559473e-05, "loss": 2.9624, "step": 1006 }, { "epoch": 0.6736630848350545, "grad_norm": 6.299113750457764, "learning_rate": 4.7675266683149996e-05, "loss": 3.1258, "step": 1007 }, { "epoch": 0.6743320650583268, "grad_norm": 2.810047149658203, "learning_rate": 4.766705903142608e-05, "loss": 2.8287, "step": 1008 }, { "epoch": 0.6750010452815989, "grad_norm": 4.146429061889648, "learning_rate": 4.7658837625403354e-05, "loss": 3.1088, "step": 1009 }, { "epoch": 0.675670025504871, "grad_norm": 4.9735798835754395, "learning_rate": 4.7650602470070536e-05, "loss": 2.8834, "step": 1010 }, { "epoch": 0.6763390057281432, "grad_norm": 5.806169033050537, "learning_rate": 4.7642353570424704e-05, "loss": 3.0712, "step": 1011 }, { "epoch": 0.6770079859514153, "grad_norm": 5.46205472946167, "learning_rate": 4.7634090931471254e-05, "loss": 2.8523, "step": 1012 }, { "epoch": 0.6776769661746874, "grad_norm": 4.201307773590088, "learning_rate": 4.762581455822394e-05, "loss": 2.6202, "step": 1013 }, { "epoch": 0.6783459463979596, "grad_norm": 4.1945271492004395, "learning_rate": 4.761752445570482e-05, "loss": 2.7655, "step": 1014 }, { "epoch": 0.6790149266212318, "grad_norm": 4.354991436004639, "learning_rate": 4.760922062894432e-05, "loss": 3.0846, "step": 1015 }, { "epoch": 0.6796839068445039, "grad_norm": 3.5169050693511963, "learning_rate": 4.760090308298116e-05, "loss": 2.9702, "step": 1016 }, { "epoch": 0.6803528870677761, "grad_norm": 3.669956684112549, "learning_rate": 4.759257182286242e-05, "loss": 2.7531, "step": 1017 }, { "epoch": 0.6810218672910482, "grad_norm": 4.439055919647217, "learning_rate": 4.7584226853643465e-05, "loss": 3.0305, "step": 1018 }, { "epoch": 0.6816908475143204, "grad_norm": 4.583697319030762, "learning_rate": 4.7575868180388e-05, "loss": 2.988, "step": 1019 }, { "epoch": 0.6823598277375925, "grad_norm": 4.089028835296631, "learning_rate": 4.756749580816804e-05, "loss": 2.8122, "step": 1020 }, { "epoch": 0.6830288079608646, "grad_norm": 4.180925369262695, "learning_rate": 4.755910974206392e-05, "loss": 2.9264, "step": 1021 }, { "epoch": 0.6836977881841368, "grad_norm": 4.284313201904297, "learning_rate": 4.755070998716428e-05, "loss": 2.8798, "step": 1022 }, { "epoch": 0.684366768407409, "grad_norm": 6.518707752227783, "learning_rate": 4.7542296548566044e-05, "loss": 2.909, "step": 1023 }, { "epoch": 0.6850357486306811, "grad_norm": 5.645822048187256, "learning_rate": 4.753386943137448e-05, "loss": 3.0253, "step": 1024 }, { "epoch": 0.6857047288539533, "grad_norm": 3.4720051288604736, "learning_rate": 4.752542864070313e-05, "loss": 3.0259, "step": 1025 }, { "epoch": 0.6863737090772254, "grad_norm": 3.090329170227051, "learning_rate": 4.751697418167384e-05, "loss": 2.8428, "step": 1026 }, { "epoch": 0.6870426893004975, "grad_norm": 3.7356035709381104, "learning_rate": 4.750850605941675e-05, "loss": 2.9791, "step": 1027 }, { "epoch": 0.6877116695237697, "grad_norm": 3.315736770629883, "learning_rate": 4.750002427907028e-05, "loss": 2.7206, "step": 1028 }, { "epoch": 0.6883806497470418, "grad_norm": 5.103281497955322, "learning_rate": 4.7491528845781155e-05, "loss": 3.16, "step": 1029 }, { "epoch": 0.689049629970314, "grad_norm": 4.23598575592041, "learning_rate": 4.7483019764704365e-05, "loss": 2.9537, "step": 1030 }, { "epoch": 0.6897186101935862, "grad_norm": 3.8574106693267822, "learning_rate": 4.747449704100322e-05, "loss": 3.0713, "step": 1031 }, { "epoch": 0.6903875904168583, "grad_norm": 4.238114356994629, "learning_rate": 4.746596067984925e-05, "loss": 2.9019, "step": 1032 }, { "epoch": 0.6910565706401305, "grad_norm": 4.508023262023926, "learning_rate": 4.745741068642232e-05, "loss": 3.2339, "step": 1033 }, { "epoch": 0.6917255508634026, "grad_norm": 5.460549831390381, "learning_rate": 4.744884706591052e-05, "loss": 3.0797, "step": 1034 }, { "epoch": 0.6923945310866747, "grad_norm": 6.020919322967529, "learning_rate": 4.744026982351023e-05, "loss": 3.1077, "step": 1035 }, { "epoch": 0.6930635113099469, "grad_norm": 4.980844020843506, "learning_rate": 4.74316789644261e-05, "loss": 2.893, "step": 1036 }, { "epoch": 0.693732491533219, "grad_norm": 5.355983734130859, "learning_rate": 4.742307449387103e-05, "loss": 2.8476, "step": 1037 }, { "epoch": 0.6944014717564913, "grad_norm": 6.6928839683532715, "learning_rate": 4.741445641706618e-05, "loss": 3.0333, "step": 1038 }, { "epoch": 0.6950704519797634, "grad_norm": 5.792276859283447, "learning_rate": 4.740582473924099e-05, "loss": 3.2354, "step": 1039 }, { "epoch": 0.6957394322030355, "grad_norm": 4.4441609382629395, "learning_rate": 4.739717946563311e-05, "loss": 2.7452, "step": 1040 }, { "epoch": 0.6964084124263077, "grad_norm": 5.933568477630615, "learning_rate": 4.738852060148849e-05, "loss": 3.0236, "step": 1041 }, { "epoch": 0.6970773926495798, "grad_norm": 4.327657222747803, "learning_rate": 4.737984815206128e-05, "loss": 2.8741, "step": 1042 }, { "epoch": 0.6977463728728519, "grad_norm": 5.816018104553223, "learning_rate": 4.73711621226139e-05, "loss": 2.8933, "step": 1043 }, { "epoch": 0.6984153530961241, "grad_norm": 4.199925422668457, "learning_rate": 4.736246251841701e-05, "loss": 2.8275, "step": 1044 }, { "epoch": 0.6990843333193962, "grad_norm": 5.037257194519043, "learning_rate": 4.73537493447495e-05, "loss": 2.9567, "step": 1045 }, { "epoch": 0.6997533135426683, "grad_norm": 4.057535171508789, "learning_rate": 4.734502260689849e-05, "loss": 2.8808, "step": 1046 }, { "epoch": 0.7004222937659406, "grad_norm": 5.710482120513916, "learning_rate": 4.7336282310159356e-05, "loss": 2.9466, "step": 1047 }, { "epoch": 0.7010912739892127, "grad_norm": 3.3100647926330566, "learning_rate": 4.7327528459835654e-05, "loss": 2.7895, "step": 1048 }, { "epoch": 0.7017602542124849, "grad_norm": 3.2015061378479004, "learning_rate": 4.7318761061239206e-05, "loss": 2.603, "step": 1049 }, { "epoch": 0.702429234435757, "grad_norm": 4.796367168426514, "learning_rate": 4.730998011969004e-05, "loss": 3.0706, "step": 1050 }, { "epoch": 0.7030982146590291, "grad_norm": 3.3222239017486572, "learning_rate": 4.730118564051642e-05, "loss": 2.9381, "step": 1051 }, { "epoch": 0.7037671948823013, "grad_norm": 5.005463123321533, "learning_rate": 4.7292377629054777e-05, "loss": 2.8974, "step": 1052 }, { "epoch": 0.7044361751055734, "grad_norm": 5.9778218269348145, "learning_rate": 4.728355609064981e-05, "loss": 3.2306, "step": 1053 }, { "epoch": 0.7051051553288455, "grad_norm": 5.388123035430908, "learning_rate": 4.727472103065439e-05, "loss": 3.1614, "step": 1054 }, { "epoch": 0.7057741355521178, "grad_norm": 5.160175800323486, "learning_rate": 4.726587245442959e-05, "loss": 2.8656, "step": 1055 }, { "epoch": 0.7064431157753899, "grad_norm": 3.826104164123535, "learning_rate": 4.725701036734472e-05, "loss": 2.8349, "step": 1056 }, { "epoch": 0.7071120959986621, "grad_norm": 6.139577388763428, "learning_rate": 4.7248134774777255e-05, "loss": 3.0752, "step": 1057 }, { "epoch": 0.7077810762219342, "grad_norm": 5.152160167694092, "learning_rate": 4.723924568211288e-05, "loss": 3.0287, "step": 1058 }, { "epoch": 0.7084500564452063, "grad_norm": 3.847810745239258, "learning_rate": 4.723034309474546e-05, "loss": 3.0277, "step": 1059 }, { "epoch": 0.7091190366684785, "grad_norm": 4.554903984069824, "learning_rate": 4.722142701807706e-05, "loss": 2.7659, "step": 1060 }, { "epoch": 0.7097880168917506, "grad_norm": 3.8796348571777344, "learning_rate": 4.721249745751794e-05, "loss": 2.8684, "step": 1061 }, { "epoch": 0.7104569971150227, "grad_norm": 3.869396686553955, "learning_rate": 4.720355441848651e-05, "loss": 2.8877, "step": 1062 }, { "epoch": 0.711125977338295, "grad_norm": 3.0460147857666016, "learning_rate": 4.719459790640939e-05, "loss": 2.8196, "step": 1063 }, { "epoch": 0.7117949575615671, "grad_norm": 4.743002414703369, "learning_rate": 4.718562792672135e-05, "loss": 3.0261, "step": 1064 }, { "epoch": 0.7124639377848392, "grad_norm": 3.112426280975342, "learning_rate": 4.717664448486536e-05, "loss": 3.0122, "step": 1065 }, { "epoch": 0.7131329180081114, "grad_norm": 5.043789386749268, "learning_rate": 4.716764758629254e-05, "loss": 2.8663, "step": 1066 }, { "epoch": 0.7138018982313835, "grad_norm": 4.557572364807129, "learning_rate": 4.7158637236462163e-05, "loss": 2.7515, "step": 1067 }, { "epoch": 0.7144708784546557, "grad_norm": 5.140640735626221, "learning_rate": 4.714961344084171e-05, "loss": 3.0498, "step": 1068 }, { "epoch": 0.7151398586779278, "grad_norm": 7.291658401489258, "learning_rate": 4.714057620490676e-05, "loss": 3.0888, "step": 1069 }, { "epoch": 0.7158088389012, "grad_norm": 5.374858379364014, "learning_rate": 4.71315255341411e-05, "loss": 2.9642, "step": 1070 }, { "epoch": 0.7164778191244722, "grad_norm": 3.6649510860443115, "learning_rate": 4.7122461434036645e-05, "loss": 2.8619, "step": 1071 }, { "epoch": 0.7171467993477443, "grad_norm": 4.465708255767822, "learning_rate": 4.7113383910093455e-05, "loss": 2.972, "step": 1072 }, { "epoch": 0.7178157795710164, "grad_norm": 5.3930253982543945, "learning_rate": 4.710429296781974e-05, "loss": 2.9884, "step": 1073 }, { "epoch": 0.7184847597942886, "grad_norm": 4.323741912841797, "learning_rate": 4.709518861273187e-05, "loss": 2.7952, "step": 1074 }, { "epoch": 0.7191537400175607, "grad_norm": 3.911958694458008, "learning_rate": 4.708607085035433e-05, "loss": 2.8223, "step": 1075 }, { "epoch": 0.7198227202408328, "grad_norm": 4.966227054595947, "learning_rate": 4.7076939686219734e-05, "loss": 3.0945, "step": 1076 }, { "epoch": 0.720491700464105, "grad_norm": 6.530167579650879, "learning_rate": 4.706779512586887e-05, "loss": 3.1844, "step": 1077 }, { "epoch": 0.7211606806873772, "grad_norm": 5.185251235961914, "learning_rate": 4.7058637174850604e-05, "loss": 2.9711, "step": 1078 }, { "epoch": 0.7218296609106494, "grad_norm": 4.045567512512207, "learning_rate": 4.704946583872197e-05, "loss": 3.0596, "step": 1079 }, { "epoch": 0.7224986411339215, "grad_norm": 6.590458393096924, "learning_rate": 4.70402811230481e-05, "loss": 2.9715, "step": 1080 }, { "epoch": 0.7231676213571936, "grad_norm": 5.262936592102051, "learning_rate": 4.703108303340225e-05, "loss": 2.9436, "step": 1081 }, { "epoch": 0.7238366015804658, "grad_norm": 6.591318130493164, "learning_rate": 4.702187157536578e-05, "loss": 3.0539, "step": 1082 }, { "epoch": 0.7245055818037379, "grad_norm": 4.073249816894531, "learning_rate": 4.701264675452819e-05, "loss": 3.0097, "step": 1083 }, { "epoch": 0.72517456202701, "grad_norm": 4.671321392059326, "learning_rate": 4.700340857648706e-05, "loss": 2.8997, "step": 1084 }, { "epoch": 0.7258435422502822, "grad_norm": 5.47347354888916, "learning_rate": 4.6994157046848085e-05, "loss": 2.7696, "step": 1085 }, { "epoch": 0.7265125224735544, "grad_norm": 5.776899814605713, "learning_rate": 4.6984892171225084e-05, "loss": 2.8977, "step": 1086 }, { "epoch": 0.7271815026968266, "grad_norm": 2.396639823913574, "learning_rate": 4.697561395523993e-05, "loss": 2.7917, "step": 1087 }, { "epoch": 0.7278504829200987, "grad_norm": 3.468226671218872, "learning_rate": 4.6966322404522625e-05, "loss": 3.0844, "step": 1088 }, { "epoch": 0.7285194631433708, "grad_norm": 4.072845458984375, "learning_rate": 4.695701752471125e-05, "loss": 2.9978, "step": 1089 }, { "epoch": 0.729188443366643, "grad_norm": 3.296337127685547, "learning_rate": 4.694769932145198e-05, "loss": 3.1235, "step": 1090 }, { "epoch": 0.7298574235899151, "grad_norm": 6.070874214172363, "learning_rate": 4.693836780039906e-05, "loss": 3.0078, "step": 1091 }, { "epoch": 0.7305264038131872, "grad_norm": 3.937580108642578, "learning_rate": 4.6929022967214845e-05, "loss": 2.6871, "step": 1092 }, { "epoch": 0.7311953840364595, "grad_norm": 4.932339191436768, "learning_rate": 4.691966482756974e-05, "loss": 3.0435, "step": 1093 }, { "epoch": 0.7318643642597316, "grad_norm": 5.55933141708374, "learning_rate": 4.6910293387142234e-05, "loss": 2.995, "step": 1094 }, { "epoch": 0.7325333444830037, "grad_norm": 3.3646323680877686, "learning_rate": 4.690090865161889e-05, "loss": 2.6285, "step": 1095 }, { "epoch": 0.7332023247062759, "grad_norm": 4.399787902832031, "learning_rate": 4.6891510626694325e-05, "loss": 2.7016, "step": 1096 }, { "epoch": 0.733871304929548, "grad_norm": 3.4604458808898926, "learning_rate": 4.6882099318071246e-05, "loss": 2.7843, "step": 1097 }, { "epoch": 0.7345402851528202, "grad_norm": 2.961883544921875, "learning_rate": 4.687267473146039e-05, "loss": 2.8187, "step": 1098 }, { "epoch": 0.7352092653760923, "grad_norm": 5.411362171173096, "learning_rate": 4.686323687258058e-05, "loss": 3.1326, "step": 1099 }, { "epoch": 0.7358782455993644, "grad_norm": 5.700189113616943, "learning_rate": 4.685378574715867e-05, "loss": 3.1582, "step": 1100 }, { "epoch": 0.7365472258226367, "grad_norm": 7.942058086395264, "learning_rate": 4.6844321360929574e-05, "loss": 3.0623, "step": 1101 }, { "epoch": 0.7372162060459088, "grad_norm": 5.992877006530762, "learning_rate": 4.6834843719636256e-05, "loss": 2.9932, "step": 1102 }, { "epoch": 0.7378851862691809, "grad_norm": 4.572200775146484, "learning_rate": 4.6825352829029705e-05, "loss": 2.6074, "step": 1103 }, { "epoch": 0.7385541664924531, "grad_norm": 3.488412857055664, "learning_rate": 4.681584869486898e-05, "loss": 2.8706, "step": 1104 }, { "epoch": 0.7392231467157252, "grad_norm": 4.054234504699707, "learning_rate": 4.680633132292115e-05, "loss": 2.9623, "step": 1105 }, { "epoch": 0.7398921269389974, "grad_norm": 5.8013176918029785, "learning_rate": 4.679680071896132e-05, "loss": 2.7356, "step": 1106 }, { "epoch": 0.7405611071622695, "grad_norm": 3.128580331802368, "learning_rate": 4.678725688877265e-05, "loss": 2.7304, "step": 1107 }, { "epoch": 0.7412300873855416, "grad_norm": 6.505280494689941, "learning_rate": 4.6777699838146286e-05, "loss": 3.1473, "step": 1108 }, { "epoch": 0.7418990676088139, "grad_norm": 4.020694255828857, "learning_rate": 4.676812957288141e-05, "loss": 2.8131, "step": 1109 }, { "epoch": 0.742568047832086, "grad_norm": 5.444528579711914, "learning_rate": 4.675854609878526e-05, "loss": 3.0127, "step": 1110 }, { "epoch": 0.7432370280553581, "grad_norm": 3.6009886264801025, "learning_rate": 4.674894942167303e-05, "loss": 2.6999, "step": 1111 }, { "epoch": 0.7439060082786303, "grad_norm": 3.48321533203125, "learning_rate": 4.673933954736796e-05, "loss": 2.9221, "step": 1112 }, { "epoch": 0.7445749885019024, "grad_norm": 5.604138374328613, "learning_rate": 4.672971648170129e-05, "loss": 3.0887, "step": 1113 }, { "epoch": 0.7452439687251745, "grad_norm": 3.9699227809906006, "learning_rate": 4.672008023051228e-05, "loss": 2.876, "step": 1114 }, { "epoch": 0.7459129489484467, "grad_norm": 5.587133884429932, "learning_rate": 4.671043079964815e-05, "loss": 3.1502, "step": 1115 }, { "epoch": 0.7465819291717188, "grad_norm": 5.9802680015563965, "learning_rate": 4.670076819496416e-05, "loss": 3.1501, "step": 1116 }, { "epoch": 0.7472509093949911, "grad_norm": 5.209526538848877, "learning_rate": 4.669109242232355e-05, "loss": 3.0757, "step": 1117 }, { "epoch": 0.7479198896182632, "grad_norm": 6.363523006439209, "learning_rate": 4.6681403487597536e-05, "loss": 3.0408, "step": 1118 }, { "epoch": 0.7485888698415353, "grad_norm": 4.495762825012207, "learning_rate": 4.6671701396665345e-05, "loss": 2.8424, "step": 1119 }, { "epoch": 0.7492578500648075, "grad_norm": 4.736873626708984, "learning_rate": 4.6661986155414164e-05, "loss": 3.1298, "step": 1120 }, { "epoch": 0.7499268302880796, "grad_norm": 4.165266513824463, "learning_rate": 4.665225776973918e-05, "loss": 2.8714, "step": 1121 }, { "epoch": 0.7505958105113517, "grad_norm": 8.435874938964844, "learning_rate": 4.664251624554354e-05, "loss": 3.3392, "step": 1122 }, { "epoch": 0.7512647907346239, "grad_norm": 5.860448360443115, "learning_rate": 4.663276158873837e-05, "loss": 2.8908, "step": 1123 }, { "epoch": 0.751933770957896, "grad_norm": 3.957852363586426, "learning_rate": 4.6622993805242766e-05, "loss": 3.0232, "step": 1124 }, { "epoch": 0.7526027511811683, "grad_norm": 5.143473148345947, "learning_rate": 4.661321290098379e-05, "loss": 3.0588, "step": 1125 }, { "epoch": 0.7532717314044404, "grad_norm": 3.922308921813965, "learning_rate": 4.660341888189646e-05, "loss": 2.9771, "step": 1126 }, { "epoch": 0.7539407116277125, "grad_norm": 3.8919553756713867, "learning_rate": 4.6593611753923756e-05, "loss": 2.8287, "step": 1127 }, { "epoch": 0.7546096918509847, "grad_norm": 3.5253610610961914, "learning_rate": 4.6583791523016616e-05, "loss": 2.7894, "step": 1128 }, { "epoch": 0.7552786720742568, "grad_norm": 5.46091890335083, "learning_rate": 4.657395819513392e-05, "loss": 3.1581, "step": 1129 }, { "epoch": 0.7559476522975289, "grad_norm": 5.317687034606934, "learning_rate": 4.6564111776242494e-05, "loss": 3.0963, "step": 1130 }, { "epoch": 0.7566166325208011, "grad_norm": 5.06818151473999, "learning_rate": 4.655425227231712e-05, "loss": 2.9743, "step": 1131 }, { "epoch": 0.7572856127440732, "grad_norm": 3.99940824508667, "learning_rate": 4.6544379689340515e-05, "loss": 2.8127, "step": 1132 }, { "epoch": 0.7579545929673454, "grad_norm": 4.829176425933838, "learning_rate": 4.653449403330333e-05, "loss": 2.8884, "step": 1133 }, { "epoch": 0.7586235731906176, "grad_norm": 4.37836217880249, "learning_rate": 4.652459531020416e-05, "loss": 2.8942, "step": 1134 }, { "epoch": 0.7592925534138897, "grad_norm": 5.445929050445557, "learning_rate": 4.651468352604949e-05, "loss": 2.8611, "step": 1135 }, { "epoch": 0.7599615336371619, "grad_norm": 3.0896148681640625, "learning_rate": 4.6504758686853786e-05, "loss": 2.7729, "step": 1136 }, { "epoch": 0.760630513860434, "grad_norm": 7.576822757720947, "learning_rate": 4.6494820798639396e-05, "loss": 2.9835, "step": 1137 }, { "epoch": 0.7612994940837061, "grad_norm": 11.344874382019043, "learning_rate": 4.64848698674366e-05, "loss": 2.8235, "step": 1138 }, { "epoch": 0.7619684743069783, "grad_norm": 6.878731727600098, "learning_rate": 4.6474905899283596e-05, "loss": 2.8795, "step": 1139 }, { "epoch": 0.7626374545302504, "grad_norm": 6.209124565124512, "learning_rate": 4.646492890022648e-05, "loss": 2.6676, "step": 1140 }, { "epoch": 0.7633064347535226, "grad_norm": 5.197012424468994, "learning_rate": 4.6454938876319266e-05, "loss": 2.7382, "step": 1141 }, { "epoch": 0.7639754149767948, "grad_norm": 8.035080909729004, "learning_rate": 4.644493583362387e-05, "loss": 3.1107, "step": 1142 }, { "epoch": 0.7646443952000669, "grad_norm": 6.702689170837402, "learning_rate": 4.6434919778210114e-05, "loss": 2.7169, "step": 1143 }, { "epoch": 0.765313375423339, "grad_norm": 6.232007026672363, "learning_rate": 4.64248907161557e-05, "loss": 2.9951, "step": 1144 }, { "epoch": 0.7659823556466112, "grad_norm": 7.285654067993164, "learning_rate": 4.641484865354623e-05, "loss": 2.8934, "step": 1145 }, { "epoch": 0.7666513358698833, "grad_norm": 4.895506858825684, "learning_rate": 4.6404793596475195e-05, "loss": 2.7874, "step": 1146 }, { "epoch": 0.7673203160931555, "grad_norm": 7.719137191772461, "learning_rate": 4.639472555104397e-05, "loss": 3.0158, "step": 1147 }, { "epoch": 0.7679892963164276, "grad_norm": 6.7118730545043945, "learning_rate": 4.638464452336182e-05, "loss": 3.0185, "step": 1148 }, { "epoch": 0.7686582765396998, "grad_norm": 6.159465312957764, "learning_rate": 4.637455051954587e-05, "loss": 2.9336, "step": 1149 }, { "epoch": 0.769327256762972, "grad_norm": 4.991342544555664, "learning_rate": 4.6364443545721146e-05, "loss": 2.9299, "step": 1150 }, { "epoch": 0.7699962369862441, "grad_norm": 5.403529167175293, "learning_rate": 4.635432360802051e-05, "loss": 3.1296, "step": 1151 }, { "epoch": 0.7706652172095162, "grad_norm": 5.111245155334473, "learning_rate": 4.634419071258472e-05, "loss": 2.6837, "step": 1152 }, { "epoch": 0.7713341974327884, "grad_norm": 5.243202209472656, "learning_rate": 4.633404486556238e-05, "loss": 2.8771, "step": 1153 }, { "epoch": 0.7720031776560605, "grad_norm": 6.665611743927002, "learning_rate": 4.632388607310995e-05, "loss": 2.7898, "step": 1154 }, { "epoch": 0.7726721578793327, "grad_norm": 5.265156269073486, "learning_rate": 4.631371434139176e-05, "loss": 2.896, "step": 1155 }, { "epoch": 0.7733411381026049, "grad_norm": 5.411212921142578, "learning_rate": 4.630352967657998e-05, "loss": 3.0554, "step": 1156 }, { "epoch": 0.774010118325877, "grad_norm": 5.297793388366699, "learning_rate": 4.629333208485464e-05, "loss": 2.7445, "step": 1157 }, { "epoch": 0.7746790985491492, "grad_norm": 6.694152355194092, "learning_rate": 4.62831215724036e-05, "loss": 3.0652, "step": 1158 }, { "epoch": 0.7753480787724213, "grad_norm": 4.164678573608398, "learning_rate": 4.627289814542257e-05, "loss": 2.8941, "step": 1159 }, { "epoch": 0.7760170589956934, "grad_norm": 5.421492099761963, "learning_rate": 4.626266181011509e-05, "loss": 3.0211, "step": 1160 }, { "epoch": 0.7766860392189656, "grad_norm": 5.934850215911865, "learning_rate": 4.625241257269254e-05, "loss": 2.9949, "step": 1161 }, { "epoch": 0.7773550194422377, "grad_norm": 3.3746259212493896, "learning_rate": 4.624215043937411e-05, "loss": 2.9759, "step": 1162 }, { "epoch": 0.7780239996655098, "grad_norm": 4.722992420196533, "learning_rate": 4.623187541638685e-05, "loss": 3.0717, "step": 1163 }, { "epoch": 0.778692979888782, "grad_norm": 4.32106876373291, "learning_rate": 4.6221587509965594e-05, "loss": 2.6513, "step": 1164 }, { "epoch": 0.7793619601120542, "grad_norm": 4.257534980773926, "learning_rate": 4.621128672635302e-05, "loss": 2.798, "step": 1165 }, { "epoch": 0.7800309403353264, "grad_norm": 4.742833137512207, "learning_rate": 4.620097307179961e-05, "loss": 3.0547, "step": 1166 }, { "epoch": 0.7806999205585985, "grad_norm": 5.508314609527588, "learning_rate": 4.6190646552563655e-05, "loss": 3.1237, "step": 1167 }, { "epoch": 0.7813689007818706, "grad_norm": 4.580541133880615, "learning_rate": 4.6180307174911255e-05, "loss": 2.8756, "step": 1168 }, { "epoch": 0.7820378810051428, "grad_norm": 3.2291085720062256, "learning_rate": 4.61699549451163e-05, "loss": 2.7087, "step": 1169 }, { "epoch": 0.7827068612284149, "grad_norm": 5.868531227111816, "learning_rate": 4.6159589869460504e-05, "loss": 2.9949, "step": 1170 }, { "epoch": 0.783375841451687, "grad_norm": 4.447974681854248, "learning_rate": 4.614921195423336e-05, "loss": 2.8073, "step": 1171 }, { "epoch": 0.7840448216749593, "grad_norm": 5.745816230773926, "learning_rate": 4.613882120573215e-05, "loss": 2.9943, "step": 1172 }, { "epoch": 0.7847138018982314, "grad_norm": 3.089176654815674, "learning_rate": 4.612841763026195e-05, "loss": 2.8825, "step": 1173 }, { "epoch": 0.7853827821215036, "grad_norm": 4.548274040222168, "learning_rate": 4.611800123413561e-05, "loss": 3.0515, "step": 1174 }, { "epoch": 0.7860517623447757, "grad_norm": 5.2816853523254395, "learning_rate": 4.6107572023673774e-05, "loss": 3.1152, "step": 1175 }, { "epoch": 0.7867207425680478, "grad_norm": 4.492981433868408, "learning_rate": 4.6097130005204846e-05, "loss": 3.0368, "step": 1176 }, { "epoch": 0.78738972279132, "grad_norm": 4.274287223815918, "learning_rate": 4.608667518506502e-05, "loss": 2.9959, "step": 1177 }, { "epoch": 0.7880587030145921, "grad_norm": 5.986141204833984, "learning_rate": 4.607620756959823e-05, "loss": 3.1899, "step": 1178 }, { "epoch": 0.7887276832378642, "grad_norm": 6.11605167388916, "learning_rate": 4.6065727165156214e-05, "loss": 3.2268, "step": 1179 }, { "epoch": 0.7893966634611365, "grad_norm": 3.461554527282715, "learning_rate": 4.6055233978098424e-05, "loss": 2.9106, "step": 1180 }, { "epoch": 0.7900656436844086, "grad_norm": 3.9393229484558105, "learning_rate": 4.604472801479211e-05, "loss": 2.7877, "step": 1181 }, { "epoch": 0.7907346239076807, "grad_norm": 3.958235263824463, "learning_rate": 4.603420928161225e-05, "loss": 2.7716, "step": 1182 }, { "epoch": 0.7914036041309529, "grad_norm": 5.014087200164795, "learning_rate": 4.602367778494158e-05, "loss": 3.1674, "step": 1183 }, { "epoch": 0.792072584354225, "grad_norm": 5.102466106414795, "learning_rate": 4.601313353117057e-05, "loss": 2.8841, "step": 1184 }, { "epoch": 0.7927415645774972, "grad_norm": 3.9684786796569824, "learning_rate": 4.6002576526697446e-05, "loss": 2.9394, "step": 1185 }, { "epoch": 0.7934105448007693, "grad_norm": 5.413763523101807, "learning_rate": 4.599200677792818e-05, "loss": 3.0567, "step": 1186 }, { "epoch": 0.7940795250240414, "grad_norm": 3.9001362323760986, "learning_rate": 4.598142429127643e-05, "loss": 2.7906, "step": 1187 }, { "epoch": 0.7947485052473137, "grad_norm": 5.886847019195557, "learning_rate": 4.597082907316363e-05, "loss": 3.0456, "step": 1188 }, { "epoch": 0.7954174854705858, "grad_norm": 5.5517096519470215, "learning_rate": 4.5960221130018946e-05, "loss": 3.0884, "step": 1189 }, { "epoch": 0.7960864656938579, "grad_norm": 4.110422134399414, "learning_rate": 4.594960046827921e-05, "loss": 2.9232, "step": 1190 }, { "epoch": 0.7967554459171301, "grad_norm": 6.881380558013916, "learning_rate": 4.593896709438902e-05, "loss": 3.0062, "step": 1191 }, { "epoch": 0.7974244261404022, "grad_norm": 4.250757217407227, "learning_rate": 4.592832101480067e-05, "loss": 2.9539, "step": 1192 }, { "epoch": 0.7980934063636744, "grad_norm": 4.646194934844971, "learning_rate": 4.591766223597417e-05, "loss": 2.858, "step": 1193 }, { "epoch": 0.7987623865869465, "grad_norm": 3.1693055629730225, "learning_rate": 4.5906990764377235e-05, "loss": 2.7729, "step": 1194 }, { "epoch": 0.7994313668102186, "grad_norm": 4.924377918243408, "learning_rate": 4.589630660648527e-05, "loss": 3.1445, "step": 1195 }, { "epoch": 0.8001003470334909, "grad_norm": 5.191469669342041, "learning_rate": 4.5885609768781405e-05, "loss": 2.6655, "step": 1196 }, { "epoch": 0.800769327256763, "grad_norm": 5.284387111663818, "learning_rate": 4.587490025775644e-05, "loss": 3.0168, "step": 1197 }, { "epoch": 0.8014383074800351, "grad_norm": 4.831686019897461, "learning_rate": 4.586417807990886e-05, "loss": 2.8958, "step": 1198 }, { "epoch": 0.8021072877033073, "grad_norm": 4.951563835144043, "learning_rate": 4.585344324174485e-05, "loss": 2.9277, "step": 1199 }, { "epoch": 0.8027762679265794, "grad_norm": 5.770039081573486, "learning_rate": 4.58426957497783e-05, "loss": 3.1331, "step": 1200 }, { "epoch": 0.8034452481498515, "grad_norm": 4.684397220611572, "learning_rate": 4.583193561053072e-05, "loss": 2.8653, "step": 1201 }, { "epoch": 0.8041142283731237, "grad_norm": 4.134792804718018, "learning_rate": 4.582116283053135e-05, "loss": 2.9133, "step": 1202 }, { "epoch": 0.8047832085963958, "grad_norm": 6.689801216125488, "learning_rate": 4.581037741631708e-05, "loss": 3.1337, "step": 1203 }, { "epoch": 0.8054521888196681, "grad_norm": 6.469520568847656, "learning_rate": 4.579957937443245e-05, "loss": 2.9906, "step": 1204 }, { "epoch": 0.8061211690429402, "grad_norm": 6.04118537902832, "learning_rate": 4.5788768711429685e-05, "loss": 2.8179, "step": 1205 }, { "epoch": 0.8067901492662123, "grad_norm": 4.4167799949646, "learning_rate": 4.5777945433868664e-05, "loss": 2.9181, "step": 1206 }, { "epoch": 0.8074591294894845, "grad_norm": 4.885730743408203, "learning_rate": 4.576710954831691e-05, "loss": 3.3447, "step": 1207 }, { "epoch": 0.8081281097127566, "grad_norm": 3.864809989929199, "learning_rate": 4.57562610613496e-05, "loss": 2.8715, "step": 1208 }, { "epoch": 0.8087970899360287, "grad_norm": 3.0726685523986816, "learning_rate": 4.574539997954957e-05, "loss": 2.953, "step": 1209 }, { "epoch": 0.8094660701593009, "grad_norm": 4.57878303527832, "learning_rate": 4.5734526309507294e-05, "loss": 2.9744, "step": 1210 }, { "epoch": 0.810135050382573, "grad_norm": 5.683286190032959, "learning_rate": 4.5723640057820874e-05, "loss": 3.0864, "step": 1211 }, { "epoch": 0.8108040306058452, "grad_norm": 4.932238578796387, "learning_rate": 4.571274123109606e-05, "loss": 3.0714, "step": 1212 }, { "epoch": 0.8114730108291174, "grad_norm": 4.840191841125488, "learning_rate": 4.5701829835946204e-05, "loss": 3.0249, "step": 1213 }, { "epoch": 0.8121419910523895, "grad_norm": 5.308492183685303, "learning_rate": 4.569090587899232e-05, "loss": 3.0613, "step": 1214 }, { "epoch": 0.8128109712756617, "grad_norm": 5.62808895111084, "learning_rate": 4.567996936686303e-05, "loss": 2.5797, "step": 1215 }, { "epoch": 0.8134799514989338, "grad_norm": 5.087706089019775, "learning_rate": 4.5669020306194585e-05, "loss": 2.8123, "step": 1216 }, { "epoch": 0.8141489317222059, "grad_norm": 4.26098108291626, "learning_rate": 4.565805870363082e-05, "loss": 2.9352, "step": 1217 }, { "epoch": 0.8148179119454781, "grad_norm": 3.571173906326294, "learning_rate": 4.564708456582321e-05, "loss": 3.0319, "step": 1218 }, { "epoch": 0.8154868921687503, "grad_norm": 6.964523792266846, "learning_rate": 4.5636097899430826e-05, "loss": 2.9455, "step": 1219 }, { "epoch": 0.8161558723920224, "grad_norm": 5.344782829284668, "learning_rate": 4.562509871112034e-05, "loss": 3.2849, "step": 1220 }, { "epoch": 0.8168248526152946, "grad_norm": 4.287156105041504, "learning_rate": 4.561408700756603e-05, "loss": 3.0911, "step": 1221 }, { "epoch": 0.8174938328385667, "grad_norm": 6.898590564727783, "learning_rate": 4.560306279544975e-05, "loss": 3.1278, "step": 1222 }, { "epoch": 0.8181628130618389, "grad_norm": 4.712205410003662, "learning_rate": 4.559202608146098e-05, "loss": 2.7449, "step": 1223 }, { "epoch": 0.818831793285111, "grad_norm": 2.392604351043701, "learning_rate": 4.558097687229673e-05, "loss": 2.5767, "step": 1224 }, { "epoch": 0.8195007735083831, "grad_norm": 6.0168561935424805, "learning_rate": 4.5569915174661656e-05, "loss": 3.0924, "step": 1225 }, { "epoch": 0.8201697537316553, "grad_norm": 5.974442958831787, "learning_rate": 4.555884099526794e-05, "loss": 3.2305, "step": 1226 }, { "epoch": 0.8208387339549275, "grad_norm": 4.008500099182129, "learning_rate": 4.554775434083537e-05, "loss": 3.1273, "step": 1227 }, { "epoch": 0.8215077141781996, "grad_norm": 6.343125343322754, "learning_rate": 4.553665521809128e-05, "loss": 3.2656, "step": 1228 }, { "epoch": 0.8221766944014718, "grad_norm": 4.358270645141602, "learning_rate": 4.5525543633770604e-05, "loss": 2.8296, "step": 1229 }, { "epoch": 0.8228456746247439, "grad_norm": 3.9398019313812256, "learning_rate": 4.551441959461579e-05, "loss": 2.9505, "step": 1230 }, { "epoch": 0.823514654848016, "grad_norm": 7.38106107711792, "learning_rate": 4.5503283107376885e-05, "loss": 3.3745, "step": 1231 }, { "epoch": 0.8241836350712882, "grad_norm": 4.664844989776611, "learning_rate": 4.549213417881147e-05, "loss": 2.8598, "step": 1232 }, { "epoch": 0.8248526152945603, "grad_norm": 8.018467903137207, "learning_rate": 4.548097281568469e-05, "loss": 2.961, "step": 1233 }, { "epoch": 0.8255215955178326, "grad_norm": 4.438595294952393, "learning_rate": 4.54697990247692e-05, "loss": 2.9577, "step": 1234 }, { "epoch": 0.8261905757411047, "grad_norm": 4.215197563171387, "learning_rate": 4.545861281284524e-05, "loss": 2.9438, "step": 1235 }, { "epoch": 0.8268595559643768, "grad_norm": 4.889639377593994, "learning_rate": 4.5447414186700556e-05, "loss": 3.1007, "step": 1236 }, { "epoch": 0.827528536187649, "grad_norm": 5.220578670501709, "learning_rate": 4.543620315313045e-05, "loss": 2.8278, "step": 1237 }, { "epoch": 0.8281975164109211, "grad_norm": 5.146899700164795, "learning_rate": 4.5424979718937736e-05, "loss": 2.9282, "step": 1238 }, { "epoch": 0.8288664966341932, "grad_norm": 3.74849796295166, "learning_rate": 4.5413743890932754e-05, "loss": 2.9679, "step": 1239 }, { "epoch": 0.8295354768574654, "grad_norm": 2.725539207458496, "learning_rate": 4.5402495675933373e-05, "loss": 2.6577, "step": 1240 }, { "epoch": 0.8302044570807375, "grad_norm": 4.239365100860596, "learning_rate": 4.5391235080764973e-05, "loss": 2.6505, "step": 1241 }, { "epoch": 0.8308734373040098, "grad_norm": 3.76081919670105, "learning_rate": 4.5379962112260446e-05, "loss": 2.775, "step": 1242 }, { "epoch": 0.8315424175272819, "grad_norm": 4.860802173614502, "learning_rate": 4.536867677726019e-05, "loss": 2.9374, "step": 1243 }, { "epoch": 0.832211397750554, "grad_norm": 5.248920917510986, "learning_rate": 4.535737908261212e-05, "loss": 2.9561, "step": 1244 }, { "epoch": 0.8328803779738262, "grad_norm": 3.981989860534668, "learning_rate": 4.534606903517163e-05, "loss": 2.7477, "step": 1245 }, { "epoch": 0.8335493581970983, "grad_norm": 5.067933559417725, "learning_rate": 4.533474664180163e-05, "loss": 2.7223, "step": 1246 }, { "epoch": 0.8342183384203704, "grad_norm": 4.483163833618164, "learning_rate": 4.5323411909372516e-05, "loss": 2.9859, "step": 1247 }, { "epoch": 0.8348873186436426, "grad_norm": 3.9220480918884277, "learning_rate": 4.5312064844762144e-05, "loss": 2.9838, "step": 1248 }, { "epoch": 0.8355562988669147, "grad_norm": 6.302621841430664, "learning_rate": 4.53007054548559e-05, "loss": 2.9708, "step": 1249 }, { "epoch": 0.8362252790901868, "grad_norm": 6.725874423980713, "learning_rate": 4.5289333746546614e-05, "loss": 2.7491, "step": 1250 }, { "epoch": 0.8368942593134591, "grad_norm": 6.032719612121582, "learning_rate": 4.52779497267346e-05, "loss": 2.9548, "step": 1251 }, { "epoch": 0.8375632395367312, "grad_norm": 5.064156532287598, "learning_rate": 4.526655340232766e-05, "loss": 3.1138, "step": 1252 }, { "epoch": 0.8382322197600034, "grad_norm": 5.924524307250977, "learning_rate": 4.5255144780241025e-05, "loss": 3.0439, "step": 1253 }, { "epoch": 0.8389011999832755, "grad_norm": 3.7269787788391113, "learning_rate": 4.5243723867397433e-05, "loss": 2.9438, "step": 1254 }, { "epoch": 0.8395701802065476, "grad_norm": 4.633069038391113, "learning_rate": 4.523229067072704e-05, "loss": 3.035, "step": 1255 }, { "epoch": 0.8402391604298198, "grad_norm": 6.016887664794922, "learning_rate": 4.522084519716748e-05, "loss": 3.1314, "step": 1256 }, { "epoch": 0.8409081406530919, "grad_norm": 5.080687999725342, "learning_rate": 4.520938745366383e-05, "loss": 3.1458, "step": 1257 }, { "epoch": 0.841577120876364, "grad_norm": 4.570522785186768, "learning_rate": 4.519791744716861e-05, "loss": 2.6878, "step": 1258 }, { "epoch": 0.8422461010996363, "grad_norm": 5.6180009841918945, "learning_rate": 4.518643518464179e-05, "loss": 2.8925, "step": 1259 }, { "epoch": 0.8429150813229084, "grad_norm": 5.0585408210754395, "learning_rate": 4.517494067305076e-05, "loss": 2.8532, "step": 1260 }, { "epoch": 0.8435840615461805, "grad_norm": 4.849548816680908, "learning_rate": 4.516343391937036e-05, "loss": 2.9977, "step": 1261 }, { "epoch": 0.8442530417694527, "grad_norm": 6.075852394104004, "learning_rate": 4.515191493058285e-05, "loss": 2.9442, "step": 1262 }, { "epoch": 0.8449220219927248, "grad_norm": 3.9582536220550537, "learning_rate": 4.5140383713677916e-05, "loss": 2.9436, "step": 1263 }, { "epoch": 0.845591002215997, "grad_norm": 4.025606632232666, "learning_rate": 4.512884027565265e-05, "loss": 2.5885, "step": 1264 }, { "epoch": 0.8462599824392691, "grad_norm": 9.147647857666016, "learning_rate": 4.51172846235116e-05, "loss": 2.9629, "step": 1265 }, { "epoch": 0.8469289626625413, "grad_norm": 5.3007330894470215, "learning_rate": 4.510571676426667e-05, "loss": 2.9883, "step": 1266 }, { "epoch": 0.8475979428858135, "grad_norm": 5.245005130767822, "learning_rate": 4.5094136704937225e-05, "loss": 3.0921, "step": 1267 }, { "epoch": 0.8482669231090856, "grad_norm": 7.631860256195068, "learning_rate": 4.508254445254999e-05, "loss": 2.9209, "step": 1268 }, { "epoch": 0.8489359033323577, "grad_norm": 3.889382839202881, "learning_rate": 4.507094001413911e-05, "loss": 2.8368, "step": 1269 }, { "epoch": 0.8496048835556299, "grad_norm": 5.040628910064697, "learning_rate": 4.505932339674613e-05, "loss": 2.6859, "step": 1270 }, { "epoch": 0.850273863778902, "grad_norm": 4.311012268066406, "learning_rate": 4.5047694607419974e-05, "loss": 3.067, "step": 1271 }, { "epoch": 0.8509428440021742, "grad_norm": 4.256524562835693, "learning_rate": 4.503605365321695e-05, "loss": 2.7441, "step": 1272 }, { "epoch": 0.8516118242254463, "grad_norm": 4.711860179901123, "learning_rate": 4.502440054120074e-05, "loss": 3.0067, "step": 1273 }, { "epoch": 0.8522808044487185, "grad_norm": 4.862099647521973, "learning_rate": 4.5012735278442436e-05, "loss": 2.816, "step": 1274 }, { "epoch": 0.8529497846719907, "grad_norm": 4.249661922454834, "learning_rate": 4.500105787202047e-05, "loss": 2.9722, "step": 1275 }, { "epoch": 0.8536187648952628, "grad_norm": 5.019298076629639, "learning_rate": 4.4989368329020664e-05, "loss": 2.9151, "step": 1276 }, { "epoch": 0.8542877451185349, "grad_norm": 4.351741313934326, "learning_rate": 4.497766665653619e-05, "loss": 2.8578, "step": 1277 }, { "epoch": 0.8549567253418071, "grad_norm": 4.4666972160339355, "learning_rate": 4.4965952861667574e-05, "loss": 2.9205, "step": 1278 }, { "epoch": 0.8556257055650792, "grad_norm": 6.395944118499756, "learning_rate": 4.495422695152272e-05, "loss": 2.8507, "step": 1279 }, { "epoch": 0.8562946857883513, "grad_norm": 3.730733871459961, "learning_rate": 4.494248893321689e-05, "loss": 2.6328, "step": 1280 }, { "epoch": 0.8569636660116235, "grad_norm": 5.058286190032959, "learning_rate": 4.493073881387265e-05, "loss": 3.0965, "step": 1281 }, { "epoch": 0.8576326462348957, "grad_norm": 3.647876501083374, "learning_rate": 4.491897660061994e-05, "loss": 2.8117, "step": 1282 }, { "epoch": 0.8583016264581679, "grad_norm": 5.1443867683410645, "learning_rate": 4.4907202300596036e-05, "loss": 2.8762, "step": 1283 }, { "epoch": 0.85897060668144, "grad_norm": 4.4886250495910645, "learning_rate": 4.489541592094555e-05, "loss": 3.094, "step": 1284 }, { "epoch": 0.8596395869047121, "grad_norm": 5.375618934631348, "learning_rate": 4.4883617468820415e-05, "loss": 3.1313, "step": 1285 }, { "epoch": 0.8603085671279843, "grad_norm": 5.438999176025391, "learning_rate": 4.4871806951379894e-05, "loss": 3.0824, "step": 1286 }, { "epoch": 0.8609775473512564, "grad_norm": 5.070971488952637, "learning_rate": 4.485998437579056e-05, "loss": 2.7387, "step": 1287 }, { "epoch": 0.8616465275745285, "grad_norm": 5.442570209503174, "learning_rate": 4.484814974922634e-05, "loss": 3.1292, "step": 1288 }, { "epoch": 0.8623155077978008, "grad_norm": 6.013929843902588, "learning_rate": 4.4836303078868435e-05, "loss": 2.7225, "step": 1289 }, { "epoch": 0.8629844880210729, "grad_norm": 5.940018653869629, "learning_rate": 4.482444437190536e-05, "loss": 3.0959, "step": 1290 }, { "epoch": 0.8636534682443451, "grad_norm": 4.380285739898682, "learning_rate": 4.4812573635532945e-05, "loss": 2.9311, "step": 1291 }, { "epoch": 0.8643224484676172, "grad_norm": 3.536611795425415, "learning_rate": 4.4800690876954324e-05, "loss": 2.6594, "step": 1292 }, { "epoch": 0.8649914286908893, "grad_norm": 4.585886001586914, "learning_rate": 4.478879610337989e-05, "loss": 2.6898, "step": 1293 }, { "epoch": 0.8656604089141615, "grad_norm": 4.433380126953125, "learning_rate": 4.477688932202738e-05, "loss": 2.9575, "step": 1294 }, { "epoch": 0.8663293891374336, "grad_norm": 4.734560012817383, "learning_rate": 4.476497054012179e-05, "loss": 2.8395, "step": 1295 }, { "epoch": 0.8669983693607057, "grad_norm": 7.193070888519287, "learning_rate": 4.475303976489538e-05, "loss": 3.2112, "step": 1296 }, { "epoch": 0.867667349583978, "grad_norm": 5.735711097717285, "learning_rate": 4.4741097003587716e-05, "loss": 3.2148, "step": 1297 }, { "epoch": 0.8683363298072501, "grad_norm": 4.532172679901123, "learning_rate": 4.472914226344564e-05, "loss": 3.09, "step": 1298 }, { "epoch": 0.8690053100305222, "grad_norm": 6.511048793792725, "learning_rate": 4.471717555172323e-05, "loss": 2.9723, "step": 1299 }, { "epoch": 0.8696742902537944, "grad_norm": 6.3584113121032715, "learning_rate": 4.4705196875681854e-05, "loss": 3.1419, "step": 1300 }, { "epoch": 0.8703432704770665, "grad_norm": 9.031208992004395, "learning_rate": 4.4693206242590145e-05, "loss": 3.0329, "step": 1301 }, { "epoch": 0.8710122507003387, "grad_norm": 4.642002105712891, "learning_rate": 4.468120365972397e-05, "loss": 2.9389, "step": 1302 }, { "epoch": 0.8716812309236108, "grad_norm": 5.527377605438232, "learning_rate": 4.466918913436646e-05, "loss": 2.9945, "step": 1303 }, { "epoch": 0.8723502111468829, "grad_norm": 4.439020156860352, "learning_rate": 4.465716267380799e-05, "loss": 2.6869, "step": 1304 }, { "epoch": 0.8730191913701552, "grad_norm": 6.921942234039307, "learning_rate": 4.464512428534618e-05, "loss": 2.8721, "step": 1305 }, { "epoch": 0.8736881715934273, "grad_norm": 7.672665596008301, "learning_rate": 4.463307397628588e-05, "loss": 3.0736, "step": 1306 }, { "epoch": 0.8743571518166994, "grad_norm": 5.868035316467285, "learning_rate": 4.462101175393919e-05, "loss": 2.9702, "step": 1307 }, { "epoch": 0.8750261320399716, "grad_norm": 4.5191168785095215, "learning_rate": 4.460893762562542e-05, "loss": 2.9085, "step": 1308 }, { "epoch": 0.8756951122632437, "grad_norm": 5.143170356750488, "learning_rate": 4.459685159867111e-05, "loss": 2.8955, "step": 1309 }, { "epoch": 0.8763640924865159, "grad_norm": 4.201148986816406, "learning_rate": 4.458475368041003e-05, "loss": 2.5116, "step": 1310 }, { "epoch": 0.877033072709788, "grad_norm": 5.7478179931640625, "learning_rate": 4.457264387818315e-05, "loss": 3.0584, "step": 1311 }, { "epoch": 0.8777020529330601, "grad_norm": 5.526137828826904, "learning_rate": 4.456052219933867e-05, "loss": 2.8589, "step": 1312 }, { "epoch": 0.8783710331563324, "grad_norm": 4.569222927093506, "learning_rate": 4.454838865123197e-05, "loss": 3.0637, "step": 1313 }, { "epoch": 0.8790400133796045, "grad_norm": 5.451717853546143, "learning_rate": 4.453624324122566e-05, "loss": 2.8953, "step": 1314 }, { "epoch": 0.8797089936028766, "grad_norm": 7.568721294403076, "learning_rate": 4.4524085976689536e-05, "loss": 2.9357, "step": 1315 }, { "epoch": 0.8803779738261488, "grad_norm": 7.537128925323486, "learning_rate": 4.451191686500058e-05, "loss": 2.9977, "step": 1316 }, { "epoch": 0.8810469540494209, "grad_norm": 4.4014739990234375, "learning_rate": 4.449973591354298e-05, "loss": 3.0327, "step": 1317 }, { "epoch": 0.881715934272693, "grad_norm": 4.172126293182373, "learning_rate": 4.448754312970809e-05, "loss": 2.8322, "step": 1318 }, { "epoch": 0.8823849144959652, "grad_norm": 6.07208251953125, "learning_rate": 4.447533852089445e-05, "loss": 2.837, "step": 1319 }, { "epoch": 0.8830538947192373, "grad_norm": 3.788719892501831, "learning_rate": 4.4463122094507794e-05, "loss": 2.8765, "step": 1320 }, { "epoch": 0.8837228749425096, "grad_norm": 3.7321970462799072, "learning_rate": 4.445089385796099e-05, "loss": 2.7925, "step": 1321 }, { "epoch": 0.8843918551657817, "grad_norm": 5.387599945068359, "learning_rate": 4.4438653818674105e-05, "loss": 2.9877, "step": 1322 }, { "epoch": 0.8850608353890538, "grad_norm": 5.4844255447387695, "learning_rate": 4.442640198407435e-05, "loss": 2.8944, "step": 1323 }, { "epoch": 0.885729815612326, "grad_norm": 4.812926292419434, "learning_rate": 4.4414138361596105e-05, "loss": 2.8036, "step": 1324 }, { "epoch": 0.8863987958355981, "grad_norm": 4.339323997497559, "learning_rate": 4.4401862958680884e-05, "loss": 2.8654, "step": 1325 }, { "epoch": 0.8870677760588702, "grad_norm": 4.4866509437561035, "learning_rate": 4.438957578277738e-05, "loss": 3.0448, "step": 1326 }, { "epoch": 0.8877367562821424, "grad_norm": 4.485202789306641, "learning_rate": 4.4377276841341395e-05, "loss": 2.9447, "step": 1327 }, { "epoch": 0.8884057365054145, "grad_norm": 4.087451457977295, "learning_rate": 4.4364966141835904e-05, "loss": 3.0245, "step": 1328 }, { "epoch": 0.8890747167286867, "grad_norm": 4.466521739959717, "learning_rate": 4.435264369173099e-05, "loss": 3.1356, "step": 1329 }, { "epoch": 0.8897436969519589, "grad_norm": 4.541243076324463, "learning_rate": 4.434030949850387e-05, "loss": 3.0463, "step": 1330 }, { "epoch": 0.890412677175231, "grad_norm": 4.771731376647949, "learning_rate": 4.4327963569638905e-05, "loss": 2.7375, "step": 1331 }, { "epoch": 0.8910816573985032, "grad_norm": 5.35650110244751, "learning_rate": 4.4315605912627565e-05, "loss": 3.0898, "step": 1332 }, { "epoch": 0.8917506376217753, "grad_norm": 4.51669454574585, "learning_rate": 4.430323653496843e-05, "loss": 3.1061, "step": 1333 }, { "epoch": 0.8924196178450474, "grad_norm": 6.2187652587890625, "learning_rate": 4.4290855444167194e-05, "loss": 2.9603, "step": 1334 }, { "epoch": 0.8930885980683196, "grad_norm": 5.403723239898682, "learning_rate": 4.4278462647736675e-05, "loss": 3.0003, "step": 1335 }, { "epoch": 0.8937575782915917, "grad_norm": 3.582946538925171, "learning_rate": 4.426605815319678e-05, "loss": 2.9088, "step": 1336 }, { "epoch": 0.8944265585148639, "grad_norm": 5.798472881317139, "learning_rate": 4.425364196807451e-05, "loss": 2.7459, "step": 1337 }, { "epoch": 0.8950955387381361, "grad_norm": 5.068627834320068, "learning_rate": 4.4241214099903976e-05, "loss": 2.9603, "step": 1338 }, { "epoch": 0.8957645189614082, "grad_norm": 6.003021240234375, "learning_rate": 4.422877455622636e-05, "loss": 3.0229, "step": 1339 }, { "epoch": 0.8964334991846804, "grad_norm": 5.207427024841309, "learning_rate": 4.421632334458994e-05, "loss": 3.0078, "step": 1340 }, { "epoch": 0.8971024794079525, "grad_norm": 9.058572769165039, "learning_rate": 4.4203860472550075e-05, "loss": 3.1514, "step": 1341 }, { "epoch": 0.8977714596312246, "grad_norm": 5.014550685882568, "learning_rate": 4.4191385947669187e-05, "loss": 3.0033, "step": 1342 }, { "epoch": 0.8984404398544968, "grad_norm": 9.838627815246582, "learning_rate": 4.4178899777516786e-05, "loss": 2.8175, "step": 1343 }, { "epoch": 0.899109420077769, "grad_norm": 3.6878244876861572, "learning_rate": 4.4166401969669434e-05, "loss": 2.8379, "step": 1344 }, { "epoch": 0.8997784003010411, "grad_norm": 5.310073375701904, "learning_rate": 4.415389253171077e-05, "loss": 3.0186, "step": 1345 }, { "epoch": 0.9004473805243133, "grad_norm": 6.830930709838867, "learning_rate": 4.414137147123148e-05, "loss": 2.6658, "step": 1346 }, { "epoch": 0.9011163607475854, "grad_norm": 7.637371063232422, "learning_rate": 4.412883879582928e-05, "loss": 3.1366, "step": 1347 }, { "epoch": 0.9017853409708575, "grad_norm": 8.96474552154541, "learning_rate": 4.4116294513108985e-05, "loss": 2.9803, "step": 1348 }, { "epoch": 0.9024543211941297, "grad_norm": 4.182546138763428, "learning_rate": 4.4103738630682416e-05, "loss": 2.9514, "step": 1349 }, { "epoch": 0.9031233014174018, "grad_norm": 5.354439735412598, "learning_rate": 4.409117115616844e-05, "loss": 2.8105, "step": 1350 }, { "epoch": 0.903792281640674, "grad_norm": 5.362891674041748, "learning_rate": 4.407859209719297e-05, "loss": 2.8012, "step": 1351 }, { "epoch": 0.9044612618639462, "grad_norm": 3.78045654296875, "learning_rate": 4.406600146138893e-05, "loss": 2.5848, "step": 1352 }, { "epoch": 0.9051302420872183, "grad_norm": 2.21057391166687, "learning_rate": 4.4053399256396275e-05, "loss": 2.5734, "step": 1353 }, { "epoch": 0.9057992223104905, "grad_norm": 5.139932632446289, "learning_rate": 4.404078548986199e-05, "loss": 3.0914, "step": 1354 }, { "epoch": 0.9064682025337626, "grad_norm": 3.5655221939086914, "learning_rate": 4.402816016944006e-05, "loss": 2.6834, "step": 1355 }, { "epoch": 0.9071371827570347, "grad_norm": 5.4275994300842285, "learning_rate": 4.401552330279149e-05, "loss": 3.3536, "step": 1356 }, { "epoch": 0.9078061629803069, "grad_norm": 8.335515022277832, "learning_rate": 4.40028748975843e-05, "loss": 3.2658, "step": 1357 }, { "epoch": 0.908475143203579, "grad_norm": 5.343057632446289, "learning_rate": 4.3990214961493495e-05, "loss": 2.8544, "step": 1358 }, { "epoch": 0.9091441234268512, "grad_norm": 5.262948036193848, "learning_rate": 4.397754350220108e-05, "loss": 3.0868, "step": 1359 }, { "epoch": 0.9098131036501234, "grad_norm": 5.593395233154297, "learning_rate": 4.3964860527396066e-05, "loss": 2.9476, "step": 1360 }, { "epoch": 0.9104820838733955, "grad_norm": 9.119357109069824, "learning_rate": 4.3952166044774435e-05, "loss": 2.8594, "step": 1361 }, { "epoch": 0.9111510640966677, "grad_norm": 4.6348676681518555, "learning_rate": 4.393946006203915e-05, "loss": 2.9661, "step": 1362 }, { "epoch": 0.9118200443199398, "grad_norm": 4.553847789764404, "learning_rate": 4.392674258690018e-05, "loss": 2.7935, "step": 1363 }, { "epoch": 0.9124890245432119, "grad_norm": 3.9665307998657227, "learning_rate": 4.391401362707444e-05, "loss": 2.8593, "step": 1364 }, { "epoch": 0.9131580047664841, "grad_norm": 4.720427513122559, "learning_rate": 4.390127319028581e-05, "loss": 2.8045, "step": 1365 }, { "epoch": 0.9138269849897562, "grad_norm": 3.928102970123291, "learning_rate": 4.388852128426516e-05, "loss": 2.8448, "step": 1366 }, { "epoch": 0.9144959652130283, "grad_norm": 4.242812156677246, "learning_rate": 4.38757579167503e-05, "loss": 2.9404, "step": 1367 }, { "epoch": 0.9151649454363006, "grad_norm": 3.773695945739746, "learning_rate": 4.3862983095486e-05, "loss": 2.7635, "step": 1368 }, { "epoch": 0.9158339256595727, "grad_norm": 3.5291969776153564, "learning_rate": 4.385019682822399e-05, "loss": 2.5655, "step": 1369 }, { "epoch": 0.9165029058828449, "grad_norm": 4.14195442199707, "learning_rate": 4.383739912272292e-05, "loss": 2.739, "step": 1370 }, { "epoch": 0.917171886106117, "grad_norm": 4.202181816101074, "learning_rate": 4.382458998674841e-05, "loss": 2.8227, "step": 1371 }, { "epoch": 0.9178408663293891, "grad_norm": 5.822393417358398, "learning_rate": 4.3811769428073004e-05, "loss": 2.7845, "step": 1372 }, { "epoch": 0.9185098465526613, "grad_norm": 4.378946304321289, "learning_rate": 4.3798937454476164e-05, "loss": 2.7979, "step": 1373 }, { "epoch": 0.9191788267759334, "grad_norm": 5.244045734405518, "learning_rate": 4.37860940737443e-05, "loss": 2.944, "step": 1374 }, { "epoch": 0.9198478069992055, "grad_norm": 7.615596771240234, "learning_rate": 4.377323929367073e-05, "loss": 3.1287, "step": 1375 }, { "epoch": 0.9205167872224778, "grad_norm": 4.038976192474365, "learning_rate": 4.37603731220557e-05, "loss": 3.0772, "step": 1376 }, { "epoch": 0.9211857674457499, "grad_norm": 4.48494291305542, "learning_rate": 4.3747495566706344e-05, "loss": 2.8203, "step": 1377 }, { "epoch": 0.9218547476690221, "grad_norm": 6.21243953704834, "learning_rate": 4.3734606635436734e-05, "loss": 3.0804, "step": 1378 }, { "epoch": 0.9225237278922942, "grad_norm": 5.390618324279785, "learning_rate": 4.372170633606784e-05, "loss": 3.0191, "step": 1379 }, { "epoch": 0.9231927081155663, "grad_norm": 3.5611183643341064, "learning_rate": 4.370879467642751e-05, "loss": 2.7865, "step": 1380 }, { "epoch": 0.9238616883388385, "grad_norm": 6.147199630737305, "learning_rate": 4.369587166435051e-05, "loss": 2.9694, "step": 1381 }, { "epoch": 0.9245306685621106, "grad_norm": 6.140897274017334, "learning_rate": 4.368293730767846e-05, "loss": 2.8317, "step": 1382 }, { "epoch": 0.9251996487853827, "grad_norm": 4.62473726272583, "learning_rate": 4.366999161425991e-05, "loss": 3.0022, "step": 1383 }, { "epoch": 0.925868629008655, "grad_norm": 4.647932052612305, "learning_rate": 4.3657034591950254e-05, "loss": 3.1382, "step": 1384 }, { "epoch": 0.9265376092319271, "grad_norm": 6.712825775146484, "learning_rate": 4.364406624861177e-05, "loss": 3.1722, "step": 1385 }, { "epoch": 0.9272065894551992, "grad_norm": 4.585726261138916, "learning_rate": 4.363108659211361e-05, "loss": 3.1261, "step": 1386 }, { "epoch": 0.9278755696784714, "grad_norm": 4.828583240509033, "learning_rate": 4.361809563033179e-05, "loss": 2.9589, "step": 1387 }, { "epoch": 0.9285445499017435, "grad_norm": 4.226254940032959, "learning_rate": 4.360509337114918e-05, "loss": 3.0593, "step": 1388 }, { "epoch": 0.9292135301250157, "grad_norm": 4.8540940284729, "learning_rate": 4.359207982245551e-05, "loss": 2.6497, "step": 1389 }, { "epoch": 0.9298825103482878, "grad_norm": 6.509584903717041, "learning_rate": 4.357905499214736e-05, "loss": 2.9424, "step": 1390 }, { "epoch": 0.93055149057156, "grad_norm": 5.68663215637207, "learning_rate": 4.3566018888128165e-05, "loss": 3.1698, "step": 1391 }, { "epoch": 0.9312204707948322, "grad_norm": 3.5178868770599365, "learning_rate": 4.355297151830818e-05, "loss": 2.908, "step": 1392 }, { "epoch": 0.9318894510181043, "grad_norm": 5.952272891998291, "learning_rate": 4.3539912890604504e-05, "loss": 3.0404, "step": 1393 }, { "epoch": 0.9325584312413764, "grad_norm": 6.591036796569824, "learning_rate": 4.352684301294108e-05, "loss": 2.8514, "step": 1394 }, { "epoch": 0.9332274114646486, "grad_norm": 5.238092422485352, "learning_rate": 4.351376189324867e-05, "loss": 2.7471, "step": 1395 }, { "epoch": 0.9338963916879207, "grad_norm": 4.533151149749756, "learning_rate": 4.3500669539464846e-05, "loss": 3.0083, "step": 1396 }, { "epoch": 0.9345653719111928, "grad_norm": 8.677496910095215, "learning_rate": 4.3487565959534004e-05, "loss": 3.0341, "step": 1397 }, { "epoch": 0.935234352134465, "grad_norm": 5.255722522735596, "learning_rate": 4.3474451161407364e-05, "loss": 2.8045, "step": 1398 }, { "epoch": 0.9359033323577371, "grad_norm": 5.049302577972412, "learning_rate": 4.346132515304294e-05, "loss": 2.7667, "step": 1399 }, { "epoch": 0.9365723125810094, "grad_norm": 5.458214282989502, "learning_rate": 4.344818794240556e-05, "loss": 2.8785, "step": 1400 }, { "epoch": 0.9372412928042815, "grad_norm": 5.965051174163818, "learning_rate": 4.343503953746681e-05, "loss": 2.9263, "step": 1401 }, { "epoch": 0.9379102730275536, "grad_norm": 7.19428825378418, "learning_rate": 4.3421879946205145e-05, "loss": 2.8425, "step": 1402 }, { "epoch": 0.9385792532508258, "grad_norm": 4.539826393127441, "learning_rate": 4.3408709176605734e-05, "loss": 2.8562, "step": 1403 }, { "epoch": 0.9392482334740979, "grad_norm": 4.270603656768799, "learning_rate": 4.339552723666057e-05, "loss": 2.7289, "step": 1404 }, { "epoch": 0.93991721369737, "grad_norm": 4.474053859710693, "learning_rate": 4.338233413436839e-05, "loss": 2.7669, "step": 1405 }, { "epoch": 0.9405861939206422, "grad_norm": 10.911941528320312, "learning_rate": 4.336912987773476e-05, "loss": 2.9069, "step": 1406 }, { "epoch": 0.9412551741439144, "grad_norm": 7.811734676361084, "learning_rate": 4.335591447477196e-05, "loss": 3.0396, "step": 1407 }, { "epoch": 0.9419241543671866, "grad_norm": 6.42579460144043, "learning_rate": 4.334268793349905e-05, "loss": 2.8405, "step": 1408 }, { "epoch": 0.9425931345904587, "grad_norm": 6.013075351715088, "learning_rate": 4.332945026194187e-05, "loss": 2.9972, "step": 1409 }, { "epoch": 0.9432621148137308, "grad_norm": 5.426165580749512, "learning_rate": 4.3316201468132985e-05, "loss": 3.0449, "step": 1410 }, { "epoch": 0.943931095037003, "grad_norm": 7.533892631530762, "learning_rate": 4.330294156011172e-05, "loss": 2.7979, "step": 1411 }, { "epoch": 0.9446000752602751, "grad_norm": 7.255002021789551, "learning_rate": 4.3289670545924144e-05, "loss": 2.8471, "step": 1412 }, { "epoch": 0.9452690554835472, "grad_norm": 4.2335686683654785, "learning_rate": 4.327638843362307e-05, "loss": 2.8513, "step": 1413 }, { "epoch": 0.9459380357068194, "grad_norm": 3.1816556453704834, "learning_rate": 4.3263095231268044e-05, "loss": 2.7395, "step": 1414 }, { "epoch": 0.9466070159300916, "grad_norm": 5.051225185394287, "learning_rate": 4.324979094692534e-05, "loss": 2.9584, "step": 1415 }, { "epoch": 0.9472759961533637, "grad_norm": 5.697210788726807, "learning_rate": 4.3236475588667946e-05, "loss": 2.8683, "step": 1416 }, { "epoch": 0.9479449763766359, "grad_norm": 5.087251663208008, "learning_rate": 4.3223149164575585e-05, "loss": 3.1022, "step": 1417 }, { "epoch": 0.948613956599908, "grad_norm": 5.369622707366943, "learning_rate": 4.320981168273468e-05, "loss": 2.8525, "step": 1418 }, { "epoch": 0.9492829368231802, "grad_norm": 4.636052131652832, "learning_rate": 4.319646315123839e-05, "loss": 2.954, "step": 1419 }, { "epoch": 0.9499519170464523, "grad_norm": 4.25987434387207, "learning_rate": 4.318310357818654e-05, "loss": 2.8005, "step": 1420 }, { "epoch": 0.9506208972697244, "grad_norm": 4.870820045471191, "learning_rate": 4.3169732971685686e-05, "loss": 3.0632, "step": 1421 }, { "epoch": 0.9512898774929966, "grad_norm": 4.916179656982422, "learning_rate": 4.315635133984908e-05, "loss": 3.0651, "step": 1422 }, { "epoch": 0.9519588577162688, "grad_norm": 2.8693549633026123, "learning_rate": 4.3142958690796624e-05, "loss": 2.6375, "step": 1423 }, { "epoch": 0.9526278379395409, "grad_norm": 3.8217246532440186, "learning_rate": 4.312955503265497e-05, "loss": 2.5429, "step": 1424 }, { "epoch": 0.9532968181628131, "grad_norm": 4.727277755737305, "learning_rate": 4.311614037355739e-05, "loss": 3.1547, "step": 1425 }, { "epoch": 0.9539657983860852, "grad_norm": 3.9927327632904053, "learning_rate": 4.310271472164387e-05, "loss": 2.638, "step": 1426 }, { "epoch": 0.9546347786093574, "grad_norm": 4.312863349914551, "learning_rate": 4.3089278085061035e-05, "loss": 2.7786, "step": 1427 }, { "epoch": 0.9553037588326295, "grad_norm": 4.242437839508057, "learning_rate": 4.307583047196221e-05, "loss": 2.8335, "step": 1428 }, { "epoch": 0.9559727390559016, "grad_norm": 6.428162097930908, "learning_rate": 4.306237189050737e-05, "loss": 2.7921, "step": 1429 }, { "epoch": 0.9566417192791739, "grad_norm": 5.5870537757873535, "learning_rate": 4.3048902348863116e-05, "loss": 2.9045, "step": 1430 }, { "epoch": 0.957310699502446, "grad_norm": 4.459833145141602, "learning_rate": 4.303542185520273e-05, "loss": 2.9307, "step": 1431 }, { "epoch": 0.9579796797257181, "grad_norm": 7.269958972930908, "learning_rate": 4.3021930417706144e-05, "loss": 2.9311, "step": 1432 }, { "epoch": 0.9586486599489903, "grad_norm": 4.830624580383301, "learning_rate": 4.300842804455991e-05, "loss": 2.7988, "step": 1433 }, { "epoch": 0.9593176401722624, "grad_norm": 5.2541422843933105, "learning_rate": 4.2994914743957226e-05, "loss": 2.9206, "step": 1434 }, { "epoch": 0.9599866203955345, "grad_norm": 5.538652420043945, "learning_rate": 4.298139052409792e-05, "loss": 2.9494, "step": 1435 }, { "epoch": 0.9606556006188067, "grad_norm": 4.298689842224121, "learning_rate": 4.296785539318845e-05, "loss": 2.6522, "step": 1436 }, { "epoch": 0.9613245808420788, "grad_norm": 5.333482265472412, "learning_rate": 4.295430935944188e-05, "loss": 2.8006, "step": 1437 }, { "epoch": 0.961993561065351, "grad_norm": 6.548314094543457, "learning_rate": 4.29407524310779e-05, "loss": 3.0809, "step": 1438 }, { "epoch": 0.9626625412886232, "grad_norm": 5.444504737854004, "learning_rate": 4.2927184616322823e-05, "loss": 2.9464, "step": 1439 }, { "epoch": 0.9633315215118953, "grad_norm": 8.92197036743164, "learning_rate": 4.291360592340955e-05, "loss": 3.3383, "step": 1440 }, { "epoch": 0.9640005017351675, "grad_norm": 4.618531703948975, "learning_rate": 4.2900016360577585e-05, "loss": 2.8019, "step": 1441 }, { "epoch": 0.9646694819584396, "grad_norm": 6.276029109954834, "learning_rate": 4.2886415936073035e-05, "loss": 3.4108, "step": 1442 }, { "epoch": 0.9653384621817117, "grad_norm": 3.802837371826172, "learning_rate": 4.287280465814858e-05, "loss": 2.8491, "step": 1443 }, { "epoch": 0.9660074424049839, "grad_norm": 5.446820259094238, "learning_rate": 4.2859182535063525e-05, "loss": 3.0644, "step": 1444 }, { "epoch": 0.966676422628256, "grad_norm": 5.9092888832092285, "learning_rate": 4.284554957508371e-05, "loss": 2.9815, "step": 1445 }, { "epoch": 0.9673454028515281, "grad_norm": 5.796885013580322, "learning_rate": 4.283190578648158e-05, "loss": 3.1203, "step": 1446 }, { "epoch": 0.9680143830748004, "grad_norm": 6.438416957855225, "learning_rate": 4.2818251177536136e-05, "loss": 2.8655, "step": 1447 }, { "epoch": 0.9686833632980725, "grad_norm": 6.921572685241699, "learning_rate": 4.2804585756532965e-05, "loss": 3.0448, "step": 1448 }, { "epoch": 0.9693523435213447, "grad_norm": 7.228209495544434, "learning_rate": 4.2790909531764196e-05, "loss": 3.05, "step": 1449 }, { "epoch": 0.9700213237446168, "grad_norm": 7.1065449714660645, "learning_rate": 4.2777222511528504e-05, "loss": 2.5712, "step": 1450 }, { "epoch": 0.9706903039678889, "grad_norm": 6.614255905151367, "learning_rate": 4.276352470413114e-05, "loss": 3.1933, "step": 1451 }, { "epoch": 0.9713592841911611, "grad_norm": 4.059726715087891, "learning_rate": 4.274981611788389e-05, "loss": 2.9931, "step": 1452 }, { "epoch": 0.9720282644144332, "grad_norm": 4.65960693359375, "learning_rate": 4.273609676110508e-05, "loss": 2.9133, "step": 1453 }, { "epoch": 0.9726972446377053, "grad_norm": 5.1689324378967285, "learning_rate": 4.272236664211957e-05, "loss": 2.9943, "step": 1454 }, { "epoch": 0.9733662248609776, "grad_norm": 8.48508358001709, "learning_rate": 4.2708625769258756e-05, "loss": 2.9463, "step": 1455 }, { "epoch": 0.9740352050842497, "grad_norm": 4.781745433807373, "learning_rate": 4.269487415086055e-05, "loss": 2.61, "step": 1456 }, { "epoch": 0.9747041853075219, "grad_norm": 6.530716896057129, "learning_rate": 4.268111179526939e-05, "loss": 3.0087, "step": 1457 }, { "epoch": 0.975373165530794, "grad_norm": 4.822265625, "learning_rate": 4.266733871083624e-05, "loss": 2.8659, "step": 1458 }, { "epoch": 0.9760421457540661, "grad_norm": 3.321605682373047, "learning_rate": 4.2653554905918544e-05, "loss": 2.8481, "step": 1459 }, { "epoch": 0.9767111259773383, "grad_norm": 4.193107604980469, "learning_rate": 4.263976038888029e-05, "loss": 3.0597, "step": 1460 }, { "epoch": 0.9773801062006104, "grad_norm": 6.2677483558654785, "learning_rate": 4.262595516809194e-05, "loss": 2.99, "step": 1461 }, { "epoch": 0.9780490864238826, "grad_norm": 6.198704719543457, "learning_rate": 4.261213925193045e-05, "loss": 3.0541, "step": 1462 }, { "epoch": 0.9787180666471548, "grad_norm": 4.100915431976318, "learning_rate": 4.259831264877928e-05, "loss": 3.0413, "step": 1463 }, { "epoch": 0.9793870468704269, "grad_norm": 5.984002590179443, "learning_rate": 4.258447536702838e-05, "loss": 2.7577, "step": 1464 }, { "epoch": 0.980056027093699, "grad_norm": 4.392581462860107, "learning_rate": 4.2570627415074146e-05, "loss": 2.8131, "step": 1465 }, { "epoch": 0.9807250073169712, "grad_norm": 7.353703498840332, "learning_rate": 4.2556768801319485e-05, "loss": 3.0096, "step": 1466 }, { "epoch": 0.9813939875402433, "grad_norm": 6.616916179656982, "learning_rate": 4.254289953417376e-05, "loss": 2.958, "step": 1467 }, { "epoch": 0.9820629677635155, "grad_norm": 6.607530117034912, "learning_rate": 4.252901962205279e-05, "loss": 2.9169, "step": 1468 }, { "epoch": 0.9827319479867876, "grad_norm": 3.915989398956299, "learning_rate": 4.2515129073378866e-05, "loss": 2.8512, "step": 1469 }, { "epoch": 0.9834009282100598, "grad_norm": 5.1544189453125, "learning_rate": 4.250122789658073e-05, "loss": 3.093, "step": 1470 }, { "epoch": 0.984069908433332, "grad_norm": 3.6646721363067627, "learning_rate": 4.2487316100093564e-05, "loss": 3.1035, "step": 1471 }, { "epoch": 0.9847388886566041, "grad_norm": 5.358120441436768, "learning_rate": 4.247339369235901e-05, "loss": 2.9236, "step": 1472 }, { "epoch": 0.9854078688798762, "grad_norm": 4.24708890914917, "learning_rate": 4.2459460681825134e-05, "loss": 3.06, "step": 1473 }, { "epoch": 0.9860768491031484, "grad_norm": 4.978499889373779, "learning_rate": 4.244551707694645e-05, "loss": 2.8849, "step": 1474 }, { "epoch": 0.9867458293264205, "grad_norm": 5.576192855834961, "learning_rate": 4.2431562886183886e-05, "loss": 2.8256, "step": 1475 }, { "epoch": 0.9874148095496927, "grad_norm": 8.536967277526855, "learning_rate": 4.24175981180048e-05, "loss": 3.2155, "step": 1476 }, { "epoch": 0.9880837897729648, "grad_norm": 6.740468978881836, "learning_rate": 4.2403622780882976e-05, "loss": 2.942, "step": 1477 }, { "epoch": 0.988752769996237, "grad_norm": 7.4838032722473145, "learning_rate": 4.23896368832986e-05, "loss": 2.9484, "step": 1478 }, { "epoch": 0.9894217502195092, "grad_norm": 4.943514823913574, "learning_rate": 4.237564043373827e-05, "loss": 3.074, "step": 1479 }, { "epoch": 0.9900907304427813, "grad_norm": 3.450420618057251, "learning_rate": 4.236163344069498e-05, "loss": 2.8443, "step": 1480 }, { "epoch": 0.9907597106660534, "grad_norm": 5.6595659255981445, "learning_rate": 4.2347615912668136e-05, "loss": 2.9945, "step": 1481 }, { "epoch": 0.9914286908893256, "grad_norm": 4.968037128448486, "learning_rate": 4.2333587858163524e-05, "loss": 3.077, "step": 1482 }, { "epoch": 0.9920976711125977, "grad_norm": 6.695304870605469, "learning_rate": 4.2319549285693325e-05, "loss": 3.1005, "step": 1483 }, { "epoch": 0.9927666513358698, "grad_norm": 5.530367374420166, "learning_rate": 4.230550020377611e-05, "loss": 3.1696, "step": 1484 }, { "epoch": 0.993435631559142, "grad_norm": 6.297682762145996, "learning_rate": 4.2291440620936796e-05, "loss": 2.9688, "step": 1485 }, { "epoch": 0.9941046117824142, "grad_norm": 3.887913465499878, "learning_rate": 4.227737054570671e-05, "loss": 2.768, "step": 1486 }, { "epoch": 0.9947735920056864, "grad_norm": 5.558178901672363, "learning_rate": 4.2263289986623525e-05, "loss": 2.8784, "step": 1487 }, { "epoch": 0.9954425722289585, "grad_norm": 4.500943183898926, "learning_rate": 4.224919895223127e-05, "loss": 3.1376, "step": 1488 }, { "epoch": 0.9961115524522306, "grad_norm": 4.17768669128418, "learning_rate": 4.223509745108035e-05, "loss": 2.5942, "step": 1489 }, { "epoch": 0.9967805326755028, "grad_norm": 4.17385196685791, "learning_rate": 4.222098549172751e-05, "loss": 3.0907, "step": 1490 }, { "epoch": 0.9974495128987749, "grad_norm": 4.986076354980469, "learning_rate": 4.2206863082735837e-05, "loss": 2.9474, "step": 1491 }, { "epoch": 0.998118493122047, "grad_norm": 5.448312759399414, "learning_rate": 4.219273023267476e-05, "loss": 3.0986, "step": 1492 }, { "epoch": 0.9987874733453193, "grad_norm": 5.713490962982178, "learning_rate": 4.217858695012007e-05, "loss": 2.8431, "step": 1493 }, { "epoch": 0.9994564535685914, "grad_norm": 4.883641719818115, "learning_rate": 4.216443324365383e-05, "loss": 2.9106, "step": 1494 }, { "epoch": 1.0006689802232722, "grad_norm": 11.87452220916748, "learning_rate": 4.215026912186449e-05, "loss": 5.5449, "step": 1495 }, { "epoch": 1.0013379604465442, "grad_norm": 6.693982124328613, "learning_rate": 4.213609459334678e-05, "loss": 3.0085, "step": 1496 }, { "epoch": 1.0020069406698164, "grad_norm": 4.143570899963379, "learning_rate": 4.2121909666701766e-05, "loss": 2.6887, "step": 1497 }, { "epoch": 1.0026759208930887, "grad_norm": 3.687885046005249, "learning_rate": 4.210771435053682e-05, "loss": 2.6233, "step": 1498 }, { "epoch": 1.0033449011163607, "grad_norm": 3.9757401943206787, "learning_rate": 4.2093508653465605e-05, "loss": 2.6142, "step": 1499 }, { "epoch": 1.0040138813396329, "grad_norm": 4.822112560272217, "learning_rate": 4.207929258410809e-05, "loss": 2.794, "step": 1500 }, { "epoch": 1.004682861562905, "grad_norm": 3.2196388244628906, "learning_rate": 4.206506615109055e-05, "loss": 2.6118, "step": 1501 }, { "epoch": 1.005351841786177, "grad_norm": 6.094923973083496, "learning_rate": 4.205082936304554e-05, "loss": 2.6836, "step": 1502 }, { "epoch": 1.0060208220094493, "grad_norm": 3.8810741901397705, "learning_rate": 4.203658222861189e-05, "loss": 2.7444, "step": 1503 }, { "epoch": 1.0066898022327215, "grad_norm": 4.88921594619751, "learning_rate": 4.2022324756434715e-05, "loss": 2.6127, "step": 1504 }, { "epoch": 1.0073587824559938, "grad_norm": 5.336818695068359, "learning_rate": 4.200805695516541e-05, "loss": 2.7797, "step": 1505 }, { "epoch": 1.0080277626792657, "grad_norm": 4.798128604888916, "learning_rate": 4.199377883346163e-05, "loss": 3.0443, "step": 1506 }, { "epoch": 1.008696742902538, "grad_norm": 4.512149810791016, "learning_rate": 4.197949039998729e-05, "loss": 2.9289, "step": 1507 }, { "epoch": 1.0093657231258102, "grad_norm": 4.345590114593506, "learning_rate": 4.196519166341256e-05, "loss": 2.6111, "step": 1508 }, { "epoch": 1.0100347033490822, "grad_norm": 4.6364030838012695, "learning_rate": 4.1950882632413876e-05, "loss": 3.0167, "step": 1509 }, { "epoch": 1.0107036835723544, "grad_norm": 3.8757243156433105, "learning_rate": 4.193656331567392e-05, "loss": 2.73, "step": 1510 }, { "epoch": 1.0113726637956266, "grad_norm": 4.849210739135742, "learning_rate": 4.192223372188159e-05, "loss": 2.7352, "step": 1511 }, { "epoch": 1.0120416440188986, "grad_norm": 6.5901312828063965, "learning_rate": 4.190789385973205e-05, "loss": 2.7431, "step": 1512 }, { "epoch": 1.0127106242421708, "grad_norm": 4.913456439971924, "learning_rate": 4.189354373792668e-05, "loss": 2.7734, "step": 1513 }, { "epoch": 1.013379604465443, "grad_norm": 6.975290775299072, "learning_rate": 4.187918336517308e-05, "loss": 2.8391, "step": 1514 }, { "epoch": 1.014048584688715, "grad_norm": 6.176608085632324, "learning_rate": 4.186481275018509e-05, "loss": 2.7831, "step": 1515 }, { "epoch": 1.0147175649119873, "grad_norm": 6.237487316131592, "learning_rate": 4.185043190168274e-05, "loss": 2.8756, "step": 1516 }, { "epoch": 1.0153865451352595, "grad_norm": 4.657079696655273, "learning_rate": 4.18360408283923e-05, "loss": 2.8786, "step": 1517 }, { "epoch": 1.0160555253585315, "grad_norm": 5.093790054321289, "learning_rate": 4.182163953904621e-05, "loss": 2.7005, "step": 1518 }, { "epoch": 1.0167245055818037, "grad_norm": 5.690388202667236, "learning_rate": 4.180722804238314e-05, "loss": 2.9104, "step": 1519 }, { "epoch": 1.017393485805076, "grad_norm": 5.185051441192627, "learning_rate": 4.179280634714793e-05, "loss": 2.6394, "step": 1520 }, { "epoch": 1.018062466028348, "grad_norm": 6.092372417449951, "learning_rate": 4.1778374462091616e-05, "loss": 2.6966, "step": 1521 }, { "epoch": 1.0187314462516202, "grad_norm": 3.6631357669830322, "learning_rate": 4.176393239597144e-05, "loss": 2.7309, "step": 1522 }, { "epoch": 1.0194004264748924, "grad_norm": 4.976971626281738, "learning_rate": 4.1749480157550774e-05, "loss": 2.9311, "step": 1523 }, { "epoch": 1.0200694066981646, "grad_norm": 5.371352195739746, "learning_rate": 4.173501775559921e-05, "loss": 2.9003, "step": 1524 }, { "epoch": 1.0207383869214366, "grad_norm": 4.864296913146973, "learning_rate": 4.172054519889248e-05, "loss": 2.8299, "step": 1525 }, { "epoch": 1.0214073671447088, "grad_norm": 4.238711357116699, "learning_rate": 4.1706062496212487e-05, "loss": 2.9332, "step": 1526 }, { "epoch": 1.022076347367981, "grad_norm": 5.82922887802124, "learning_rate": 4.169156965634728e-05, "loss": 2.7335, "step": 1527 }, { "epoch": 1.022745327591253, "grad_norm": 5.272682189941406, "learning_rate": 4.1677066688091085e-05, "loss": 2.7228, "step": 1528 }, { "epoch": 1.0234143078145252, "grad_norm": 4.921049118041992, "learning_rate": 4.1662553600244234e-05, "loss": 2.9095, "step": 1529 }, { "epoch": 1.0240832880377975, "grad_norm": 4.771050453186035, "learning_rate": 4.1648030401613246e-05, "loss": 2.7689, "step": 1530 }, { "epoch": 1.0247522682610695, "grad_norm": 3.7184505462646484, "learning_rate": 4.163349710101073e-05, "loss": 2.8432, "step": 1531 }, { "epoch": 1.0254212484843417, "grad_norm": 5.250668048858643, "learning_rate": 4.161895370725547e-05, "loss": 2.8125, "step": 1532 }, { "epoch": 1.026090228707614, "grad_norm": 5.156813144683838, "learning_rate": 4.1604400229172324e-05, "loss": 2.5678, "step": 1533 }, { "epoch": 1.026759208930886, "grad_norm": 4.721811294555664, "learning_rate": 4.158983667559232e-05, "loss": 3.0089, "step": 1534 }, { "epoch": 1.0274281891541581, "grad_norm": 3.9132628440856934, "learning_rate": 4.157526305535256e-05, "loss": 2.654, "step": 1535 }, { "epoch": 1.0280971693774303, "grad_norm": 4.286519527435303, "learning_rate": 4.156067937729628e-05, "loss": 2.9266, "step": 1536 }, { "epoch": 1.0287661496007023, "grad_norm": 6.115054607391357, "learning_rate": 4.1546085650272795e-05, "loss": 2.7046, "step": 1537 }, { "epoch": 1.0294351298239746, "grad_norm": 3.2762906551361084, "learning_rate": 4.153148188313753e-05, "loss": 2.4544, "step": 1538 }, { "epoch": 1.0301041100472468, "grad_norm": 5.929058074951172, "learning_rate": 4.151686808475204e-05, "loss": 2.4098, "step": 1539 }, { "epoch": 1.0307730902705188, "grad_norm": 4.244700908660889, "learning_rate": 4.1502244263983894e-05, "loss": 2.6728, "step": 1540 }, { "epoch": 1.031442070493791, "grad_norm": 5.544959545135498, "learning_rate": 4.148761042970679e-05, "loss": 2.7866, "step": 1541 }, { "epoch": 1.0321110507170632, "grad_norm": 8.907869338989258, "learning_rate": 4.14729665908005e-05, "loss": 3.0113, "step": 1542 }, { "epoch": 1.0327800309403354, "grad_norm": 4.410710334777832, "learning_rate": 4.145831275615084e-05, "loss": 2.7286, "step": 1543 }, { "epoch": 1.0334490111636074, "grad_norm": 4.794613361358643, "learning_rate": 4.144364893464974e-05, "loss": 2.7656, "step": 1544 }, { "epoch": 1.0341179913868797, "grad_norm": 4.0600361824035645, "learning_rate": 4.142897513519512e-05, "loss": 2.5502, "step": 1545 }, { "epoch": 1.0347869716101519, "grad_norm": 4.491250514984131, "learning_rate": 4.141429136669103e-05, "loss": 2.4829, "step": 1546 }, { "epoch": 1.0354559518334239, "grad_norm": 5.653236389160156, "learning_rate": 4.139959763804752e-05, "loss": 2.6999, "step": 1547 }, { "epoch": 1.036124932056696, "grad_norm": 4.625912666320801, "learning_rate": 4.138489395818069e-05, "loss": 2.853, "step": 1548 }, { "epoch": 1.0367939122799683, "grad_norm": 4.4478254318237305, "learning_rate": 4.1370180336012696e-05, "loss": 2.7662, "step": 1549 }, { "epoch": 1.0374628925032403, "grad_norm": 6.205041408538818, "learning_rate": 4.1355456780471716e-05, "loss": 3.1062, "step": 1550 }, { "epoch": 1.0381318727265125, "grad_norm": 4.5253987312316895, "learning_rate": 4.134072330049195e-05, "loss": 2.6946, "step": 1551 }, { "epoch": 1.0388008529497847, "grad_norm": 5.051163196563721, "learning_rate": 4.132597990501363e-05, "loss": 2.7989, "step": 1552 }, { "epoch": 1.0394698331730567, "grad_norm": 4.920440673828125, "learning_rate": 4.131122660298301e-05, "loss": 3.0435, "step": 1553 }, { "epoch": 1.040138813396329, "grad_norm": 3.8905622959136963, "learning_rate": 4.129646340335234e-05, "loss": 2.5991, "step": 1554 }, { "epoch": 1.0408077936196012, "grad_norm": 4.620569705963135, "learning_rate": 4.128169031507987e-05, "loss": 2.7107, "step": 1555 }, { "epoch": 1.0414767738428732, "grad_norm": 4.2700581550598145, "learning_rate": 4.126690734712988e-05, "loss": 2.9075, "step": 1556 }, { "epoch": 1.0421457540661454, "grad_norm": 4.611904621124268, "learning_rate": 4.1252114508472614e-05, "loss": 2.6566, "step": 1557 }, { "epoch": 1.0428147342894176, "grad_norm": 4.809654235839844, "learning_rate": 4.1237311808084335e-05, "loss": 2.7724, "step": 1558 }, { "epoch": 1.0434837145126896, "grad_norm": 4.333112716674805, "learning_rate": 4.122249925494726e-05, "loss": 2.7317, "step": 1559 }, { "epoch": 1.0441526947359618, "grad_norm": 4.872314453125, "learning_rate": 4.12076768580496e-05, "loss": 2.8534, "step": 1560 }, { "epoch": 1.044821674959234, "grad_norm": 4.6519036293029785, "learning_rate": 4.119284462638555e-05, "loss": 2.7633, "step": 1561 }, { "epoch": 1.045490655182506, "grad_norm": 4.549657821655273, "learning_rate": 4.1178002568955246e-05, "loss": 2.516, "step": 1562 }, { "epoch": 1.0461596354057783, "grad_norm": 5.804410457611084, "learning_rate": 4.116315069476481e-05, "loss": 2.8158, "step": 1563 }, { "epoch": 1.0468286156290505, "grad_norm": 5.040773868560791, "learning_rate": 4.114828901282631e-05, "loss": 2.7724, "step": 1564 }, { "epoch": 1.0474975958523227, "grad_norm": 5.569967746734619, "learning_rate": 4.113341753215777e-05, "loss": 2.8755, "step": 1565 }, { "epoch": 1.0481665760755947, "grad_norm": 4.0738444328308105, "learning_rate": 4.111853626178315e-05, "loss": 2.7354, "step": 1566 }, { "epoch": 1.048835556298867, "grad_norm": 4.854263782501221, "learning_rate": 4.110364521073236e-05, "loss": 2.7054, "step": 1567 }, { "epoch": 1.0495045365221392, "grad_norm": 7.207700252532959, "learning_rate": 4.1088744388041235e-05, "loss": 3.0215, "step": 1568 }, { "epoch": 1.0501735167454112, "grad_norm": 6.67018985748291, "learning_rate": 4.107383380275156e-05, "loss": 2.6792, "step": 1569 }, { "epoch": 1.0508424969686834, "grad_norm": 7.6153130531311035, "learning_rate": 4.105891346391102e-05, "loss": 2.8476, "step": 1570 }, { "epoch": 1.0515114771919556, "grad_norm": 5.213456153869629, "learning_rate": 4.1043983380573234e-05, "loss": 2.7051, "step": 1571 }, { "epoch": 1.0521804574152276, "grad_norm": 3.381913423538208, "learning_rate": 4.1029043561797734e-05, "loss": 2.6172, "step": 1572 }, { "epoch": 1.0528494376384998, "grad_norm": 7.620858669281006, "learning_rate": 4.101409401664994e-05, "loss": 2.7223, "step": 1573 }, { "epoch": 1.053518417861772, "grad_norm": 4.394527435302734, "learning_rate": 4.09991347542012e-05, "loss": 3.0849, "step": 1574 }, { "epoch": 1.054187398085044, "grad_norm": 4.2898335456848145, "learning_rate": 4.0984165783528736e-05, "loss": 2.5078, "step": 1575 }, { "epoch": 1.0548563783083162, "grad_norm": 4.043838977813721, "learning_rate": 4.096918711371569e-05, "loss": 2.6517, "step": 1576 }, { "epoch": 1.0555253585315885, "grad_norm": 4.759499549865723, "learning_rate": 4.0954198753851045e-05, "loss": 2.4175, "step": 1577 }, { "epoch": 1.0561943387548605, "grad_norm": 4.681356430053711, "learning_rate": 4.0939200713029715e-05, "loss": 2.7955, "step": 1578 }, { "epoch": 1.0568633189781327, "grad_norm": 4.3460373878479, "learning_rate": 4.0924193000352445e-05, "loss": 2.5704, "step": 1579 }, { "epoch": 1.057532299201405, "grad_norm": 7.12529993057251, "learning_rate": 4.0909175624925875e-05, "loss": 2.7936, "step": 1580 }, { "epoch": 1.0582012794246771, "grad_norm": 9.37367057800293, "learning_rate": 4.08941485958625e-05, "loss": 2.9732, "step": 1581 }, { "epoch": 1.0588702596479491, "grad_norm": 5.256577491760254, "learning_rate": 4.087911192228067e-05, "loss": 2.5803, "step": 1582 }, { "epoch": 1.0595392398712213, "grad_norm": 4.99888801574707, "learning_rate": 4.086406561330459e-05, "loss": 2.9305, "step": 1583 }, { "epoch": 1.0602082200944936, "grad_norm": 5.101346969604492, "learning_rate": 4.084900967806432e-05, "loss": 2.5905, "step": 1584 }, { "epoch": 1.0608772003177656, "grad_norm": 6.627160549163818, "learning_rate": 4.083394412569574e-05, "loss": 2.8194, "step": 1585 }, { "epoch": 1.0615461805410378, "grad_norm": 6.262012481689453, "learning_rate": 4.08188689653406e-05, "loss": 2.8002, "step": 1586 }, { "epoch": 1.06221516076431, "grad_norm": 4.888954162597656, "learning_rate": 4.0803784206146434e-05, "loss": 2.9786, "step": 1587 }, { "epoch": 1.062884140987582, "grad_norm": 6.124375343322754, "learning_rate": 4.078868985726665e-05, "loss": 2.8576, "step": 1588 }, { "epoch": 1.0635531212108542, "grad_norm": 5.127064228057861, "learning_rate": 4.077358592786043e-05, "loss": 2.9365, "step": 1589 }, { "epoch": 1.0642221014341264, "grad_norm": 4.611770153045654, "learning_rate": 4.0758472427092785e-05, "loss": 2.7742, "step": 1590 }, { "epoch": 1.0648910816573984, "grad_norm": 4.597067356109619, "learning_rate": 4.0743349364134566e-05, "loss": 2.6967, "step": 1591 }, { "epoch": 1.0655600618806707, "grad_norm": 5.276156425476074, "learning_rate": 4.072821674816239e-05, "loss": 2.872, "step": 1592 }, { "epoch": 1.0662290421039429, "grad_norm": 3.8604416847229004, "learning_rate": 4.071307458835866e-05, "loss": 2.7134, "step": 1593 }, { "epoch": 1.0668980223272149, "grad_norm": 4.565550327301025, "learning_rate": 4.069792289391161e-05, "loss": 2.753, "step": 1594 }, { "epoch": 1.067567002550487, "grad_norm": 5.758005142211914, "learning_rate": 4.0682761674015236e-05, "loss": 2.8769, "step": 1595 }, { "epoch": 1.0682359827737593, "grad_norm": 6.4277729988098145, "learning_rate": 4.066759093786931e-05, "loss": 2.7766, "step": 1596 }, { "epoch": 1.0689049629970313, "grad_norm": 6.652651309967041, "learning_rate": 4.06524106946794e-05, "loss": 2.823, "step": 1597 }, { "epoch": 1.0695739432203035, "grad_norm": 7.791245460510254, "learning_rate": 4.063722095365682e-05, "loss": 3.1185, "step": 1598 }, { "epoch": 1.0702429234435757, "grad_norm": 4.381209850311279, "learning_rate": 4.062202172401865e-05, "loss": 2.646, "step": 1599 }, { "epoch": 1.0709119036668477, "grad_norm": 4.1545634269714355, "learning_rate": 4.060681301498775e-05, "loss": 2.6542, "step": 1600 }, { "epoch": 1.07158088389012, "grad_norm": 5.229256629943848, "learning_rate": 4.05915948357927e-05, "loss": 2.8453, "step": 1601 }, { "epoch": 1.0722498641133922, "grad_norm": 5.327920436859131, "learning_rate": 4.057636719566785e-05, "loss": 2.8317, "step": 1602 }, { "epoch": 1.0729188443366644, "grad_norm": 5.403182029724121, "learning_rate": 4.056113010385329e-05, "loss": 2.8012, "step": 1603 }, { "epoch": 1.0735878245599364, "grad_norm": 4.147651195526123, "learning_rate": 4.054588356959482e-05, "loss": 2.7209, "step": 1604 }, { "epoch": 1.0742568047832086, "grad_norm": 4.1708784103393555, "learning_rate": 4.0530627602144015e-05, "loss": 2.6841, "step": 1605 }, { "epoch": 1.0749257850064808, "grad_norm": 5.463420391082764, "learning_rate": 4.0515362210758126e-05, "loss": 2.8198, "step": 1606 }, { "epoch": 1.0755947652297528, "grad_norm": 6.347439765930176, "learning_rate": 4.050008740470014e-05, "loss": 2.6981, "step": 1607 }, { "epoch": 1.076263745453025, "grad_norm": 5.56416130065918, "learning_rate": 4.0484803193238773e-05, "loss": 2.597, "step": 1608 }, { "epoch": 1.0769327256762973, "grad_norm": 4.583149433135986, "learning_rate": 4.046950958564843e-05, "loss": 2.6726, "step": 1609 }, { "epoch": 1.0776017058995693, "grad_norm": 4.929792881011963, "learning_rate": 4.045420659120923e-05, "loss": 2.9779, "step": 1610 }, { "epoch": 1.0782706861228415, "grad_norm": 5.686063289642334, "learning_rate": 4.043889421920698e-05, "loss": 3.0255, "step": 1611 }, { "epoch": 1.0789396663461137, "grad_norm": 6.810145854949951, "learning_rate": 4.042357247893317e-05, "loss": 2.8852, "step": 1612 }, { "epoch": 1.0796086465693857, "grad_norm": 5.9859724044799805, "learning_rate": 4.040824137968499e-05, "loss": 3.1111, "step": 1613 }, { "epoch": 1.080277626792658, "grad_norm": 5.715677738189697, "learning_rate": 4.039290093076529e-05, "loss": 2.7821, "step": 1614 }, { "epoch": 1.0809466070159301, "grad_norm": 3.387718439102173, "learning_rate": 4.0377551141482614e-05, "loss": 2.7442, "step": 1615 }, { "epoch": 1.0816155872392021, "grad_norm": 5.054075241088867, "learning_rate": 4.0362192021151174e-05, "loss": 3.0196, "step": 1616 }, { "epoch": 1.0822845674624744, "grad_norm": 3.9575531482696533, "learning_rate": 4.0346823579090826e-05, "loss": 2.5998, "step": 1617 }, { "epoch": 1.0829535476857466, "grad_norm": 3.741792917251587, "learning_rate": 4.033144582462709e-05, "loss": 2.8112, "step": 1618 }, { "epoch": 1.0836225279090188, "grad_norm": 4.696652412414551, "learning_rate": 4.031605876709113e-05, "loss": 2.9065, "step": 1619 }, { "epoch": 1.0842915081322908, "grad_norm": 6.763078689575195, "learning_rate": 4.030066241581979e-05, "loss": 2.8761, "step": 1620 }, { "epoch": 1.084960488355563, "grad_norm": 5.179434776306152, "learning_rate": 4.02852567801555e-05, "loss": 2.9401, "step": 1621 }, { "epoch": 1.0856294685788352, "grad_norm": 4.58511209487915, "learning_rate": 4.0269841869446365e-05, "loss": 2.6891, "step": 1622 }, { "epoch": 1.0862984488021072, "grad_norm": 3.7305219173431396, "learning_rate": 4.0254417693046096e-05, "loss": 2.4721, "step": 1623 }, { "epoch": 1.0869674290253795, "grad_norm": 4.831632137298584, "learning_rate": 4.0238984260314036e-05, "loss": 2.7192, "step": 1624 }, { "epoch": 1.0876364092486517, "grad_norm": 5.245891094207764, "learning_rate": 4.022354158061515e-05, "loss": 2.7525, "step": 1625 }, { "epoch": 1.0883053894719237, "grad_norm": 4.761187553405762, "learning_rate": 4.0208089663319994e-05, "loss": 2.9389, "step": 1626 }, { "epoch": 1.088974369695196, "grad_norm": 5.587457180023193, "learning_rate": 4.019262851780474e-05, "loss": 2.7128, "step": 1627 }, { "epoch": 1.0896433499184681, "grad_norm": 6.042799949645996, "learning_rate": 4.0177158153451176e-05, "loss": 2.7198, "step": 1628 }, { "epoch": 1.0903123301417401, "grad_norm": 5.319640636444092, "learning_rate": 4.016167857964667e-05, "loss": 2.5434, "step": 1629 }, { "epoch": 1.0909813103650123, "grad_norm": 4.484240531921387, "learning_rate": 4.014618980578416e-05, "loss": 2.6542, "step": 1630 }, { "epoch": 1.0916502905882846, "grad_norm": 5.133976936340332, "learning_rate": 4.0130691841262194e-05, "loss": 2.7771, "step": 1631 }, { "epoch": 1.0923192708115566, "grad_norm": 3.1250579357147217, "learning_rate": 4.01151846954849e-05, "loss": 2.606, "step": 1632 }, { "epoch": 1.0929882510348288, "grad_norm": 5.172818183898926, "learning_rate": 4.0099668377861944e-05, "loss": 3.0745, "step": 1633 }, { "epoch": 1.093657231258101, "grad_norm": 4.249288082122803, "learning_rate": 4.008414289780859e-05, "loss": 2.6543, "step": 1634 }, { "epoch": 1.094326211481373, "grad_norm": 8.297806739807129, "learning_rate": 4.0068608264745636e-05, "loss": 2.8929, "step": 1635 }, { "epoch": 1.0949951917046452, "grad_norm": 3.5871880054473877, "learning_rate": 4.005306448809946e-05, "loss": 2.5515, "step": 1636 }, { "epoch": 1.0956641719279174, "grad_norm": 5.7941999435424805, "learning_rate": 4.003751157730198e-05, "loss": 2.5705, "step": 1637 }, { "epoch": 1.0963331521511894, "grad_norm": 3.356236219406128, "learning_rate": 4.002194954179064e-05, "loss": 2.4806, "step": 1638 }, { "epoch": 1.0970021323744616, "grad_norm": 4.91937780380249, "learning_rate": 4.000637839100845e-05, "loss": 2.842, "step": 1639 }, { "epoch": 1.0976711125977339, "grad_norm": 5.350463390350342, "learning_rate": 3.9990798134403906e-05, "loss": 2.8923, "step": 1640 }, { "epoch": 1.0983400928210059, "grad_norm": 4.7535319328308105, "learning_rate": 3.997520878143109e-05, "loss": 2.9656, "step": 1641 }, { "epoch": 1.099009073044278, "grad_norm": 6.683962345123291, "learning_rate": 3.9959610341549546e-05, "loss": 3.0393, "step": 1642 }, { "epoch": 1.0996780532675503, "grad_norm": 4.360228061676025, "learning_rate": 3.994400282422438e-05, "loss": 2.7038, "step": 1643 }, { "epoch": 1.1003470334908225, "grad_norm": 3.990546941757202, "learning_rate": 3.9928386238926165e-05, "loss": 2.8572, "step": 1644 }, { "epoch": 1.1010160137140945, "grad_norm": 3.997955560684204, "learning_rate": 3.991276059513099e-05, "loss": 2.7878, "step": 1645 }, { "epoch": 1.1016849939373667, "grad_norm": 7.930902004241943, "learning_rate": 3.989712590232048e-05, "loss": 3.067, "step": 1646 }, { "epoch": 1.102353974160639, "grad_norm": 5.619079113006592, "learning_rate": 3.9881482169981676e-05, "loss": 2.6665, "step": 1647 }, { "epoch": 1.103022954383911, "grad_norm": 7.326338291168213, "learning_rate": 3.986582940760717e-05, "loss": 2.6139, "step": 1648 }, { "epoch": 1.1036919346071832, "grad_norm": 4.203922271728516, "learning_rate": 3.9850167624694994e-05, "loss": 2.7178, "step": 1649 }, { "epoch": 1.1043609148304554, "grad_norm": 13.013337135314941, "learning_rate": 3.983449683074868e-05, "loss": 2.858, "step": 1650 }, { "epoch": 1.1050298950537274, "grad_norm": 7.882613658905029, "learning_rate": 3.981881703527721e-05, "loss": 2.6311, "step": 1651 }, { "epoch": 1.1056988752769996, "grad_norm": 8.829507827758789, "learning_rate": 3.980312824779503e-05, "loss": 2.6887, "step": 1652 }, { "epoch": 1.1063678555002718, "grad_norm": 6.895455837249756, "learning_rate": 3.9787430477822046e-05, "loss": 2.5745, "step": 1653 }, { "epoch": 1.1070368357235438, "grad_norm": 5.998924732208252, "learning_rate": 3.9771723734883624e-05, "loss": 2.8309, "step": 1654 }, { "epoch": 1.107705815946816, "grad_norm": 8.006895065307617, "learning_rate": 3.975600802851056e-05, "loss": 2.7938, "step": 1655 }, { "epoch": 1.1083747961700883, "grad_norm": 4.172781944274902, "learning_rate": 3.9740283368239086e-05, "loss": 2.8041, "step": 1656 }, { "epoch": 1.1090437763933603, "grad_norm": 5.066199779510498, "learning_rate": 3.9724549763610884e-05, "loss": 2.5142, "step": 1657 }, { "epoch": 1.1097127566166325, "grad_norm": 4.01627779006958, "learning_rate": 3.9708807224173064e-05, "loss": 2.6579, "step": 1658 }, { "epoch": 1.1103817368399047, "grad_norm": 5.281778335571289, "learning_rate": 3.9693055759478124e-05, "loss": 2.8503, "step": 1659 }, { "epoch": 1.111050717063177, "grad_norm": 5.004472255706787, "learning_rate": 3.9677295379084026e-05, "loss": 2.9395, "step": 1660 }, { "epoch": 1.111719697286449, "grad_norm": 4.206826686859131, "learning_rate": 3.96615260925541e-05, "loss": 2.6226, "step": 1661 }, { "epoch": 1.1123886775097211, "grad_norm": 5.367136001586914, "learning_rate": 3.964574790945712e-05, "loss": 2.6628, "step": 1662 }, { "epoch": 1.1130576577329934, "grad_norm": 4.41979455947876, "learning_rate": 3.962996083936722e-05, "loss": 2.6717, "step": 1663 }, { "epoch": 1.1137266379562654, "grad_norm": 4.343268394470215, "learning_rate": 3.961416489186394e-05, "loss": 2.8632, "step": 1664 }, { "epoch": 1.1143956181795376, "grad_norm": 6.543291091918945, "learning_rate": 3.9598360076532226e-05, "loss": 2.8875, "step": 1665 }, { "epoch": 1.1150645984028098, "grad_norm": 4.79495096206665, "learning_rate": 3.958254640296238e-05, "loss": 2.7381, "step": 1666 }, { "epoch": 1.1157335786260818, "grad_norm": 3.8318138122558594, "learning_rate": 3.956672388075009e-05, "loss": 2.6061, "step": 1667 }, { "epoch": 1.116402558849354, "grad_norm": 5.714412689208984, "learning_rate": 3.955089251949641e-05, "loss": 2.6708, "step": 1668 }, { "epoch": 1.1170715390726262, "grad_norm": 5.564817428588867, "learning_rate": 3.9535052328807764e-05, "loss": 2.5249, "step": 1669 }, { "epoch": 1.1177405192958982, "grad_norm": 5.668190002441406, "learning_rate": 3.951920331829593e-05, "loss": 2.7984, "step": 1670 }, { "epoch": 1.1184094995191705, "grad_norm": 3.878696918487549, "learning_rate": 3.950334549757803e-05, "loss": 2.6898, "step": 1671 }, { "epoch": 1.1190784797424427, "grad_norm": 4.1487274169921875, "learning_rate": 3.948747887627653e-05, "loss": 2.5919, "step": 1672 }, { "epoch": 1.1197474599657147, "grad_norm": 8.2774019241333, "learning_rate": 3.947160346401927e-05, "loss": 3.047, "step": 1673 }, { "epoch": 1.120416440188987, "grad_norm": 4.1392974853515625, "learning_rate": 3.9455719270439386e-05, "loss": 2.4994, "step": 1674 }, { "epoch": 1.1210854204122591, "grad_norm": 4.564013481140137, "learning_rate": 3.9439826305175345e-05, "loss": 2.7787, "step": 1675 }, { "epoch": 1.1217544006355311, "grad_norm": 4.009519100189209, "learning_rate": 3.942392457787096e-05, "loss": 2.4334, "step": 1676 }, { "epoch": 1.1224233808588033, "grad_norm": 3.0793025493621826, "learning_rate": 3.940801409817536e-05, "loss": 2.5784, "step": 1677 }, { "epoch": 1.1230923610820756, "grad_norm": 4.557519912719727, "learning_rate": 3.939209487574295e-05, "loss": 2.8636, "step": 1678 }, { "epoch": 1.1237613413053475, "grad_norm": 4.885311603546143, "learning_rate": 3.937616692023347e-05, "loss": 2.4412, "step": 1679 }, { "epoch": 1.1244303215286198, "grad_norm": 4.2733354568481445, "learning_rate": 3.936023024131196e-05, "loss": 2.738, "step": 1680 }, { "epoch": 1.125099301751892, "grad_norm": 5.4617180824279785, "learning_rate": 3.934428484864874e-05, "loss": 2.9871, "step": 1681 }, { "epoch": 1.125768281975164, "grad_norm": 4.2067365646362305, "learning_rate": 3.9328330751919424e-05, "loss": 2.4473, "step": 1682 }, { "epoch": 1.1264372621984362, "grad_norm": 5.401765823364258, "learning_rate": 3.9312367960804905e-05, "loss": 2.7275, "step": 1683 }, { "epoch": 1.1271062424217084, "grad_norm": 4.645919322967529, "learning_rate": 3.929639648499136e-05, "loss": 2.7506, "step": 1684 }, { "epoch": 1.1277752226449806, "grad_norm": 6.534749984741211, "learning_rate": 3.928041633417022e-05, "loss": 2.3065, "step": 1685 }, { "epoch": 1.1284442028682526, "grad_norm": 4.626363754272461, "learning_rate": 3.926442751803819e-05, "loss": 2.7914, "step": 1686 }, { "epoch": 1.1291131830915249, "grad_norm": 4.61073637008667, "learning_rate": 3.9248430046297246e-05, "loss": 2.7046, "step": 1687 }, { "epoch": 1.129782163314797, "grad_norm": 3.616173505783081, "learning_rate": 3.923242392865459e-05, "loss": 2.6379, "step": 1688 }, { "epoch": 1.130451143538069, "grad_norm": 5.434972286224365, "learning_rate": 3.9216409174822685e-05, "loss": 2.8262, "step": 1689 }, { "epoch": 1.1311201237613413, "grad_norm": 5.812896251678467, "learning_rate": 3.920038579451923e-05, "loss": 2.8654, "step": 1690 }, { "epoch": 1.1317891039846135, "grad_norm": 4.690445423126221, "learning_rate": 3.918435379746716e-05, "loss": 2.8625, "step": 1691 }, { "epoch": 1.1324580842078855, "grad_norm": 5.103313446044922, "learning_rate": 3.9168313193394655e-05, "loss": 2.8396, "step": 1692 }, { "epoch": 1.1331270644311577, "grad_norm": 5.238773822784424, "learning_rate": 3.915226399203509e-05, "loss": 2.903, "step": 1693 }, { "epoch": 1.13379604465443, "grad_norm": 7.802103519439697, "learning_rate": 3.913620620312706e-05, "loss": 3.1013, "step": 1694 }, { "epoch": 1.1344650248777022, "grad_norm": 5.691326141357422, "learning_rate": 3.912013983641439e-05, "loss": 2.6053, "step": 1695 }, { "epoch": 1.1351340051009742, "grad_norm": 5.531711101531982, "learning_rate": 3.910406490164611e-05, "loss": 2.7102, "step": 1696 }, { "epoch": 1.1358029853242464, "grad_norm": 5.916006088256836, "learning_rate": 3.908798140857642e-05, "loss": 2.8587, "step": 1697 }, { "epoch": 1.1364719655475186, "grad_norm": 4.010486602783203, "learning_rate": 3.907188936696475e-05, "loss": 2.5842, "step": 1698 }, { "epoch": 1.1371409457707906, "grad_norm": 4.513060569763184, "learning_rate": 3.905578878657567e-05, "loss": 2.7569, "step": 1699 }, { "epoch": 1.1378099259940628, "grad_norm": 6.514904499053955, "learning_rate": 3.9039679677179e-05, "loss": 2.6758, "step": 1700 }, { "epoch": 1.138478906217335, "grad_norm": 5.1382246017456055, "learning_rate": 3.902356204854967e-05, "loss": 2.8837, "step": 1701 }, { "epoch": 1.139147886440607, "grad_norm": 6.538110256195068, "learning_rate": 3.900743591046782e-05, "loss": 2.6701, "step": 1702 }, { "epoch": 1.1398168666638793, "grad_norm": 5.245736122131348, "learning_rate": 3.8991301272718713e-05, "loss": 2.8015, "step": 1703 }, { "epoch": 1.1404858468871515, "grad_norm": 6.4442362785339355, "learning_rate": 3.8975158145092825e-05, "loss": 3.0992, "step": 1704 }, { "epoch": 1.1411548271104235, "grad_norm": 4.114341735839844, "learning_rate": 3.8959006537385736e-05, "loss": 2.6867, "step": 1705 }, { "epoch": 1.1418238073336957, "grad_norm": 5.672820568084717, "learning_rate": 3.8942846459398194e-05, "loss": 2.6206, "step": 1706 }, { "epoch": 1.142492787556968, "grad_norm": 4.0786356925964355, "learning_rate": 3.8926677920936096e-05, "loss": 3.0933, "step": 1707 }, { "epoch": 1.14316176778024, "grad_norm": 5.188081741333008, "learning_rate": 3.891050093181044e-05, "loss": 2.7933, "step": 1708 }, { "epoch": 1.1438307480035121, "grad_norm": 5.515549659729004, "learning_rate": 3.889431550183738e-05, "loss": 2.8488, "step": 1709 }, { "epoch": 1.1444997282267844, "grad_norm": 5.796931743621826, "learning_rate": 3.8878121640838186e-05, "loss": 2.9218, "step": 1710 }, { "epoch": 1.1451687084500564, "grad_norm": 5.553003787994385, "learning_rate": 3.886191935863923e-05, "loss": 2.7191, "step": 1711 }, { "epoch": 1.1458376886733286, "grad_norm": 4.528774738311768, "learning_rate": 3.884570866507202e-05, "loss": 2.9298, "step": 1712 }, { "epoch": 1.1465066688966008, "grad_norm": 4.108309745788574, "learning_rate": 3.882948956997314e-05, "loss": 2.8881, "step": 1713 }, { "epoch": 1.1471756491198728, "grad_norm": 4.2996439933776855, "learning_rate": 3.8813262083184286e-05, "loss": 2.7954, "step": 1714 }, { "epoch": 1.147844629343145, "grad_norm": 6.285360813140869, "learning_rate": 3.879702621455226e-05, "loss": 2.7149, "step": 1715 }, { "epoch": 1.1485136095664172, "grad_norm": 6.409306526184082, "learning_rate": 3.878078197392891e-05, "loss": 2.8814, "step": 1716 }, { "epoch": 1.1491825897896892, "grad_norm": 4.87045955657959, "learning_rate": 3.876452937117122e-05, "loss": 2.7626, "step": 1717 }, { "epoch": 1.1498515700129615, "grad_norm": 4.315850734710693, "learning_rate": 3.8748268416141184e-05, "loss": 2.8149, "step": 1718 }, { "epoch": 1.1505205502362337, "grad_norm": 4.383823394775391, "learning_rate": 3.8731999118705926e-05, "loss": 2.732, "step": 1719 }, { "epoch": 1.1511895304595057, "grad_norm": 5.145540714263916, "learning_rate": 3.871572148873759e-05, "loss": 2.6938, "step": 1720 }, { "epoch": 1.151858510682778, "grad_norm": 3.9767284393310547, "learning_rate": 3.869943553611338e-05, "loss": 2.4974, "step": 1721 }, { "epoch": 1.1525274909060501, "grad_norm": 5.369380950927734, "learning_rate": 3.868314127071559e-05, "loss": 2.6343, "step": 1722 }, { "epoch": 1.1531964711293223, "grad_norm": 4.192481994628906, "learning_rate": 3.86668387024315e-05, "loss": 2.4316, "step": 1723 }, { "epoch": 1.1538654513525943, "grad_norm": 6.330613613128662, "learning_rate": 3.8650527841153454e-05, "loss": 2.7826, "step": 1724 }, { "epoch": 1.1545344315758665, "grad_norm": 5.088796615600586, "learning_rate": 3.8634208696778856e-05, "loss": 2.8024, "step": 1725 }, { "epoch": 1.1552034117991388, "grad_norm": 5.2484450340271, "learning_rate": 3.861788127921009e-05, "loss": 2.7792, "step": 1726 }, { "epoch": 1.1558723920224108, "grad_norm": 4.708180904388428, "learning_rate": 3.8601545598354593e-05, "loss": 2.9515, "step": 1727 }, { "epoch": 1.156541372245683, "grad_norm": 4.836954593658447, "learning_rate": 3.8585201664124795e-05, "loss": 3.0762, "step": 1728 }, { "epoch": 1.1572103524689552, "grad_norm": 5.8461503982543945, "learning_rate": 3.856884948643814e-05, "loss": 2.9477, "step": 1729 }, { "epoch": 1.1578793326922272, "grad_norm": 5.0680694580078125, "learning_rate": 3.8552489075217085e-05, "loss": 2.9862, "step": 1730 }, { "epoch": 1.1585483129154994, "grad_norm": 5.802671432495117, "learning_rate": 3.853612044038908e-05, "loss": 2.9056, "step": 1731 }, { "epoch": 1.1592172931387716, "grad_norm": 6.276515483856201, "learning_rate": 3.8519743591886546e-05, "loss": 2.9355, "step": 1732 }, { "epoch": 1.1598862733620436, "grad_norm": 4.246423244476318, "learning_rate": 3.850335853964692e-05, "loss": 2.65, "step": 1733 }, { "epoch": 1.1605552535853159, "grad_norm": 5.163718223571777, "learning_rate": 3.848696529361258e-05, "loss": 2.7763, "step": 1734 }, { "epoch": 1.161224233808588, "grad_norm": 6.367431163787842, "learning_rate": 3.847056386373089e-05, "loss": 2.9175, "step": 1735 }, { "epoch": 1.1618932140318603, "grad_norm": 3.0664758682250977, "learning_rate": 3.845415425995421e-05, "loss": 2.6235, "step": 1736 }, { "epoch": 1.1625621942551323, "grad_norm": 4.423579216003418, "learning_rate": 3.843773649223983e-05, "loss": 2.8541, "step": 1737 }, { "epoch": 1.1632311744784045, "grad_norm": 4.328232765197754, "learning_rate": 3.8421310570549994e-05, "loss": 2.8183, "step": 1738 }, { "epoch": 1.1639001547016767, "grad_norm": 6.539478302001953, "learning_rate": 3.840487650485189e-05, "loss": 2.9259, "step": 1739 }, { "epoch": 1.1645691349249487, "grad_norm": 5.035600185394287, "learning_rate": 3.838843430511766e-05, "loss": 2.638, "step": 1740 }, { "epoch": 1.165238115148221, "grad_norm": 4.728442192077637, "learning_rate": 3.8371983981324395e-05, "loss": 2.7532, "step": 1741 }, { "epoch": 1.1659070953714932, "grad_norm": 4.8734540939331055, "learning_rate": 3.835552554345407e-05, "loss": 2.6562, "step": 1742 }, { "epoch": 1.1665760755947652, "grad_norm": 4.309051513671875, "learning_rate": 3.833905900149364e-05, "loss": 2.823, "step": 1743 }, { "epoch": 1.1672450558180374, "grad_norm": 4.245873928070068, "learning_rate": 3.832258436543494e-05, "loss": 2.5424, "step": 1744 }, { "epoch": 1.1679140360413096, "grad_norm": 5.439658164978027, "learning_rate": 3.830610164527472e-05, "loss": 2.7097, "step": 1745 }, { "epoch": 1.1685830162645816, "grad_norm": 5.236258506774902, "learning_rate": 3.828961085101463e-05, "loss": 2.7249, "step": 1746 }, { "epoch": 1.1692519964878538, "grad_norm": 4.445048809051514, "learning_rate": 3.827311199266127e-05, "loss": 2.5616, "step": 1747 }, { "epoch": 1.169920976711126, "grad_norm": 5.976138114929199, "learning_rate": 3.8256605080226064e-05, "loss": 2.9866, "step": 1748 }, { "epoch": 1.170589956934398, "grad_norm": 6.202071666717529, "learning_rate": 3.8240090123725357e-05, "loss": 2.7178, "step": 1749 }, { "epoch": 1.1712589371576703, "grad_norm": 4.172088623046875, "learning_rate": 3.822356713318038e-05, "loss": 2.79, "step": 1750 }, { "epoch": 1.1719279173809425, "grad_norm": 4.560709476470947, "learning_rate": 3.820703611861722e-05, "loss": 2.6545, "step": 1751 }, { "epoch": 1.1725968976042145, "grad_norm": 5.1008172035217285, "learning_rate": 3.819049709006687e-05, "loss": 2.5919, "step": 1752 }, { "epoch": 1.1732658778274867, "grad_norm": 6.211871147155762, "learning_rate": 3.8173950057565125e-05, "loss": 2.9597, "step": 1753 }, { "epoch": 1.173934858050759, "grad_norm": 5.119531631469727, "learning_rate": 3.815739503115268e-05, "loss": 2.7113, "step": 1754 }, { "epoch": 1.174603838274031, "grad_norm": 5.102022647857666, "learning_rate": 3.8140832020875086e-05, "loss": 2.915, "step": 1755 }, { "epoch": 1.1752728184973031, "grad_norm": 4.11543083190918, "learning_rate": 3.8124261036782714e-05, "loss": 2.5921, "step": 1756 }, { "epoch": 1.1759417987205754, "grad_norm": 3.230837106704712, "learning_rate": 3.8107682088930794e-05, "loss": 2.5988, "step": 1757 }, { "epoch": 1.1766107789438474, "grad_norm": 3.4689149856567383, "learning_rate": 3.8091095187379366e-05, "loss": 2.5246, "step": 1758 }, { "epoch": 1.1772797591671196, "grad_norm": 4.02721643447876, "learning_rate": 3.807450034219332e-05, "loss": 2.8093, "step": 1759 }, { "epoch": 1.1779487393903918, "grad_norm": 5.369773864746094, "learning_rate": 3.805789756344234e-05, "loss": 2.6829, "step": 1760 }, { "epoch": 1.178617719613664, "grad_norm": 3.211688756942749, "learning_rate": 3.804128686120095e-05, "loss": 2.6756, "step": 1761 }, { "epoch": 1.179286699836936, "grad_norm": 4.2267165184021, "learning_rate": 3.802466824554847e-05, "loss": 2.702, "step": 1762 }, { "epoch": 1.1799556800602082, "grad_norm": 7.6689300537109375, "learning_rate": 3.8008041726569024e-05, "loss": 2.8305, "step": 1763 }, { "epoch": 1.1806246602834805, "grad_norm": 5.8855485916137695, "learning_rate": 3.799140731435152e-05, "loss": 2.805, "step": 1764 }, { "epoch": 1.1812936405067525, "grad_norm": 3.6822450160980225, "learning_rate": 3.797476501898968e-05, "loss": 2.6626, "step": 1765 }, { "epoch": 1.1819626207300247, "grad_norm": 6.995255470275879, "learning_rate": 3.795811485058199e-05, "loss": 2.5857, "step": 1766 }, { "epoch": 1.182631600953297, "grad_norm": 6.011009216308594, "learning_rate": 3.7941456819231715e-05, "loss": 2.7998, "step": 1767 }, { "epoch": 1.1833005811765689, "grad_norm": 3.980313301086426, "learning_rate": 3.79247909350469e-05, "loss": 2.6141, "step": 1768 }, { "epoch": 1.183969561399841, "grad_norm": 4.1785759925842285, "learning_rate": 3.7908117208140346e-05, "loss": 2.5139, "step": 1769 }, { "epoch": 1.1846385416231133, "grad_norm": 5.0990376472473145, "learning_rate": 3.7891435648629625e-05, "loss": 2.8435, "step": 1770 }, { "epoch": 1.1853075218463853, "grad_norm": 5.530050754547119, "learning_rate": 3.787474626663705e-05, "loss": 2.8514, "step": 1771 }, { "epoch": 1.1859765020696575, "grad_norm": 4.31281042098999, "learning_rate": 3.785804907228968e-05, "loss": 2.8415, "step": 1772 }, { "epoch": 1.1866454822929298, "grad_norm": 5.490376949310303, "learning_rate": 3.784134407571932e-05, "loss": 2.9642, "step": 1773 }, { "epoch": 1.187314462516202, "grad_norm": 5.688234806060791, "learning_rate": 3.782463128706251e-05, "loss": 2.6184, "step": 1774 }, { "epoch": 1.187983442739474, "grad_norm": 5.56623649597168, "learning_rate": 3.780791071646052e-05, "loss": 2.6272, "step": 1775 }, { "epoch": 1.1886524229627462, "grad_norm": 4.018519401550293, "learning_rate": 3.7791182374059334e-05, "loss": 2.8058, "step": 1776 }, { "epoch": 1.1893214031860184, "grad_norm": 6.15987491607666, "learning_rate": 3.777444627000966e-05, "loss": 2.7379, "step": 1777 }, { "epoch": 1.1899903834092904, "grad_norm": 5.726695537567139, "learning_rate": 3.7757702414466914e-05, "loss": 2.7939, "step": 1778 }, { "epoch": 1.1906593636325626, "grad_norm": 6.305606365203857, "learning_rate": 3.77409508175912e-05, "loss": 2.8558, "step": 1779 }, { "epoch": 1.1913283438558349, "grad_norm": 5.644913196563721, "learning_rate": 3.772419148954735e-05, "loss": 2.996, "step": 1780 }, { "epoch": 1.1919973240791069, "grad_norm": 4.535463809967041, "learning_rate": 3.770742444050487e-05, "loss": 2.5719, "step": 1781 }, { "epoch": 1.192666304302379, "grad_norm": 5.501713752746582, "learning_rate": 3.7690649680637935e-05, "loss": 2.8254, "step": 1782 }, { "epoch": 1.1933352845256513, "grad_norm": 4.598433017730713, "learning_rate": 3.767386722012543e-05, "loss": 2.7285, "step": 1783 }, { "epoch": 1.1940042647489233, "grad_norm": 4.824279308319092, "learning_rate": 3.76570770691509e-05, "loss": 2.7039, "step": 1784 }, { "epoch": 1.1946732449721955, "grad_norm": 6.458413600921631, "learning_rate": 3.7640279237902554e-05, "loss": 2.8277, "step": 1785 }, { "epoch": 1.1953422251954677, "grad_norm": 3.8205926418304443, "learning_rate": 3.762347373657325e-05, "loss": 2.6138, "step": 1786 }, { "epoch": 1.1960112054187397, "grad_norm": 4.0686163902282715, "learning_rate": 3.760666057536052e-05, "loss": 2.7758, "step": 1787 }, { "epoch": 1.196680185642012, "grad_norm": 6.732685089111328, "learning_rate": 3.758983976446654e-05, "loss": 3.061, "step": 1788 }, { "epoch": 1.1973491658652842, "grad_norm": 4.614278793334961, "learning_rate": 3.757301131409812e-05, "loss": 2.6263, "step": 1789 }, { "epoch": 1.1980181460885562, "grad_norm": 5.702500343322754, "learning_rate": 3.7556175234466705e-05, "loss": 2.6778, "step": 1790 }, { "epoch": 1.1986871263118284, "grad_norm": 4.065542221069336, "learning_rate": 3.7539331535788387e-05, "loss": 2.792, "step": 1791 }, { "epoch": 1.1993561065351006, "grad_norm": 6.724399566650391, "learning_rate": 3.752248022828386e-05, "loss": 2.84, "step": 1792 }, { "epoch": 1.2000250867583726, "grad_norm": 6.001861095428467, "learning_rate": 3.750562132217844e-05, "loss": 2.7299, "step": 1793 }, { "epoch": 1.2006940669816448, "grad_norm": 4.842615604400635, "learning_rate": 3.748875482770207e-05, "loss": 2.6922, "step": 1794 }, { "epoch": 1.201363047204917, "grad_norm": 5.486823558807373, "learning_rate": 3.747188075508928e-05, "loss": 2.8164, "step": 1795 }, { "epoch": 1.202032027428189, "grad_norm": 5.144231796264648, "learning_rate": 3.745499911457919e-05, "loss": 2.7281, "step": 1796 }, { "epoch": 1.2027010076514613, "grad_norm": 5.316722393035889, "learning_rate": 3.743810991641553e-05, "loss": 2.7225, "step": 1797 }, { "epoch": 1.2033699878747335, "grad_norm": 4.000890254974365, "learning_rate": 3.742121317084662e-05, "loss": 2.6897, "step": 1798 }, { "epoch": 1.2040389680980055, "grad_norm": 5.847841739654541, "learning_rate": 3.740430888812536e-05, "loss": 2.8504, "step": 1799 }, { "epoch": 1.2047079483212777, "grad_norm": 4.641834259033203, "learning_rate": 3.738739707850919e-05, "loss": 2.8136, "step": 1800 }, { "epoch": 1.20537692854455, "grad_norm": 4.454596996307373, "learning_rate": 3.737047775226017e-05, "loss": 2.8346, "step": 1801 }, { "epoch": 1.2060459087678221, "grad_norm": 5.596829891204834, "learning_rate": 3.735355091964486e-05, "loss": 2.8441, "step": 1802 }, { "epoch": 1.2067148889910941, "grad_norm": 4.663771152496338, "learning_rate": 3.7336616590934434e-05, "loss": 2.7965, "step": 1803 }, { "epoch": 1.2073838692143664, "grad_norm": 4.829925060272217, "learning_rate": 3.731967477640457e-05, "loss": 2.8794, "step": 1804 }, { "epoch": 1.2080528494376386, "grad_norm": 7.195576190948486, "learning_rate": 3.7302725486335526e-05, "loss": 2.5335, "step": 1805 }, { "epoch": 1.2087218296609106, "grad_norm": 3.178122043609619, "learning_rate": 3.728576873101207e-05, "loss": 2.3999, "step": 1806 }, { "epoch": 1.2093908098841828, "grad_norm": 6.593000411987305, "learning_rate": 3.7268804520723495e-05, "loss": 2.7625, "step": 1807 }, { "epoch": 1.210059790107455, "grad_norm": 3.4164230823516846, "learning_rate": 3.725183286576363e-05, "loss": 2.6513, "step": 1808 }, { "epoch": 1.210728770330727, "grad_norm": 6.249453067779541, "learning_rate": 3.723485377643084e-05, "loss": 2.8028, "step": 1809 }, { "epoch": 1.2113977505539992, "grad_norm": 4.547549724578857, "learning_rate": 3.721786726302798e-05, "loss": 2.8385, "step": 1810 }, { "epoch": 1.2120667307772715, "grad_norm": 5.134420394897461, "learning_rate": 3.72008733358624e-05, "loss": 2.6617, "step": 1811 }, { "epoch": 1.2127357110005437, "grad_norm": 4.485007286071777, "learning_rate": 3.718387200524596e-05, "loss": 2.5839, "step": 1812 }, { "epoch": 1.2134046912238157, "grad_norm": 3.850969076156616, "learning_rate": 3.7166863281495005e-05, "loss": 2.6409, "step": 1813 }, { "epoch": 1.2140736714470879, "grad_norm": 4.061034202575684, "learning_rate": 3.71498471749304e-05, "loss": 2.8591, "step": 1814 }, { "epoch": 1.21474265167036, "grad_norm": 5.947023391723633, "learning_rate": 3.713282369587745e-05, "loss": 2.523, "step": 1815 }, { "epoch": 1.215411631893632, "grad_norm": 5.140153408050537, "learning_rate": 3.711579285466594e-05, "loss": 2.7576, "step": 1816 }, { "epoch": 1.2160806121169043, "grad_norm": 6.343984603881836, "learning_rate": 3.709875466163014e-05, "loss": 2.8076, "step": 1817 }, { "epoch": 1.2167495923401765, "grad_norm": 4.9777750968933105, "learning_rate": 3.708170912710877e-05, "loss": 2.6444, "step": 1818 }, { "epoch": 1.2174185725634485, "grad_norm": 4.851789474487305, "learning_rate": 3.7064656261445004e-05, "loss": 2.623, "step": 1819 }, { "epoch": 1.2180875527867208, "grad_norm": 6.258243560791016, "learning_rate": 3.704759607498646e-05, "loss": 2.8272, "step": 1820 }, { "epoch": 1.218756533009993, "grad_norm": 3.94958758354187, "learning_rate": 3.703052857808522e-05, "loss": 2.5172, "step": 1821 }, { "epoch": 1.219425513233265, "grad_norm": 4.468460559844971, "learning_rate": 3.7013453781097774e-05, "loss": 2.7856, "step": 1822 }, { "epoch": 1.2200944934565372, "grad_norm": 4.049428939819336, "learning_rate": 3.699637169438505e-05, "loss": 2.7046, "step": 1823 }, { "epoch": 1.2207634736798094, "grad_norm": 4.41973352432251, "learning_rate": 3.6979282328312414e-05, "loss": 2.6378, "step": 1824 }, { "epoch": 1.2214324539030814, "grad_norm": 5.145456790924072, "learning_rate": 3.6962185693249646e-05, "loss": 2.7381, "step": 1825 }, { "epoch": 1.2221014341263536, "grad_norm": 6.784951686859131, "learning_rate": 3.694508179957091e-05, "loss": 3.057, "step": 1826 }, { "epoch": 1.2227704143496259, "grad_norm": 3.9923250675201416, "learning_rate": 3.69279706576548e-05, "loss": 2.5722, "step": 1827 }, { "epoch": 1.2234393945728979, "grad_norm": 5.133220672607422, "learning_rate": 3.691085227788431e-05, "loss": 2.8907, "step": 1828 }, { "epoch": 1.22410837479617, "grad_norm": 5.521151542663574, "learning_rate": 3.689372667064681e-05, "loss": 2.7463, "step": 1829 }, { "epoch": 1.2247773550194423, "grad_norm": 6.073947429656982, "learning_rate": 3.687659384633407e-05, "loss": 2.5392, "step": 1830 }, { "epoch": 1.2254463352427143, "grad_norm": 4.901395320892334, "learning_rate": 3.685945381534222e-05, "loss": 2.7601, "step": 1831 }, { "epoch": 1.2261153154659865, "grad_norm": 4.377593517303467, "learning_rate": 3.6842306588071795e-05, "loss": 2.7779, "step": 1832 }, { "epoch": 1.2267842956892587, "grad_norm": 7.987823009490967, "learning_rate": 3.682515217492766e-05, "loss": 2.8408, "step": 1833 }, { "epoch": 1.2274532759125307, "grad_norm": 5.431763172149658, "learning_rate": 3.6807990586319076e-05, "loss": 2.6746, "step": 1834 }, { "epoch": 1.228122256135803, "grad_norm": 4.709609508514404, "learning_rate": 3.6790821832659616e-05, "loss": 2.8218, "step": 1835 }, { "epoch": 1.2287912363590752, "grad_norm": 6.702217102050781, "learning_rate": 3.677364592436725e-05, "loss": 2.9065, "step": 1836 }, { "epoch": 1.2294602165823472, "grad_norm": 5.945807456970215, "learning_rate": 3.675646287186425e-05, "loss": 2.6951, "step": 1837 }, { "epoch": 1.2301291968056194, "grad_norm": 3.4071404933929443, "learning_rate": 3.673927268557724e-05, "loss": 2.562, "step": 1838 }, { "epoch": 1.2307981770288916, "grad_norm": 4.416913032531738, "learning_rate": 3.6722075375937166e-05, "loss": 2.8139, "step": 1839 }, { "epoch": 1.2314671572521638, "grad_norm": 3.2345592975616455, "learning_rate": 3.670487095337931e-05, "loss": 2.3068, "step": 1840 }, { "epoch": 1.2321361374754358, "grad_norm": 4.063303470611572, "learning_rate": 3.668765942834324e-05, "loss": 2.7729, "step": 1841 }, { "epoch": 1.232805117698708, "grad_norm": 5.84446382522583, "learning_rate": 3.667044081127288e-05, "loss": 2.7892, "step": 1842 }, { "epoch": 1.2334740979219803, "grad_norm": 5.9814252853393555, "learning_rate": 3.665321511261642e-05, "loss": 2.6805, "step": 1843 }, { "epoch": 1.2341430781452523, "grad_norm": 4.754057884216309, "learning_rate": 3.663598234282636e-05, "loss": 2.8755, "step": 1844 }, { "epoch": 1.2348120583685245, "grad_norm": 4.782750129699707, "learning_rate": 3.6618742512359487e-05, "loss": 2.8039, "step": 1845 }, { "epoch": 1.2354810385917967, "grad_norm": 5.0642409324646, "learning_rate": 3.660149563167687e-05, "loss": 2.5444, "step": 1846 }, { "epoch": 1.2361500188150687, "grad_norm": 5.08937406539917, "learning_rate": 3.658424171124388e-05, "loss": 2.8231, "step": 1847 }, { "epoch": 1.236818999038341, "grad_norm": 4.314083576202393, "learning_rate": 3.656698076153013e-05, "loss": 2.4211, "step": 1848 }, { "epoch": 1.2374879792616131, "grad_norm": 5.273926734924316, "learning_rate": 3.65497127930095e-05, "loss": 2.7676, "step": 1849 }, { "epoch": 1.2381569594848851, "grad_norm": 6.733925819396973, "learning_rate": 3.6532437816160145e-05, "loss": 2.7285, "step": 1850 }, { "epoch": 1.2388259397081574, "grad_norm": 4.898383617401123, "learning_rate": 3.651515584146447e-05, "loss": 2.6919, "step": 1851 }, { "epoch": 1.2394949199314296, "grad_norm": 5.283299922943115, "learning_rate": 3.649786687940911e-05, "loss": 3.0118, "step": 1852 }, { "epoch": 1.2401639001547018, "grad_norm": 3.843108892440796, "learning_rate": 3.6480570940484956e-05, "loss": 2.5973, "step": 1853 }, { "epoch": 1.2408328803779738, "grad_norm": 4.891538143157959, "learning_rate": 3.646326803518715e-05, "loss": 2.78, "step": 1854 }, { "epoch": 1.241501860601246, "grad_norm": 6.736623287200928, "learning_rate": 3.644595817401501e-05, "loss": 2.905, "step": 1855 }, { "epoch": 1.2421708408245182, "grad_norm": 6.363578796386719, "learning_rate": 3.6428641367472116e-05, "loss": 2.9187, "step": 1856 }, { "epoch": 1.2428398210477902, "grad_norm": 3.4829938411712646, "learning_rate": 3.641131762606626e-05, "loss": 2.5329, "step": 1857 }, { "epoch": 1.2435088012710624, "grad_norm": 5.517954349517822, "learning_rate": 3.639398696030941e-05, "loss": 2.9501, "step": 1858 }, { "epoch": 1.2441777814943347, "grad_norm": 9.04926586151123, "learning_rate": 3.637664938071777e-05, "loss": 3.0881, "step": 1859 }, { "epoch": 1.2448467617176067, "grad_norm": 3.5141665935516357, "learning_rate": 3.635930489781173e-05, "loss": 2.5798, "step": 1860 }, { "epoch": 1.2455157419408789, "grad_norm": 4.416510105133057, "learning_rate": 3.6341953522115876e-05, "loss": 2.7676, "step": 1861 }, { "epoch": 1.246184722164151, "grad_norm": 6.0014238357543945, "learning_rate": 3.6324595264158955e-05, "loss": 2.7161, "step": 1862 }, { "epoch": 1.246853702387423, "grad_norm": 4.988345623016357, "learning_rate": 3.63072301344739e-05, "loss": 2.8456, "step": 1863 }, { "epoch": 1.2475226826106953, "grad_norm": 6.99977970123291, "learning_rate": 3.6289858143597826e-05, "loss": 2.8185, "step": 1864 }, { "epoch": 1.2481916628339675, "grad_norm": 3.997215509414673, "learning_rate": 3.6272479302072e-05, "loss": 2.5535, "step": 1865 }, { "epoch": 1.2488606430572395, "grad_norm": 5.643239974975586, "learning_rate": 3.6255093620441834e-05, "loss": 2.9158, "step": 1866 }, { "epoch": 1.2495296232805118, "grad_norm": 5.907159805297852, "learning_rate": 3.623770110925692e-05, "loss": 2.9065, "step": 1867 }, { "epoch": 1.250198603503784, "grad_norm": 3.4002277851104736, "learning_rate": 3.6220301779070966e-05, "loss": 2.548, "step": 1868 }, { "epoch": 1.250867583727056, "grad_norm": 4.918555736541748, "learning_rate": 3.620289564044183e-05, "loss": 2.7617, "step": 1869 }, { "epoch": 1.2515365639503282, "grad_norm": 4.701360702514648, "learning_rate": 3.618548270393152e-05, "loss": 2.7191, "step": 1870 }, { "epoch": 1.2522055441736004, "grad_norm": 5.712444305419922, "learning_rate": 3.6168062980106126e-05, "loss": 2.9235, "step": 1871 }, { "epoch": 1.2528745243968724, "grad_norm": 5.300381183624268, "learning_rate": 3.61506364795359e-05, "loss": 2.8037, "step": 1872 }, { "epoch": 1.2535435046201446, "grad_norm": 4.37861442565918, "learning_rate": 3.613320321279518e-05, "loss": 2.7889, "step": 1873 }, { "epoch": 1.2542124848434169, "grad_norm": 5.089453220367432, "learning_rate": 3.61157631904624e-05, "loss": 3.0934, "step": 1874 }, { "epoch": 1.2548814650666889, "grad_norm": 4.397032260894775, "learning_rate": 3.6098316423120133e-05, "loss": 2.7309, "step": 1875 }, { "epoch": 1.255550445289961, "grad_norm": 4.115896224975586, "learning_rate": 3.608086292135501e-05, "loss": 2.7652, "step": 1876 }, { "epoch": 1.2562194255132333, "grad_norm": 4.964057445526123, "learning_rate": 3.6063402695757765e-05, "loss": 2.8038, "step": 1877 }, { "epoch": 1.2568884057365053, "grad_norm": 4.864250659942627, "learning_rate": 3.60459357569232e-05, "loss": 2.685, "step": 1878 }, { "epoch": 1.2575573859597775, "grad_norm": 4.61652135848999, "learning_rate": 3.602846211545021e-05, "loss": 2.9624, "step": 1879 }, { "epoch": 1.2582263661830497, "grad_norm": 3.5036540031433105, "learning_rate": 3.601098178194173e-05, "loss": 2.8418, "step": 1880 }, { "epoch": 1.2588953464063217, "grad_norm": 5.311166286468506, "learning_rate": 3.599349476700478e-05, "loss": 2.6239, "step": 1881 }, { "epoch": 1.259564326629594, "grad_norm": 5.348335266113281, "learning_rate": 3.5976001081250414e-05, "loss": 2.7304, "step": 1882 }, { "epoch": 1.2602333068528662, "grad_norm": 6.328084945678711, "learning_rate": 3.595850073529377e-05, "loss": 2.6692, "step": 1883 }, { "epoch": 1.2609022870761384, "grad_norm": 5.2390007972717285, "learning_rate": 3.594099373975397e-05, "loss": 2.7312, "step": 1884 }, { "epoch": 1.2615712672994104, "grad_norm": 7.658189296722412, "learning_rate": 3.592348010525421e-05, "loss": 3.1415, "step": 1885 }, { "epoch": 1.2622402475226826, "grad_norm": 4.922848701477051, "learning_rate": 3.5905959842421726e-05, "loss": 2.7902, "step": 1886 }, { "epoch": 1.2629092277459548, "grad_norm": 4.480006217956543, "learning_rate": 3.588843296188775e-05, "loss": 2.6461, "step": 1887 }, { "epoch": 1.263578207969227, "grad_norm": 4.117129802703857, "learning_rate": 3.587089947428752e-05, "loss": 2.6328, "step": 1888 }, { "epoch": 1.264247188192499, "grad_norm": 5.596131801605225, "learning_rate": 3.585335939026032e-05, "loss": 2.6906, "step": 1889 }, { "epoch": 1.2649161684157713, "grad_norm": 7.91386604309082, "learning_rate": 3.583581272044941e-05, "loss": 2.7841, "step": 1890 }, { "epoch": 1.2655851486390435, "grad_norm": 5.925399303436279, "learning_rate": 3.581825947550205e-05, "loss": 2.723, "step": 1891 }, { "epoch": 1.2662541288623155, "grad_norm": 6.4818806648254395, "learning_rate": 3.580069966606949e-05, "loss": 2.8101, "step": 1892 }, { "epoch": 1.2669231090855877, "grad_norm": 4.033934593200684, "learning_rate": 3.578313330280698e-05, "loss": 2.5151, "step": 1893 }, { "epoch": 1.26759208930886, "grad_norm": 6.469705104827881, "learning_rate": 3.576556039637372e-05, "loss": 2.708, "step": 1894 }, { "epoch": 1.268261069532132, "grad_norm": 5.2010955810546875, "learning_rate": 3.57479809574329e-05, "loss": 2.589, "step": 1895 }, { "epoch": 1.2689300497554041, "grad_norm": 4.195599555969238, "learning_rate": 3.5730394996651664e-05, "loss": 2.5701, "step": 1896 }, { "epoch": 1.2695990299786764, "grad_norm": 3.8891968727111816, "learning_rate": 3.571280252470111e-05, "loss": 2.5916, "step": 1897 }, { "epoch": 1.2702680102019483, "grad_norm": 4.924985885620117, "learning_rate": 3.569520355225631e-05, "loss": 2.7964, "step": 1898 }, { "epoch": 1.2709369904252206, "grad_norm": 5.558375835418701, "learning_rate": 3.5677598089996254e-05, "loss": 2.8219, "step": 1899 }, { "epoch": 1.2716059706484928, "grad_norm": 4.105373382568359, "learning_rate": 3.565998614860388e-05, "loss": 2.8599, "step": 1900 }, { "epoch": 1.2722749508717648, "grad_norm": 5.856644153594971, "learning_rate": 3.564236773876606e-05, "loss": 2.814, "step": 1901 }, { "epoch": 1.272943931095037, "grad_norm": 3.8691678047180176, "learning_rate": 3.562474287117359e-05, "loss": 2.7608, "step": 1902 }, { "epoch": 1.2736129113183092, "grad_norm": 6.936550140380859, "learning_rate": 3.5607111556521175e-05, "loss": 3.1666, "step": 1903 }, { "epoch": 1.2742818915415812, "grad_norm": 4.588903903961182, "learning_rate": 3.558947380550744e-05, "loss": 2.5422, "step": 1904 }, { "epoch": 1.2749508717648534, "grad_norm": 5.447290897369385, "learning_rate": 3.557182962883494e-05, "loss": 2.8046, "step": 1905 }, { "epoch": 1.2756198519881257, "grad_norm": 5.050882339477539, "learning_rate": 3.555417903721008e-05, "loss": 2.8847, "step": 1906 }, { "epoch": 1.2762888322113977, "grad_norm": 4.748391151428223, "learning_rate": 3.5536522041343185e-05, "loss": 2.832, "step": 1907 }, { "epoch": 1.2769578124346699, "grad_norm": 5.2018890380859375, "learning_rate": 3.551885865194847e-05, "loss": 2.8506, "step": 1908 }, { "epoch": 1.277626792657942, "grad_norm": 5.160276412963867, "learning_rate": 3.550118887974402e-05, "loss": 2.6371, "step": 1909 }, { "epoch": 1.278295772881214, "grad_norm": 4.636349201202393, "learning_rate": 3.54835127354518e-05, "loss": 2.7996, "step": 1910 }, { "epoch": 1.2789647531044863, "grad_norm": 4.72723913192749, "learning_rate": 3.5465830229797623e-05, "loss": 2.6162, "step": 1911 }, { "epoch": 1.2796337333277585, "grad_norm": 4.232020854949951, "learning_rate": 3.54481413735112e-05, "loss": 2.7203, "step": 1912 }, { "epoch": 1.2803027135510305, "grad_norm": 4.942488193511963, "learning_rate": 3.543044617732606e-05, "loss": 2.8028, "step": 1913 }, { "epoch": 1.2809716937743028, "grad_norm": 6.861090660095215, "learning_rate": 3.541274465197959e-05, "loss": 2.8319, "step": 1914 }, { "epoch": 1.281640673997575, "grad_norm": 9.18194580078125, "learning_rate": 3.539503680821302e-05, "loss": 3.4554, "step": 1915 }, { "epoch": 1.282309654220847, "grad_norm": 5.122642517089844, "learning_rate": 3.537732265677142e-05, "loss": 2.5247, "step": 1916 }, { "epoch": 1.2829786344441192, "grad_norm": 5.579825401306152, "learning_rate": 3.5359602208403666e-05, "loss": 2.7803, "step": 1917 }, { "epoch": 1.2836476146673914, "grad_norm": 5.433165550231934, "learning_rate": 3.5341875473862485e-05, "loss": 2.8789, "step": 1918 }, { "epoch": 1.2843165948906634, "grad_norm": 5.925359725952148, "learning_rate": 3.5324142463904385e-05, "loss": 3.0326, "step": 1919 }, { "epoch": 1.2849855751139356, "grad_norm": 5.1778340339660645, "learning_rate": 3.5306403189289725e-05, "loss": 2.5914, "step": 1920 }, { "epoch": 1.2856545553372078, "grad_norm": 5.54917049407959, "learning_rate": 3.5288657660782615e-05, "loss": 2.7516, "step": 1921 }, { "epoch": 1.28632353556048, "grad_norm": 5.052066802978516, "learning_rate": 3.5270905889151e-05, "loss": 2.8725, "step": 1922 }, { "epoch": 1.286992515783752, "grad_norm": 3.5192253589630127, "learning_rate": 3.525314788516659e-05, "loss": 2.695, "step": 1923 }, { "epoch": 1.2876614960070243, "grad_norm": 4.60258674621582, "learning_rate": 3.523538365960489e-05, "loss": 2.8105, "step": 1924 }, { "epoch": 1.2883304762302965, "grad_norm": 5.980016708374023, "learning_rate": 3.5217613223245164e-05, "loss": 2.8577, "step": 1925 }, { "epoch": 1.2889994564535687, "grad_norm": 4.692354679107666, "learning_rate": 3.519983658687047e-05, "loss": 3.007, "step": 1926 }, { "epoch": 1.2896684366768407, "grad_norm": 6.644775867462158, "learning_rate": 3.518205376126762e-05, "loss": 2.5909, "step": 1927 }, { "epoch": 1.290337416900113, "grad_norm": 4.538536071777344, "learning_rate": 3.516426475722715e-05, "loss": 2.6458, "step": 1928 }, { "epoch": 1.2910063971233852, "grad_norm": 4.779280662536621, "learning_rate": 3.514646958554339e-05, "loss": 2.7819, "step": 1929 }, { "epoch": 1.2916753773466572, "grad_norm": 2.987003803253174, "learning_rate": 3.512866825701439e-05, "loss": 2.5834, "step": 1930 }, { "epoch": 1.2923443575699294, "grad_norm": 4.973797798156738, "learning_rate": 3.511086078244194e-05, "loss": 2.9792, "step": 1931 }, { "epoch": 1.2930133377932016, "grad_norm": 4.948237895965576, "learning_rate": 3.5093047172631555e-05, "loss": 3.0585, "step": 1932 }, { "epoch": 1.2936823180164736, "grad_norm": 4.105443954467773, "learning_rate": 3.507522743839247e-05, "loss": 2.7073, "step": 1933 }, { "epoch": 1.2943512982397458, "grad_norm": 5.659438610076904, "learning_rate": 3.505740159053766e-05, "loss": 2.785, "step": 1934 }, { "epoch": 1.295020278463018, "grad_norm": 5.185451984405518, "learning_rate": 3.5039569639883773e-05, "loss": 2.9656, "step": 1935 }, { "epoch": 1.29568925868629, "grad_norm": 5.640125751495361, "learning_rate": 3.502173159725119e-05, "loss": 2.9592, "step": 1936 }, { "epoch": 1.2963582389095623, "grad_norm": 3.587552785873413, "learning_rate": 3.5003887473463984e-05, "loss": 2.4749, "step": 1937 }, { "epoch": 1.2970272191328345, "grad_norm": 4.075209617614746, "learning_rate": 3.498603727934991e-05, "loss": 2.7787, "step": 1938 }, { "epoch": 1.2976961993561065, "grad_norm": 5.543696403503418, "learning_rate": 3.496818102574039e-05, "loss": 2.8827, "step": 1939 }, { "epoch": 1.2983651795793787, "grad_norm": 9.334135055541992, "learning_rate": 3.4950318723470565e-05, "loss": 2.8724, "step": 1940 }, { "epoch": 1.299034159802651, "grad_norm": 5.707855701446533, "learning_rate": 3.493245038337921e-05, "loss": 2.9266, "step": 1941 }, { "epoch": 1.299703140025923, "grad_norm": 7.300833702087402, "learning_rate": 3.491457601630878e-05, "loss": 3.1611, "step": 1942 }, { "epoch": 1.3003721202491951, "grad_norm": 5.916547775268555, "learning_rate": 3.489669563310538e-05, "loss": 2.5979, "step": 1943 }, { "epoch": 1.3010411004724673, "grad_norm": 5.269513130187988, "learning_rate": 3.487880924461878e-05, "loss": 2.723, "step": 1944 }, { "epoch": 1.3017100806957393, "grad_norm": 5.020068645477295, "learning_rate": 3.486091686170237e-05, "loss": 2.7228, "step": 1945 }, { "epoch": 1.3023790609190116, "grad_norm": 3.875847339630127, "learning_rate": 3.48430184952132e-05, "loss": 2.6033, "step": 1946 }, { "epoch": 1.3030480411422838, "grad_norm": 4.772490501403809, "learning_rate": 3.4825114156011934e-05, "loss": 2.6238, "step": 1947 }, { "epoch": 1.3037170213655558, "grad_norm": 4.842042922973633, "learning_rate": 3.480720385496287e-05, "loss": 2.5166, "step": 1948 }, { "epoch": 1.304386001588828, "grad_norm": 4.124337196350098, "learning_rate": 3.4789287602933936e-05, "loss": 2.757, "step": 1949 }, { "epoch": 1.3050549818121002, "grad_norm": 7.956783294677734, "learning_rate": 3.477136541079663e-05, "loss": 2.9116, "step": 1950 }, { "epoch": 1.3057239620353722, "grad_norm": 8.2808837890625, "learning_rate": 3.47534372894261e-05, "loss": 3.0217, "step": 1951 }, { "epoch": 1.3063929422586444, "grad_norm": 4.387228965759277, "learning_rate": 3.4735503249701065e-05, "loss": 2.8402, "step": 1952 }, { "epoch": 1.3070619224819167, "grad_norm": 5.978759765625, "learning_rate": 3.4717563302503844e-05, "loss": 2.9344, "step": 1953 }, { "epoch": 1.3077309027051887, "grad_norm": 6.487181186676025, "learning_rate": 3.469961745872034e-05, "loss": 2.7791, "step": 1954 }, { "epoch": 1.3083998829284609, "grad_norm": 3.971789836883545, "learning_rate": 3.4681665729240034e-05, "loss": 2.5318, "step": 1955 }, { "epoch": 1.309068863151733, "grad_norm": 6.075745105743408, "learning_rate": 3.466370812495598e-05, "loss": 2.8359, "step": 1956 }, { "epoch": 1.309737843375005, "grad_norm": 5.87401819229126, "learning_rate": 3.464574465676479e-05, "loss": 2.86, "step": 1957 }, { "epoch": 1.3104068235982773, "grad_norm": 9.17673110961914, "learning_rate": 3.4627775335566636e-05, "loss": 2.9832, "step": 1958 }, { "epoch": 1.3110758038215495, "grad_norm": 7.885680198669434, "learning_rate": 3.460980017226525e-05, "loss": 3.0542, "step": 1959 }, { "epoch": 1.3117447840448218, "grad_norm": 4.909152507781982, "learning_rate": 3.459181917776792e-05, "loss": 2.7076, "step": 1960 }, { "epoch": 1.3124137642680938, "grad_norm": 7.326097011566162, "learning_rate": 3.4573832362985424e-05, "loss": 2.7299, "step": 1961 }, { "epoch": 1.313082744491366, "grad_norm": 5.211129188537598, "learning_rate": 3.455583973883212e-05, "loss": 2.5845, "step": 1962 }, { "epoch": 1.3137517247146382, "grad_norm": 7.045957088470459, "learning_rate": 3.4537841316225885e-05, "loss": 2.6112, "step": 1963 }, { "epoch": 1.3144207049379104, "grad_norm": 6.861929416656494, "learning_rate": 3.4519837106088074e-05, "loss": 2.642, "step": 1964 }, { "epoch": 1.3150896851611824, "grad_norm": 4.8646321296691895, "learning_rate": 3.450182711934361e-05, "loss": 2.6681, "step": 1965 }, { "epoch": 1.3157586653844546, "grad_norm": 4.6669721603393555, "learning_rate": 3.448381136692089e-05, "loss": 2.7032, "step": 1966 }, { "epoch": 1.3164276456077268, "grad_norm": 6.643599510192871, "learning_rate": 3.446578985975182e-05, "loss": 2.6507, "step": 1967 }, { "epoch": 1.3170966258309988, "grad_norm": 4.169411659240723, "learning_rate": 3.444776260877177e-05, "loss": 2.8854, "step": 1968 }, { "epoch": 1.317765606054271, "grad_norm": 6.282775402069092, "learning_rate": 3.4429729624919644e-05, "loss": 2.892, "step": 1969 }, { "epoch": 1.3184345862775433, "grad_norm": 5.78385066986084, "learning_rate": 3.4411690919137786e-05, "loss": 2.689, "step": 1970 }, { "epoch": 1.3191035665008153, "grad_norm": 5.781465530395508, "learning_rate": 3.439364650237203e-05, "loss": 2.8935, "step": 1971 }, { "epoch": 1.3197725467240875, "grad_norm": 12.037712097167969, "learning_rate": 3.437559638557166e-05, "loss": 2.8092, "step": 1972 }, { "epoch": 1.3204415269473597, "grad_norm": 5.438355445861816, "learning_rate": 3.435754057968945e-05, "loss": 2.7404, "step": 1973 }, { "epoch": 1.3211105071706317, "grad_norm": 4.841614246368408, "learning_rate": 3.433947909568158e-05, "loss": 2.4909, "step": 1974 }, { "epoch": 1.321779487393904, "grad_norm": 6.428897857666016, "learning_rate": 3.432141194450772e-05, "loss": 2.564, "step": 1975 }, { "epoch": 1.3224484676171762, "grad_norm": 7.659292221069336, "learning_rate": 3.430333913713095e-05, "loss": 3.0917, "step": 1976 }, { "epoch": 1.3231174478404482, "grad_norm": 3.677401065826416, "learning_rate": 3.428526068451778e-05, "loss": 2.3395, "step": 1977 }, { "epoch": 1.3237864280637204, "grad_norm": 4.640537261962891, "learning_rate": 3.4267176597638194e-05, "loss": 2.8815, "step": 1978 }, { "epoch": 1.3244554082869926, "grad_norm": 5.220210075378418, "learning_rate": 3.424908688746552e-05, "loss": 2.84, "step": 1979 }, { "epoch": 1.3251243885102646, "grad_norm": 5.86066198348999, "learning_rate": 3.423099156497655e-05, "loss": 2.781, "step": 1980 }, { "epoch": 1.3257933687335368, "grad_norm": 4.961976528167725, "learning_rate": 3.421289064115147e-05, "loss": 2.841, "step": 1981 }, { "epoch": 1.326462348956809, "grad_norm": 5.830520153045654, "learning_rate": 3.419478412697388e-05, "loss": 3.0078, "step": 1982 }, { "epoch": 1.327131329180081, "grad_norm": 5.7861762046813965, "learning_rate": 3.4176672033430714e-05, "loss": 2.804, "step": 1983 }, { "epoch": 1.3278003094033533, "grad_norm": 4.089806079864502, "learning_rate": 3.415855437151237e-05, "loss": 2.4129, "step": 1984 }, { "epoch": 1.3284692896266255, "grad_norm": 2.5153582096099854, "learning_rate": 3.414043115221256e-05, "loss": 2.461, "step": 1985 }, { "epoch": 1.3291382698498975, "grad_norm": 6.85758638381958, "learning_rate": 3.4122302386528404e-05, "loss": 2.7807, "step": 1986 }, { "epoch": 1.3298072500731697, "grad_norm": 5.164159774780273, "learning_rate": 3.410416808546039e-05, "loss": 2.868, "step": 1987 }, { "epoch": 1.330476230296442, "grad_norm": 7.103829860687256, "learning_rate": 3.4086028260012344e-05, "loss": 2.7594, "step": 1988 }, { "epoch": 1.331145210519714, "grad_norm": 3.77983021736145, "learning_rate": 3.406788292119146e-05, "loss": 2.5318, "step": 1989 }, { "epoch": 1.3318141907429861, "grad_norm": 4.805266380310059, "learning_rate": 3.404973208000826e-05, "loss": 2.7652, "step": 1990 }, { "epoch": 1.3324831709662583, "grad_norm": 4.950036525726318, "learning_rate": 3.4031575747476624e-05, "loss": 2.427, "step": 1991 }, { "epoch": 1.3331521511895303, "grad_norm": 6.413376331329346, "learning_rate": 3.401341393461376e-05, "loss": 3.0278, "step": 1992 }, { "epoch": 1.3338211314128026, "grad_norm": 3.3853354454040527, "learning_rate": 3.3995246652440194e-05, "loss": 2.5914, "step": 1993 }, { "epoch": 1.3344901116360748, "grad_norm": 6.072537422180176, "learning_rate": 3.397707391197977e-05, "loss": 2.8304, "step": 1994 }, { "epoch": 1.3351590918593468, "grad_norm": 7.784000396728516, "learning_rate": 3.395889572425965e-05, "loss": 2.7672, "step": 1995 }, { "epoch": 1.335828072082619, "grad_norm": 4.891425609588623, "learning_rate": 3.3940712100310315e-05, "loss": 2.6594, "step": 1996 }, { "epoch": 1.3364970523058912, "grad_norm": 4.242300510406494, "learning_rate": 3.3922523051165515e-05, "loss": 2.6352, "step": 1997 }, { "epoch": 1.3371660325291634, "grad_norm": 6.101925849914551, "learning_rate": 3.39043285878623e-05, "loss": 3.0095, "step": 1998 }, { "epoch": 1.3378350127524354, "grad_norm": 4.063016891479492, "learning_rate": 3.388612872144104e-05, "loss": 2.6412, "step": 1999 }, { "epoch": 1.3385039929757077, "grad_norm": 5.656928062438965, "learning_rate": 3.386792346294532e-05, "loss": 2.8988, "step": 2000 }, { "epoch": 1.3391729731989799, "grad_norm": 4.359591007232666, "learning_rate": 3.384971282342206e-05, "loss": 2.7349, "step": 2001 }, { "epoch": 1.3398419534222519, "grad_norm": 4.76541805267334, "learning_rate": 3.38314968139214e-05, "loss": 2.8656, "step": 2002 }, { "epoch": 1.340510933645524, "grad_norm": 6.233043193817139, "learning_rate": 3.3813275445496764e-05, "loss": 2.7234, "step": 2003 }, { "epoch": 1.3411799138687963, "grad_norm": 4.522028923034668, "learning_rate": 3.379504872920483e-05, "loss": 2.6152, "step": 2004 }, { "epoch": 1.3418488940920685, "grad_norm": 5.168178081512451, "learning_rate": 3.3776816676105495e-05, "loss": 2.8025, "step": 2005 }, { "epoch": 1.3425178743153405, "grad_norm": 4.653557300567627, "learning_rate": 3.375857929726191e-05, "loss": 2.8364, "step": 2006 }, { "epoch": 1.3431868545386128, "grad_norm": 5.8500261306762695, "learning_rate": 3.374033660374047e-05, "loss": 2.8903, "step": 2007 }, { "epoch": 1.343855834761885, "grad_norm": 5.529808521270752, "learning_rate": 3.3722088606610784e-05, "loss": 2.8225, "step": 2008 }, { "epoch": 1.344524814985157, "grad_norm": 4.344649314880371, "learning_rate": 3.3703835316945665e-05, "loss": 2.6386, "step": 2009 }, { "epoch": 1.3451937952084292, "grad_norm": 5.259212970733643, "learning_rate": 3.368557674582116e-05, "loss": 2.8751, "step": 2010 }, { "epoch": 1.3458627754317014, "grad_norm": 5.934889793395996, "learning_rate": 3.3667312904316506e-05, "loss": 2.7374, "step": 2011 }, { "epoch": 1.3465317556549734, "grad_norm": 5.568259239196777, "learning_rate": 3.364904380351415e-05, "loss": 2.824, "step": 2012 }, { "epoch": 1.3472007358782456, "grad_norm": 7.427395820617676, "learning_rate": 3.363076945449971e-05, "loss": 2.8405, "step": 2013 }, { "epoch": 1.3478697161015178, "grad_norm": 6.825466632843018, "learning_rate": 3.3612489868362017e-05, "loss": 2.716, "step": 2014 }, { "epoch": 1.3485386963247898, "grad_norm": 5.038285732269287, "learning_rate": 3.3594205056193065e-05, "loss": 2.5977, "step": 2015 }, { "epoch": 1.349207676548062, "grad_norm": 5.154592037200928, "learning_rate": 3.357591502908802e-05, "loss": 2.791, "step": 2016 }, { "epoch": 1.3498766567713343, "grad_norm": 4.823801517486572, "learning_rate": 3.35576197981452e-05, "loss": 2.9221, "step": 2017 }, { "epoch": 1.3505456369946063, "grad_norm": 5.911928653717041, "learning_rate": 3.35393193744661e-05, "loss": 2.8327, "step": 2018 }, { "epoch": 1.3512146172178785, "grad_norm": 4.692645072937012, "learning_rate": 3.352101376915536e-05, "loss": 2.6108, "step": 2019 }, { "epoch": 1.3518835974411507, "grad_norm": 4.444061756134033, "learning_rate": 3.3502702993320754e-05, "loss": 2.6292, "step": 2020 }, { "epoch": 1.3525525776644227, "grad_norm": 6.115309715270996, "learning_rate": 3.348438705807322e-05, "loss": 2.9655, "step": 2021 }, { "epoch": 1.353221557887695, "grad_norm": 5.134885311126709, "learning_rate": 3.3466065974526794e-05, "loss": 2.711, "step": 2022 }, { "epoch": 1.3538905381109672, "grad_norm": 5.172490119934082, "learning_rate": 3.344773975379865e-05, "loss": 2.5484, "step": 2023 }, { "epoch": 1.3545595183342392, "grad_norm": 3.98433256149292, "learning_rate": 3.3429408407009086e-05, "loss": 2.672, "step": 2024 }, { "epoch": 1.3552284985575114, "grad_norm": 5.418702125549316, "learning_rate": 3.3411071945281515e-05, "loss": 2.7459, "step": 2025 }, { "epoch": 1.3558974787807836, "grad_norm": 6.298463344573975, "learning_rate": 3.339273037974241e-05, "loss": 2.7903, "step": 2026 }, { "epoch": 1.3565664590040556, "grad_norm": 6.452141761779785, "learning_rate": 3.337438372152141e-05, "loss": 2.7773, "step": 2027 }, { "epoch": 1.3572354392273278, "grad_norm": 3.5536160469055176, "learning_rate": 3.335603198175119e-05, "loss": 2.4353, "step": 2028 }, { "epoch": 1.3579044194506, "grad_norm": 6.420649528503418, "learning_rate": 3.333767517156754e-05, "loss": 2.7223, "step": 2029 }, { "epoch": 1.358573399673872, "grad_norm": 6.5228095054626465, "learning_rate": 3.33193133021093e-05, "loss": 3.1202, "step": 2030 }, { "epoch": 1.3592423798971442, "grad_norm": 5.087987422943115, "learning_rate": 3.330094638451839e-05, "loss": 2.9592, "step": 2031 }, { "epoch": 1.3599113601204165, "grad_norm": 5.699573040008545, "learning_rate": 3.32825744299398e-05, "loss": 2.7894, "step": 2032 }, { "epoch": 1.3605803403436885, "grad_norm": 3.749000310897827, "learning_rate": 3.32641974495216e-05, "loss": 2.6215, "step": 2033 }, { "epoch": 1.3612493205669607, "grad_norm": 4.384515285491943, "learning_rate": 3.324581545441485e-05, "loss": 2.53, "step": 2034 }, { "epoch": 1.361918300790233, "grad_norm": 4.807569980621338, "learning_rate": 3.3227428455773694e-05, "loss": 2.8052, "step": 2035 }, { "epoch": 1.362587281013505, "grad_norm": 5.514527797698975, "learning_rate": 3.320903646475531e-05, "loss": 2.7406, "step": 2036 }, { "epoch": 1.3632562612367771, "grad_norm": 3.7961368560791016, "learning_rate": 3.319063949251989e-05, "loss": 2.6709, "step": 2037 }, { "epoch": 1.3639252414600493, "grad_norm": 4.431958198547363, "learning_rate": 3.3172237550230666e-05, "loss": 2.756, "step": 2038 }, { "epoch": 1.3645942216833216, "grad_norm": 5.05426025390625, "learning_rate": 3.315383064905388e-05, "loss": 2.9408, "step": 2039 }, { "epoch": 1.3652632019065936, "grad_norm": 6.362969398498535, "learning_rate": 3.313541880015877e-05, "loss": 2.4547, "step": 2040 }, { "epoch": 1.3659321821298658, "grad_norm": 6.6766252517700195, "learning_rate": 3.3117002014717604e-05, "loss": 2.9067, "step": 2041 }, { "epoch": 1.366601162353138, "grad_norm": 4.779316425323486, "learning_rate": 3.30985803039056e-05, "loss": 2.83, "step": 2042 }, { "epoch": 1.3672701425764102, "grad_norm": 4.906229019165039, "learning_rate": 3.308015367890102e-05, "loss": 2.6688, "step": 2043 }, { "epoch": 1.3679391227996822, "grad_norm": 6.010954856872559, "learning_rate": 3.306172215088508e-05, "loss": 2.6788, "step": 2044 }, { "epoch": 1.3686081030229544, "grad_norm": 6.692445278167725, "learning_rate": 3.304328573104195e-05, "loss": 2.8227, "step": 2045 }, { "epoch": 1.3692770832462267, "grad_norm": 5.441335678100586, "learning_rate": 3.302484443055881e-05, "loss": 2.6127, "step": 2046 }, { "epoch": 1.3699460634694987, "grad_norm": 7.512343883514404, "learning_rate": 3.3006398260625774e-05, "loss": 2.8606, "step": 2047 }, { "epoch": 1.3706150436927709, "grad_norm": 5.280481815338135, "learning_rate": 3.298794723243592e-05, "loss": 2.6278, "step": 2048 }, { "epoch": 1.371284023916043, "grad_norm": 5.495764255523682, "learning_rate": 3.2969491357185275e-05, "loss": 2.771, "step": 2049 }, { "epoch": 1.371953004139315, "grad_norm": 8.037247657775879, "learning_rate": 3.295103064607281e-05, "loss": 2.725, "step": 2050 }, { "epoch": 1.3726219843625873, "grad_norm": 4.578863620758057, "learning_rate": 3.2932565110300415e-05, "loss": 2.8497, "step": 2051 }, { "epoch": 1.3732909645858595, "grad_norm": 4.044233798980713, "learning_rate": 3.2914094761072914e-05, "loss": 2.6499, "step": 2052 }, { "epoch": 1.3739599448091315, "grad_norm": 5.420017242431641, "learning_rate": 3.2895619609598075e-05, "loss": 3.1044, "step": 2053 }, { "epoch": 1.3746289250324037, "grad_norm": 5.717971324920654, "learning_rate": 3.2877139667086534e-05, "loss": 2.674, "step": 2054 }, { "epoch": 1.375297905255676, "grad_norm": 5.100498199462891, "learning_rate": 3.285865494475189e-05, "loss": 2.5895, "step": 2055 }, { "epoch": 1.375966885478948, "grad_norm": 4.620347023010254, "learning_rate": 3.28401654538106e-05, "loss": 2.6014, "step": 2056 }, { "epoch": 1.3766358657022202, "grad_norm": 6.425408840179443, "learning_rate": 3.2821671205482026e-05, "loss": 2.7834, "step": 2057 }, { "epoch": 1.3773048459254924, "grad_norm": 4.689121723175049, "learning_rate": 3.280317221098842e-05, "loss": 2.7513, "step": 2058 }, { "epoch": 1.3779738261487644, "grad_norm": 4.472986221313477, "learning_rate": 3.278466848155491e-05, "loss": 2.7845, "step": 2059 }, { "epoch": 1.3786428063720366, "grad_norm": 5.076809406280518, "learning_rate": 3.27661600284095e-05, "loss": 2.7951, "step": 2060 }, { "epoch": 1.3793117865953088, "grad_norm": 5.643988132476807, "learning_rate": 3.274764686278307e-05, "loss": 2.959, "step": 2061 }, { "epoch": 1.3799807668185808, "grad_norm": 5.873498439788818, "learning_rate": 3.272912899590934e-05, "loss": 2.6622, "step": 2062 }, { "epoch": 1.380649747041853, "grad_norm": 5.192942142486572, "learning_rate": 3.2710606439024896e-05, "loss": 2.8687, "step": 2063 }, { "epoch": 1.3813187272651253, "grad_norm": 4.184119701385498, "learning_rate": 3.2692079203369156e-05, "loss": 2.8474, "step": 2064 }, { "epoch": 1.3819877074883973, "grad_norm": 4.882514476776123, "learning_rate": 3.2673547300184404e-05, "loss": 2.6392, "step": 2065 }, { "epoch": 1.3826566877116695, "grad_norm": 6.144156455993652, "learning_rate": 3.2655010740715736e-05, "loss": 2.7037, "step": 2066 }, { "epoch": 1.3833256679349417, "grad_norm": 4.782991409301758, "learning_rate": 3.263646953621106e-05, "loss": 2.7595, "step": 2067 }, { "epoch": 1.3839946481582137, "grad_norm": 4.191819190979004, "learning_rate": 3.261792369792114e-05, "loss": 2.5515, "step": 2068 }, { "epoch": 1.384663628381486, "grad_norm": 5.204425811767578, "learning_rate": 3.259937323709952e-05, "loss": 2.8186, "step": 2069 }, { "epoch": 1.3853326086047582, "grad_norm": 4.756034851074219, "learning_rate": 3.258081816500257e-05, "loss": 2.7698, "step": 2070 }, { "epoch": 1.3860015888280302, "grad_norm": 5.133122444152832, "learning_rate": 3.256225849288943e-05, "loss": 2.8698, "step": 2071 }, { "epoch": 1.3866705690513024, "grad_norm": 6.972858905792236, "learning_rate": 3.254369423202207e-05, "loss": 2.9405, "step": 2072 }, { "epoch": 1.3873395492745746, "grad_norm": 4.848676681518555, "learning_rate": 3.2525125393665216e-05, "loss": 2.7072, "step": 2073 }, { "epoch": 1.3880085294978466, "grad_norm": 5.23881196975708, "learning_rate": 3.2506551989086374e-05, "loss": 2.8509, "step": 2074 }, { "epoch": 1.3886775097211188, "grad_norm": 4.224327564239502, "learning_rate": 3.248797402955583e-05, "loss": 2.6614, "step": 2075 }, { "epoch": 1.389346489944391, "grad_norm": 6.613017559051514, "learning_rate": 3.246939152634664e-05, "loss": 2.9236, "step": 2076 }, { "epoch": 1.3900154701676632, "grad_norm": 5.5714874267578125, "learning_rate": 3.245080449073459e-05, "loss": 2.7773, "step": 2077 }, { "epoch": 1.3906844503909352, "grad_norm": 5.062618732452393, "learning_rate": 3.243221293399825e-05, "loss": 2.9224, "step": 2078 }, { "epoch": 1.3913534306142075, "grad_norm": 4.741576671600342, "learning_rate": 3.2413616867418904e-05, "loss": 2.5638, "step": 2079 }, { "epoch": 1.3920224108374797, "grad_norm": 6.06392240524292, "learning_rate": 3.23950163022806e-05, "loss": 2.7167, "step": 2080 }, { "epoch": 1.392691391060752, "grad_norm": 6.1369733810424805, "learning_rate": 3.2376411249870085e-05, "loss": 2.5687, "step": 2081 }, { "epoch": 1.393360371284024, "grad_norm": 6.010268211364746, "learning_rate": 3.2357801721476854e-05, "loss": 3.0348, "step": 2082 }, { "epoch": 1.3940293515072961, "grad_norm": 5.525701522827148, "learning_rate": 3.23391877283931e-05, "loss": 2.7513, "step": 2083 }, { "epoch": 1.3946983317305683, "grad_norm": 5.653034687042236, "learning_rate": 3.232056928191376e-05, "loss": 2.5086, "step": 2084 }, { "epoch": 1.3953673119538403, "grad_norm": 5.4909772872924805, "learning_rate": 3.230194639333642e-05, "loss": 2.8214, "step": 2085 }, { "epoch": 1.3960362921771126, "grad_norm": 4.054011344909668, "learning_rate": 3.228331907396141e-05, "loss": 2.6106, "step": 2086 }, { "epoch": 1.3967052724003848, "grad_norm": 5.332735061645508, "learning_rate": 3.2264687335091696e-05, "loss": 2.8631, "step": 2087 }, { "epoch": 1.3973742526236568, "grad_norm": 7.368805885314941, "learning_rate": 3.2246051188033e-05, "loss": 2.9556, "step": 2088 }, { "epoch": 1.398043232846929, "grad_norm": 4.063880443572998, "learning_rate": 3.222741064409364e-05, "loss": 2.7505, "step": 2089 }, { "epoch": 1.3987122130702012, "grad_norm": 7.633145332336426, "learning_rate": 3.220876571458466e-05, "loss": 2.8162, "step": 2090 }, { "epoch": 1.3993811932934732, "grad_norm": 4.736606121063232, "learning_rate": 3.219011641081974e-05, "loss": 2.4496, "step": 2091 }, { "epoch": 1.4000501735167454, "grad_norm": 6.102734565734863, "learning_rate": 3.217146274411521e-05, "loss": 3.0237, "step": 2092 }, { "epoch": 1.4007191537400177, "grad_norm": 3.788506507873535, "learning_rate": 3.215280472579006e-05, "loss": 2.3751, "step": 2093 }, { "epoch": 1.4013881339632897, "grad_norm": 4.817192077636719, "learning_rate": 3.2134142367165916e-05, "loss": 2.875, "step": 2094 }, { "epoch": 1.4020571141865619, "grad_norm": 5.458423137664795, "learning_rate": 3.211547567956704e-05, "loss": 2.9478, "step": 2095 }, { "epoch": 1.402726094409834, "grad_norm": 3.884122610092163, "learning_rate": 3.2096804674320305e-05, "loss": 2.6837, "step": 2096 }, { "epoch": 1.403395074633106, "grad_norm": 4.676953315734863, "learning_rate": 3.2078129362755236e-05, "loss": 2.759, "step": 2097 }, { "epoch": 1.4040640548563783, "grad_norm": 4.61336088180542, "learning_rate": 3.205944975620394e-05, "loss": 2.8484, "step": 2098 }, { "epoch": 1.4047330350796505, "grad_norm": 4.984833240509033, "learning_rate": 3.2040765866001157e-05, "loss": 3.1608, "step": 2099 }, { "epoch": 1.4054020153029225, "grad_norm": 8.212212562561035, "learning_rate": 3.202207770348419e-05, "loss": 2.7521, "step": 2100 }, { "epoch": 1.4060709955261947, "grad_norm": 4.495399475097656, "learning_rate": 3.200338527999296e-05, "loss": 2.6543, "step": 2101 }, { "epoch": 1.406739975749467, "grad_norm": 4.785201072692871, "learning_rate": 3.198468860686999e-05, "loss": 2.6804, "step": 2102 }, { "epoch": 1.407408955972739, "grad_norm": 5.067458152770996, "learning_rate": 3.196598769546034e-05, "loss": 2.5417, "step": 2103 }, { "epoch": 1.4080779361960112, "grad_norm": 6.252809047698975, "learning_rate": 3.194728255711167e-05, "loss": 2.792, "step": 2104 }, { "epoch": 1.4087469164192834, "grad_norm": 5.277778625488281, "learning_rate": 3.1928573203174206e-05, "loss": 2.7697, "step": 2105 }, { "epoch": 1.4094158966425554, "grad_norm": 4.683997631072998, "learning_rate": 3.1909859645000714e-05, "loss": 2.873, "step": 2106 }, { "epoch": 1.4100848768658276, "grad_norm": 4.702674388885498, "learning_rate": 3.189114189394653e-05, "loss": 2.8581, "step": 2107 }, { "epoch": 1.4107538570890998, "grad_norm": 5.353943347930908, "learning_rate": 3.187241996136951e-05, "loss": 2.6926, "step": 2108 }, { "epoch": 1.4114228373123718, "grad_norm": 4.439987659454346, "learning_rate": 3.185369385863007e-05, "loss": 2.7103, "step": 2109 }, { "epoch": 1.412091817535644, "grad_norm": 5.188424110412598, "learning_rate": 3.1834963597091165e-05, "loss": 2.7763, "step": 2110 }, { "epoch": 1.4127607977589163, "grad_norm": 3.740800380706787, "learning_rate": 3.181622918811824e-05, "loss": 2.5914, "step": 2111 }, { "epoch": 1.4134297779821883, "grad_norm": 5.433175563812256, "learning_rate": 3.179749064307927e-05, "loss": 2.6397, "step": 2112 }, { "epoch": 1.4140987582054605, "grad_norm": 6.00779390335083, "learning_rate": 3.177874797334477e-05, "loss": 2.5472, "step": 2113 }, { "epoch": 1.4147677384287327, "grad_norm": 5.701888084411621, "learning_rate": 3.17600011902877e-05, "loss": 2.8456, "step": 2114 }, { "epoch": 1.415436718652005, "grad_norm": 5.572426795959473, "learning_rate": 3.1741250305283566e-05, "loss": 2.7787, "step": 2115 }, { "epoch": 1.416105698875277, "grad_norm": 5.159278869628906, "learning_rate": 3.172249532971033e-05, "loss": 2.7681, "step": 2116 }, { "epoch": 1.4167746790985492, "grad_norm": 7.181278705596924, "learning_rate": 3.170373627494848e-05, "loss": 2.7483, "step": 2117 }, { "epoch": 1.4174436593218214, "grad_norm": 5.110281944274902, "learning_rate": 3.1684973152380934e-05, "loss": 2.6859, "step": 2118 }, { "epoch": 1.4181126395450934, "grad_norm": 6.126352310180664, "learning_rate": 3.1666205973393084e-05, "loss": 2.5781, "step": 2119 }, { "epoch": 1.4187816197683656, "grad_norm": 4.8328118324279785, "learning_rate": 3.1647434749372804e-05, "loss": 2.6383, "step": 2120 }, { "epoch": 1.4194505999916378, "grad_norm": 4.447200298309326, "learning_rate": 3.162865949171042e-05, "loss": 2.5225, "step": 2121 }, { "epoch": 1.42011958021491, "grad_norm": 4.1558074951171875, "learning_rate": 3.160988021179868e-05, "loss": 2.7313, "step": 2122 }, { "epoch": 1.420788560438182, "grad_norm": 3.8075015544891357, "learning_rate": 3.15910969210328e-05, "loss": 2.6495, "step": 2123 }, { "epoch": 1.4214575406614542, "grad_norm": 6.061837196350098, "learning_rate": 3.1572309630810434e-05, "loss": 2.8919, "step": 2124 }, { "epoch": 1.4221265208847265, "grad_norm": 4.573391437530518, "learning_rate": 3.155351835253163e-05, "loss": 2.7806, "step": 2125 }, { "epoch": 1.4227955011079985, "grad_norm": 4.304897308349609, "learning_rate": 3.153472309759888e-05, "loss": 2.613, "step": 2126 }, { "epoch": 1.4234644813312707, "grad_norm": 4.302021026611328, "learning_rate": 3.15159238774171e-05, "loss": 2.7939, "step": 2127 }, { "epoch": 1.424133461554543, "grad_norm": 5.093643665313721, "learning_rate": 3.1497120703393576e-05, "loss": 2.6664, "step": 2128 }, { "epoch": 1.424802441777815, "grad_norm": 6.002468109130859, "learning_rate": 3.1478313586938025e-05, "loss": 2.543, "step": 2129 }, { "epoch": 1.4254714220010871, "grad_norm": 6.307436466217041, "learning_rate": 3.1459502539462536e-05, "loss": 2.7632, "step": 2130 }, { "epoch": 1.4261404022243593, "grad_norm": 5.699851036071777, "learning_rate": 3.14406875723816e-05, "loss": 2.5711, "step": 2131 }, { "epoch": 1.4268093824476313, "grad_norm": 6.780220985412598, "learning_rate": 3.1421868697112084e-05, "loss": 2.6382, "step": 2132 }, { "epoch": 1.4274783626709036, "grad_norm": 6.741306304931641, "learning_rate": 3.140304592507321e-05, "loss": 2.6952, "step": 2133 }, { "epoch": 1.4281473428941758, "grad_norm": 5.3856096267700195, "learning_rate": 3.138421926768658e-05, "loss": 2.6026, "step": 2134 }, { "epoch": 1.4288163231174478, "grad_norm": 4.4347310066223145, "learning_rate": 3.136538873637615e-05, "loss": 2.4124, "step": 2135 }, { "epoch": 1.42948530334072, "grad_norm": 5.228308200836182, "learning_rate": 3.134655434256822e-05, "loss": 3.0497, "step": 2136 }, { "epoch": 1.4301542835639922, "grad_norm": 3.872063636779785, "learning_rate": 3.132771609769145e-05, "loss": 2.684, "step": 2137 }, { "epoch": 1.4308232637872642, "grad_norm": 4.839834690093994, "learning_rate": 3.130887401317682e-05, "loss": 2.8611, "step": 2138 }, { "epoch": 1.4314922440105364, "grad_norm": 3.6270556449890137, "learning_rate": 3.129002810045765e-05, "loss": 2.8794, "step": 2139 }, { "epoch": 1.4321612242338086, "grad_norm": 4.208321571350098, "learning_rate": 3.127117837096958e-05, "loss": 2.8055, "step": 2140 }, { "epoch": 1.4328302044570806, "grad_norm": 6.322272777557373, "learning_rate": 3.125232483615056e-05, "loss": 2.6562, "step": 2141 }, { "epoch": 1.4334991846803529, "grad_norm": 5.087634563446045, "learning_rate": 3.123346750744086e-05, "loss": 2.8283, "step": 2142 }, { "epoch": 1.434168164903625, "grad_norm": 5.131560802459717, "learning_rate": 3.1214606396283044e-05, "loss": 3.0144, "step": 2143 }, { "epoch": 1.434837145126897, "grad_norm": 6.122884273529053, "learning_rate": 3.119574151412197e-05, "loss": 2.7595, "step": 2144 }, { "epoch": 1.4355061253501693, "grad_norm": 7.18737268447876, "learning_rate": 3.11768728724048e-05, "loss": 3.0811, "step": 2145 }, { "epoch": 1.4361751055734415, "grad_norm": 4.818100452423096, "learning_rate": 3.115800048258096e-05, "loss": 2.6299, "step": 2146 }, { "epoch": 1.4368440857967135, "grad_norm": 4.766518592834473, "learning_rate": 3.1139124356102145e-05, "loss": 2.8003, "step": 2147 }, { "epoch": 1.4375130660199857, "grad_norm": 7.5392937660217285, "learning_rate": 3.112024450442234e-05, "loss": 2.7066, "step": 2148 }, { "epoch": 1.438182046243258, "grad_norm": 6.259847640991211, "learning_rate": 3.110136093899777e-05, "loss": 3.0412, "step": 2149 }, { "epoch": 1.43885102646653, "grad_norm": 5.854325771331787, "learning_rate": 3.108247367128694e-05, "loss": 2.7265, "step": 2150 }, { "epoch": 1.4395200066898022, "grad_norm": 8.203265190124512, "learning_rate": 3.106358271275056e-05, "loss": 2.8515, "step": 2151 }, { "epoch": 1.4401889869130744, "grad_norm": 4.087404727935791, "learning_rate": 3.1044688074851615e-05, "loss": 2.6054, "step": 2152 }, { "epoch": 1.4408579671363464, "grad_norm": 4.96695613861084, "learning_rate": 3.102578976905531e-05, "loss": 2.6986, "step": 2153 }, { "epoch": 1.4415269473596186, "grad_norm": 4.936301231384277, "learning_rate": 3.1006887806829085e-05, "loss": 2.6312, "step": 2154 }, { "epoch": 1.4421959275828908, "grad_norm": 9.694040298461914, "learning_rate": 3.098798219964257e-05, "loss": 2.9009, "step": 2155 }, { "epoch": 1.442864907806163, "grad_norm": 8.300200462341309, "learning_rate": 3.096907295896764e-05, "loss": 2.7605, "step": 2156 }, { "epoch": 1.443533888029435, "grad_norm": 4.935064792633057, "learning_rate": 3.0950160096278364e-05, "loss": 2.8497, "step": 2157 }, { "epoch": 1.4442028682527073, "grad_norm": 5.755708694458008, "learning_rate": 3.0931243623051e-05, "loss": 2.9343, "step": 2158 }, { "epoch": 1.4448718484759795, "grad_norm": 5.971693515777588, "learning_rate": 3.0912323550764e-05, "loss": 2.7599, "step": 2159 }, { "epoch": 1.4455408286992517, "grad_norm": 5.377431392669678, "learning_rate": 3.0893399890898014e-05, "loss": 2.7569, "step": 2160 }, { "epoch": 1.4462098089225237, "grad_norm": 5.563959121704102, "learning_rate": 3.087447265493586e-05, "loss": 2.6611, "step": 2161 }, { "epoch": 1.446878789145796, "grad_norm": 8.300984382629395, "learning_rate": 3.08555418543625e-05, "loss": 2.5863, "step": 2162 }, { "epoch": 1.4475477693690681, "grad_norm": 4.10382604598999, "learning_rate": 3.083660750066511e-05, "loss": 2.8028, "step": 2163 }, { "epoch": 1.4482167495923401, "grad_norm": 5.85797119140625, "learning_rate": 3.081766960533299e-05, "loss": 2.8329, "step": 2164 }, { "epoch": 1.4488857298156124, "grad_norm": 6.761934280395508, "learning_rate": 3.0798728179857584e-05, "loss": 2.7883, "step": 2165 }, { "epoch": 1.4495547100388846, "grad_norm": 5.8419976234436035, "learning_rate": 3.0779783235732495e-05, "loss": 2.7093, "step": 2166 }, { "epoch": 1.4502236902621566, "grad_norm": 6.216766357421875, "learning_rate": 3.0760834784453453e-05, "loss": 2.6303, "step": 2167 }, { "epoch": 1.4508926704854288, "grad_norm": 7.233863830566406, "learning_rate": 3.074188283751832e-05, "loss": 2.5462, "step": 2168 }, { "epoch": 1.451561650708701, "grad_norm": 7.057400226593018, "learning_rate": 3.072292740642707e-05, "loss": 2.6736, "step": 2169 }, { "epoch": 1.452230630931973, "grad_norm": 5.978362083435059, "learning_rate": 3.070396850268181e-05, "loss": 2.7756, "step": 2170 }, { "epoch": 1.4528996111552452, "grad_norm": 6.943479537963867, "learning_rate": 3.0685006137786726e-05, "loss": 3.0649, "step": 2171 }, { "epoch": 1.4535685913785175, "grad_norm": 7.135985374450684, "learning_rate": 3.066604032324813e-05, "loss": 2.8992, "step": 2172 }, { "epoch": 1.4542375716017895, "grad_norm": 3.9003190994262695, "learning_rate": 3.064707107057443e-05, "loss": 2.4526, "step": 2173 }, { "epoch": 1.4549065518250617, "grad_norm": 7.062236785888672, "learning_rate": 3.062809839127607e-05, "loss": 3.088, "step": 2174 }, { "epoch": 1.455575532048334, "grad_norm": 5.225010395050049, "learning_rate": 3.060912229686565e-05, "loss": 2.6068, "step": 2175 }, { "epoch": 1.456244512271606, "grad_norm": 5.395065784454346, "learning_rate": 3.059014279885779e-05, "loss": 2.7208, "step": 2176 }, { "epoch": 1.4569134924948781, "grad_norm": 5.0800275802612305, "learning_rate": 3.057115990876918e-05, "loss": 2.6391, "step": 2177 }, { "epoch": 1.4575824727181503, "grad_norm": 8.030608177185059, "learning_rate": 3.055217363811859e-05, "loss": 2.9492, "step": 2178 }, { "epoch": 1.4582514529414223, "grad_norm": 5.007354259490967, "learning_rate": 3.053318399842682e-05, "loss": 2.3849, "step": 2179 }, { "epoch": 1.4589204331646946, "grad_norm": 5.231847763061523, "learning_rate": 3.0514191001216724e-05, "loss": 2.831, "step": 2180 }, { "epoch": 1.4595894133879668, "grad_norm": 5.7164306640625, "learning_rate": 3.0495194658013194e-05, "loss": 2.7224, "step": 2181 }, { "epoch": 1.4602583936112388, "grad_norm": 5.191862106323242, "learning_rate": 3.047619498034314e-05, "loss": 2.7955, "step": 2182 }, { "epoch": 1.460927373834511, "grad_norm": 5.349247932434082, "learning_rate": 3.0457191979735528e-05, "loss": 2.9513, "step": 2183 }, { "epoch": 1.4615963540577832, "grad_norm": 4.896350383758545, "learning_rate": 3.04381856677213e-05, "loss": 2.7237, "step": 2184 }, { "epoch": 1.4622653342810552, "grad_norm": 5.641765117645264, "learning_rate": 3.0419176055833426e-05, "loss": 2.5212, "step": 2185 }, { "epoch": 1.4629343145043274, "grad_norm": 6.158010959625244, "learning_rate": 3.0400163155606887e-05, "loss": 2.9407, "step": 2186 }, { "epoch": 1.4636032947275996, "grad_norm": 4.178887367248535, "learning_rate": 3.0381146978578633e-05, "loss": 2.5664, "step": 2187 }, { "epoch": 1.4642722749508716, "grad_norm": 5.496964931488037, "learning_rate": 3.0362127536287637e-05, "loss": 2.9001, "step": 2188 }, { "epoch": 1.4649412551741439, "grad_norm": 5.312406539916992, "learning_rate": 3.034310484027483e-05, "loss": 2.5888, "step": 2189 }, { "epoch": 1.465610235397416, "grad_norm": 4.688312530517578, "learning_rate": 3.032407890208312e-05, "loss": 2.5179, "step": 2190 }, { "epoch": 1.466279215620688, "grad_norm": 4.950438499450684, "learning_rate": 3.0305049733257384e-05, "loss": 2.5942, "step": 2191 }, { "epoch": 1.4669481958439603, "grad_norm": 5.423849582672119, "learning_rate": 3.0286017345344465e-05, "loss": 2.7217, "step": 2192 }, { "epoch": 1.4676171760672325, "grad_norm": 4.332028865814209, "learning_rate": 3.0266981749893157e-05, "loss": 2.5761, "step": 2193 }, { "epoch": 1.4682861562905047, "grad_norm": 5.536228656768799, "learning_rate": 3.0247942958454196e-05, "loss": 2.6629, "step": 2194 }, { "epoch": 1.4689551365137767, "grad_norm": 5.129214763641357, "learning_rate": 3.0228900982580256e-05, "loss": 2.5893, "step": 2195 }, { "epoch": 1.469624116737049, "grad_norm": 3.6339681148529053, "learning_rate": 3.0209855833825952e-05, "loss": 2.2444, "step": 2196 }, { "epoch": 1.4702930969603212, "grad_norm": 4.836607933044434, "learning_rate": 3.0190807523747822e-05, "loss": 2.6725, "step": 2197 }, { "epoch": 1.4709620771835934, "grad_norm": 3.9454689025878906, "learning_rate": 3.0171756063904303e-05, "loss": 2.6341, "step": 2198 }, { "epoch": 1.4716310574068654, "grad_norm": 4.593690395355225, "learning_rate": 3.0152701465855778e-05, "loss": 2.6333, "step": 2199 }, { "epoch": 1.4723000376301376, "grad_norm": 8.685956954956055, "learning_rate": 3.0133643741164508e-05, "loss": 2.9083, "step": 2200 }, { "epoch": 1.4729690178534098, "grad_norm": 5.938663005828857, "learning_rate": 3.0114582901394667e-05, "loss": 2.6169, "step": 2201 }, { "epoch": 1.4736379980766818, "grad_norm": 4.498161315917969, "learning_rate": 3.0095518958112295e-05, "loss": 2.7415, "step": 2202 }, { "epoch": 1.474306978299954, "grad_norm": 6.615968227386475, "learning_rate": 3.0076451922885346e-05, "loss": 2.9558, "step": 2203 }, { "epoch": 1.4749759585232263, "grad_norm": 7.6475725173950195, "learning_rate": 3.0057381807283642e-05, "loss": 2.8374, "step": 2204 }, { "epoch": 1.4756449387464983, "grad_norm": 5.680813789367676, "learning_rate": 3.0038308622878846e-05, "loss": 2.741, "step": 2205 }, { "epoch": 1.4763139189697705, "grad_norm": 5.488100051879883, "learning_rate": 3.0019232381244515e-05, "loss": 2.6979, "step": 2206 }, { "epoch": 1.4769828991930427, "grad_norm": 4.570145606994629, "learning_rate": 3.000015309395606e-05, "loss": 2.8078, "step": 2207 }, { "epoch": 1.4776518794163147, "grad_norm": 3.7472822666168213, "learning_rate": 2.998107077259073e-05, "loss": 2.6571, "step": 2208 }, { "epoch": 1.478320859639587, "grad_norm": 4.0875773429870605, "learning_rate": 2.99619854287276e-05, "loss": 2.9654, "step": 2209 }, { "epoch": 1.4789898398628591, "grad_norm": 6.403375148773193, "learning_rate": 2.9942897073947612e-05, "loss": 2.7213, "step": 2210 }, { "epoch": 1.4796588200861311, "grad_norm": 4.437558650970459, "learning_rate": 2.9923805719833515e-05, "loss": 2.6668, "step": 2211 }, { "epoch": 1.4803278003094034, "grad_norm": 7.018237113952637, "learning_rate": 2.9904711377969884e-05, "loss": 2.8256, "step": 2212 }, { "epoch": 1.4809967805326756, "grad_norm": 4.830729961395264, "learning_rate": 2.988561405994309e-05, "loss": 2.8145, "step": 2213 }, { "epoch": 1.4816657607559476, "grad_norm": 6.391282081604004, "learning_rate": 2.986651377734134e-05, "loss": 2.9568, "step": 2214 }, { "epoch": 1.4823347409792198, "grad_norm": 5.865687847137451, "learning_rate": 2.984741054175463e-05, "loss": 2.9988, "step": 2215 }, { "epoch": 1.483003721202492, "grad_norm": 6.1040544509887695, "learning_rate": 2.9828304364774713e-05, "loss": 2.7426, "step": 2216 }, { "epoch": 1.483672701425764, "grad_norm": 4.569874286651611, "learning_rate": 2.9809195257995182e-05, "loss": 2.6406, "step": 2217 }, { "epoch": 1.4843416816490362, "grad_norm": 5.783868312835693, "learning_rate": 2.9790083233011372e-05, "loss": 2.9414, "step": 2218 }, { "epoch": 1.4850106618723085, "grad_norm": 5.311706066131592, "learning_rate": 2.977096830142041e-05, "loss": 2.7498, "step": 2219 }, { "epoch": 1.4856796420955805, "grad_norm": 5.672433376312256, "learning_rate": 2.9751850474821153e-05, "loss": 2.8065, "step": 2220 }, { "epoch": 1.4863486223188527, "grad_norm": 9.245611190795898, "learning_rate": 2.9732729764814254e-05, "loss": 3.0967, "step": 2221 }, { "epoch": 1.487017602542125, "grad_norm": 5.793460845947266, "learning_rate": 2.9713606183002098e-05, "loss": 2.8664, "step": 2222 }, { "epoch": 1.487686582765397, "grad_norm": 3.8744213581085205, "learning_rate": 2.969447974098881e-05, "loss": 2.504, "step": 2223 }, { "epoch": 1.4883555629886691, "grad_norm": 4.165330410003662, "learning_rate": 2.9675350450380253e-05, "loss": 2.8091, "step": 2224 }, { "epoch": 1.4890245432119413, "grad_norm": 4.632652759552002, "learning_rate": 2.9656218322784014e-05, "loss": 2.662, "step": 2225 }, { "epoch": 1.4896935234352133, "grad_norm": 4.595365047454834, "learning_rate": 2.963708336980942e-05, "loss": 2.8775, "step": 2226 }, { "epoch": 1.4903625036584855, "grad_norm": 3.8869335651397705, "learning_rate": 2.961794560306749e-05, "loss": 2.6281, "step": 2227 }, { "epoch": 1.4910314838817578, "grad_norm": 6.43788480758667, "learning_rate": 2.959880503417095e-05, "loss": 2.6576, "step": 2228 }, { "epoch": 1.4917004641050298, "grad_norm": 4.685789108276367, "learning_rate": 2.9579661674734256e-05, "loss": 2.6969, "step": 2229 }, { "epoch": 1.492369444328302, "grad_norm": 3.7685611248016357, "learning_rate": 2.9560515536373534e-05, "loss": 2.6655, "step": 2230 }, { "epoch": 1.4930384245515742, "grad_norm": 6.255924701690674, "learning_rate": 2.9541366630706586e-05, "loss": 2.7999, "step": 2231 }, { "epoch": 1.4937074047748464, "grad_norm": 4.318514347076416, "learning_rate": 2.9522214969352912e-05, "loss": 2.6507, "step": 2232 }, { "epoch": 1.4943763849981184, "grad_norm": 4.408954620361328, "learning_rate": 2.9503060563933682e-05, "loss": 2.5212, "step": 2233 }, { "epoch": 1.4950453652213906, "grad_norm": 6.041835784912109, "learning_rate": 2.9483903426071734e-05, "loss": 2.9122, "step": 2234 }, { "epoch": 1.4957143454446629, "grad_norm": 5.376307964324951, "learning_rate": 2.9464743567391546e-05, "loss": 2.6712, "step": 2235 }, { "epoch": 1.4963833256679349, "grad_norm": 6.752755641937256, "learning_rate": 2.944558099951926e-05, "loss": 2.8976, "step": 2236 }, { "epoch": 1.497052305891207, "grad_norm": 4.995867729187012, "learning_rate": 2.942641573408267e-05, "loss": 2.8559, "step": 2237 }, { "epoch": 1.4977212861144793, "grad_norm": 5.833775043487549, "learning_rate": 2.940724778271119e-05, "loss": 2.711, "step": 2238 }, { "epoch": 1.4983902663377515, "grad_norm": 6.020483016967773, "learning_rate": 2.938807715703587e-05, "loss": 2.5386, "step": 2239 }, { "epoch": 1.4990592465610235, "grad_norm": 9.630859375, "learning_rate": 2.9368903868689392e-05, "loss": 2.8233, "step": 2240 }, { "epoch": 1.4997282267842957, "grad_norm": 5.620861053466797, "learning_rate": 2.9349727929306042e-05, "loss": 2.6048, "step": 2241 }, { "epoch": 1.500397207007568, "grad_norm": 7.493035316467285, "learning_rate": 2.933054935052172e-05, "loss": 2.6908, "step": 2242 }, { "epoch": 1.50106618723084, "grad_norm": 6.487952709197998, "learning_rate": 2.9311368143973915e-05, "loss": 2.7721, "step": 2243 }, { "epoch": 1.5017351674541122, "grad_norm": 6.43418025970459, "learning_rate": 2.9292184321301742e-05, "loss": 2.9517, "step": 2244 }, { "epoch": 1.5024041476773844, "grad_norm": 4.627255916595459, "learning_rate": 2.927299789414587e-05, "loss": 2.6272, "step": 2245 }, { "epoch": 1.5030731279006564, "grad_norm": 5.553348064422607, "learning_rate": 2.925380887414856e-05, "loss": 2.777, "step": 2246 }, { "epoch": 1.5037421081239286, "grad_norm": 4.826366424560547, "learning_rate": 2.9234617272953653e-05, "loss": 2.5868, "step": 2247 }, { "epoch": 1.5044110883472008, "grad_norm": 4.4571213722229, "learning_rate": 2.921542310220655e-05, "loss": 2.5002, "step": 2248 }, { "epoch": 1.5050800685704728, "grad_norm": 5.0305376052856445, "learning_rate": 2.9196226373554213e-05, "loss": 2.7349, "step": 2249 }, { "epoch": 1.505749048793745, "grad_norm": 4.984687805175781, "learning_rate": 2.9177027098645155e-05, "loss": 2.7713, "step": 2250 }, { "epoch": 1.5064180290170173, "grad_norm": 5.873697280883789, "learning_rate": 2.915782528912943e-05, "loss": 2.7877, "step": 2251 }, { "epoch": 1.5070870092402893, "grad_norm": 7.105508327484131, "learning_rate": 2.9138620956658645e-05, "loss": 2.6506, "step": 2252 }, { "epoch": 1.5077559894635615, "grad_norm": 4.614120960235596, "learning_rate": 2.9119414112885917e-05, "loss": 2.834, "step": 2253 }, { "epoch": 1.5084249696868337, "grad_norm": 4.375961780548096, "learning_rate": 2.91002047694659e-05, "loss": 2.5743, "step": 2254 }, { "epoch": 1.5090939499101057, "grad_norm": 4.351882457733154, "learning_rate": 2.908099293805477e-05, "loss": 2.6655, "step": 2255 }, { "epoch": 1.509762930133378, "grad_norm": 6.840676784515381, "learning_rate": 2.9061778630310193e-05, "loss": 2.9516, "step": 2256 }, { "epoch": 1.5104319103566501, "grad_norm": 7.174767017364502, "learning_rate": 2.9042561857891353e-05, "loss": 3.0367, "step": 2257 }, { "epoch": 1.5111008905799221, "grad_norm": 5.582670211791992, "learning_rate": 2.9023342632458934e-05, "loss": 2.7125, "step": 2258 }, { "epoch": 1.5117698708031944, "grad_norm": 5.117880344390869, "learning_rate": 2.900412096567509e-05, "loss": 3.0327, "step": 2259 }, { "epoch": 1.5124388510264666, "grad_norm": 7.351419925689697, "learning_rate": 2.8984896869203472e-05, "loss": 2.5291, "step": 2260 }, { "epoch": 1.5131078312497386, "grad_norm": 4.509941577911377, "learning_rate": 2.8965670354709196e-05, "loss": 2.6116, "step": 2261 }, { "epoch": 1.5137768114730108, "grad_norm": 5.0681986808776855, "learning_rate": 2.894644143385885e-05, "loss": 2.6985, "step": 2262 }, { "epoch": 1.514445791696283, "grad_norm": 5.417664051055908, "learning_rate": 2.892721011832049e-05, "loss": 2.6398, "step": 2263 }, { "epoch": 1.515114771919555, "grad_norm": 7.193685531616211, "learning_rate": 2.8907976419763605e-05, "loss": 3.3176, "step": 2264 }, { "epoch": 1.5157837521428272, "grad_norm": 6.769975662231445, "learning_rate": 2.888874034985915e-05, "loss": 3.1323, "step": 2265 }, { "epoch": 1.5164527323660995, "grad_norm": 3.5006496906280518, "learning_rate": 2.8869501920279506e-05, "loss": 2.4759, "step": 2266 }, { "epoch": 1.5171217125893715, "grad_norm": 4.2594990730285645, "learning_rate": 2.88502611426985e-05, "loss": 2.6502, "step": 2267 }, { "epoch": 1.5177906928126437, "grad_norm": 4.369060039520264, "learning_rate": 2.8831018028791356e-05, "loss": 2.5705, "step": 2268 }, { "epoch": 1.518459673035916, "grad_norm": 5.244803428649902, "learning_rate": 2.8811772590234754e-05, "loss": 2.7474, "step": 2269 }, { "epoch": 1.5191286532591879, "grad_norm": 4.564072132110596, "learning_rate": 2.8792524838706754e-05, "loss": 2.8363, "step": 2270 }, { "epoch": 1.5197976334824603, "grad_norm": 2.9686384201049805, "learning_rate": 2.877327478588682e-05, "loss": 2.5589, "step": 2271 }, { "epoch": 1.5204666137057323, "grad_norm": 8.189765930175781, "learning_rate": 2.8754022443455842e-05, "loss": 2.48, "step": 2272 }, { "epoch": 1.5211355939290043, "grad_norm": 5.444003582000732, "learning_rate": 2.8734767823096065e-05, "loss": 2.8464, "step": 2273 }, { "epoch": 1.5218045741522768, "grad_norm": 5.806658744812012, "learning_rate": 2.8715510936491147e-05, "loss": 2.7004, "step": 2274 }, { "epoch": 1.5224735543755488, "grad_norm": 6.077183723449707, "learning_rate": 2.8696251795326083e-05, "loss": 2.7294, "step": 2275 }, { "epoch": 1.5231425345988208, "grad_norm": 5.352018356323242, "learning_rate": 2.8676990411287276e-05, "loss": 2.7559, "step": 2276 }, { "epoch": 1.5238115148220932, "grad_norm": 5.8964948654174805, "learning_rate": 2.8657726796062468e-05, "loss": 2.7667, "step": 2277 }, { "epoch": 1.5244804950453652, "grad_norm": 6.853862762451172, "learning_rate": 2.8638460961340756e-05, "loss": 2.5696, "step": 2278 }, { "epoch": 1.5251494752686374, "grad_norm": 4.766753673553467, "learning_rate": 2.8619192918812586e-05, "loss": 2.8655, "step": 2279 }, { "epoch": 1.5258184554919096, "grad_norm": 5.381381034851074, "learning_rate": 2.8599922680169744e-05, "loss": 3.0158, "step": 2280 }, { "epoch": 1.5264874357151816, "grad_norm": 5.1292405128479, "learning_rate": 2.8580650257105357e-05, "loss": 2.8744, "step": 2281 }, { "epoch": 1.5271564159384539, "grad_norm": 4.248069763183594, "learning_rate": 2.856137566131386e-05, "loss": 2.5012, "step": 2282 }, { "epoch": 1.527825396161726, "grad_norm": 8.12672233581543, "learning_rate": 2.854209890449102e-05, "loss": 3.2941, "step": 2283 }, { "epoch": 1.528494376384998, "grad_norm": 6.2802863121032715, "learning_rate": 2.8522819998333915e-05, "loss": 2.6625, "step": 2284 }, { "epoch": 1.5291633566082703, "grad_norm": 5.26627254486084, "learning_rate": 2.850353895454092e-05, "loss": 2.7865, "step": 2285 }, { "epoch": 1.5298323368315425, "grad_norm": 5.914576530456543, "learning_rate": 2.8484255784811707e-05, "loss": 2.6483, "step": 2286 }, { "epoch": 1.5305013170548145, "grad_norm": 4.533624649047852, "learning_rate": 2.846497050084725e-05, "loss": 2.7012, "step": 2287 }, { "epoch": 1.5311702972780867, "grad_norm": 6.660671234130859, "learning_rate": 2.844568311434979e-05, "loss": 2.8513, "step": 2288 }, { "epoch": 1.531839277501359, "grad_norm": 5.66995906829834, "learning_rate": 2.8426393637022858e-05, "loss": 2.7217, "step": 2289 }, { "epoch": 1.532508257724631, "grad_norm": 6.058644771575928, "learning_rate": 2.8407102080571234e-05, "loss": 2.7753, "step": 2290 }, { "epoch": 1.5331772379479032, "grad_norm": 5.899769306182861, "learning_rate": 2.8387808456700986e-05, "loss": 2.7172, "step": 2291 }, { "epoch": 1.5338462181711754, "grad_norm": 5.000714302062988, "learning_rate": 2.8368512777119428e-05, "loss": 3.0024, "step": 2292 }, { "epoch": 1.5345151983944474, "grad_norm": 4.8641252517700195, "learning_rate": 2.8349215053535095e-05, "loss": 2.8792, "step": 2293 }, { "epoch": 1.5351841786177196, "grad_norm": 4.457775115966797, "learning_rate": 2.83299152976578e-05, "loss": 2.6964, "step": 2294 }, { "epoch": 1.5358531588409918, "grad_norm": 8.015108108520508, "learning_rate": 2.8310613521198565e-05, "loss": 3.0513, "step": 2295 }, { "epoch": 1.5365221390642638, "grad_norm": 5.425005912780762, "learning_rate": 2.829130973586966e-05, "loss": 2.8324, "step": 2296 }, { "epoch": 1.537191119287536, "grad_norm": 4.353015422821045, "learning_rate": 2.8272003953384545e-05, "loss": 2.5468, "step": 2297 }, { "epoch": 1.5378600995108083, "grad_norm": 4.845297813415527, "learning_rate": 2.825269618545791e-05, "loss": 2.6662, "step": 2298 }, { "epoch": 1.5385290797340803, "grad_norm": 4.563453197479248, "learning_rate": 2.823338644380566e-05, "loss": 2.7973, "step": 2299 }, { "epoch": 1.5391980599573525, "grad_norm": 4.57271146774292, "learning_rate": 2.8214074740144864e-05, "loss": 2.6008, "step": 2300 }, { "epoch": 1.5398670401806247, "grad_norm": 4.428469181060791, "learning_rate": 2.819476108619381e-05, "loss": 2.9012, "step": 2301 }, { "epoch": 1.5405360204038967, "grad_norm": 6.25092077255249, "learning_rate": 2.8175445493671972e-05, "loss": 2.9164, "step": 2302 }, { "epoch": 1.541205000627169, "grad_norm": 5.633223533630371, "learning_rate": 2.8156127974299973e-05, "loss": 2.7299, "step": 2303 }, { "epoch": 1.5418739808504411, "grad_norm": 3.5399703979492188, "learning_rate": 2.813680853979963e-05, "loss": 2.8754, "step": 2304 }, { "epoch": 1.5425429610737131, "grad_norm": 5.035823822021484, "learning_rate": 2.8117487201893916e-05, "loss": 2.6865, "step": 2305 }, { "epoch": 1.5432119412969854, "grad_norm": 3.8767831325531006, "learning_rate": 2.8098163972306952e-05, "loss": 2.7932, "step": 2306 }, { "epoch": 1.5438809215202576, "grad_norm": 7.477834224700928, "learning_rate": 2.8078838862764016e-05, "loss": 2.6505, "step": 2307 }, { "epoch": 1.5445499017435296, "grad_norm": 5.698312282562256, "learning_rate": 2.8059511884991524e-05, "loss": 2.727, "step": 2308 }, { "epoch": 1.545218881966802, "grad_norm": 4.2874932289123535, "learning_rate": 2.8040183050717016e-05, "loss": 2.7379, "step": 2309 }, { "epoch": 1.545887862190074, "grad_norm": 5.99888277053833, "learning_rate": 2.8020852371669177e-05, "loss": 2.9435, "step": 2310 }, { "epoch": 1.546556842413346, "grad_norm": 7.154504776000977, "learning_rate": 2.800151985957779e-05, "loss": 2.9904, "step": 2311 }, { "epoch": 1.5472258226366185, "grad_norm": 6.446628570556641, "learning_rate": 2.7982185526173767e-05, "loss": 2.7681, "step": 2312 }, { "epoch": 1.5478948028598905, "grad_norm": 4.669741153717041, "learning_rate": 2.796284938318912e-05, "loss": 2.6973, "step": 2313 }, { "epoch": 1.5485637830831624, "grad_norm": 4.283153057098389, "learning_rate": 2.7943511442356968e-05, "loss": 2.5826, "step": 2314 }, { "epoch": 1.549232763306435, "grad_norm": 6.789968490600586, "learning_rate": 2.792417171541149e-05, "loss": 2.7476, "step": 2315 }, { "epoch": 1.5499017435297069, "grad_norm": 4.725489139556885, "learning_rate": 2.7904830214087984e-05, "loss": 2.9157, "step": 2316 }, { "epoch": 1.550570723752979, "grad_norm": 5.4655232429504395, "learning_rate": 2.7885486950122814e-05, "loss": 2.6649, "step": 2317 }, { "epoch": 1.5512397039762513, "grad_norm": 4.8952531814575195, "learning_rate": 2.7866141935253404e-05, "loss": 2.7492, "step": 2318 }, { "epoch": 1.5519086841995233, "grad_norm": 3.991525173187256, "learning_rate": 2.784679518121825e-05, "loss": 2.7339, "step": 2319 }, { "epoch": 1.5525776644227955, "grad_norm": 5.529263973236084, "learning_rate": 2.78274466997569e-05, "loss": 2.7831, "step": 2320 }, { "epoch": 1.5532466446460678, "grad_norm": 6.294371604919434, "learning_rate": 2.780809650260995e-05, "loss": 2.8431, "step": 2321 }, { "epoch": 1.5539156248693398, "grad_norm": 4.6729021072387695, "learning_rate": 2.7788744601519035e-05, "loss": 2.5727, "step": 2322 }, { "epoch": 1.554584605092612, "grad_norm": 7.0208916664123535, "learning_rate": 2.7769391008226825e-05, "loss": 2.8229, "step": 2323 }, { "epoch": 1.5552535853158842, "grad_norm": 6.4686126708984375, "learning_rate": 2.7750035734477027e-05, "loss": 2.7945, "step": 2324 }, { "epoch": 1.5559225655391562, "grad_norm": 7.349049091339111, "learning_rate": 2.7730678792014358e-05, "loss": 2.9645, "step": 2325 }, { "epoch": 1.5565915457624284, "grad_norm": 6.848609924316406, "learning_rate": 2.7711320192584543e-05, "loss": 2.8493, "step": 2326 }, { "epoch": 1.5572605259857006, "grad_norm": 4.186678886413574, "learning_rate": 2.769195994793432e-05, "loss": 2.7836, "step": 2327 }, { "epoch": 1.5579295062089726, "grad_norm": 5.591568470001221, "learning_rate": 2.7672598069811423e-05, "loss": 2.8385, "step": 2328 }, { "epoch": 1.5585984864322449, "grad_norm": 4.2852702140808105, "learning_rate": 2.7653234569964582e-05, "loss": 2.7116, "step": 2329 }, { "epoch": 1.559267466655517, "grad_norm": 3.834301710128784, "learning_rate": 2.76338694601435e-05, "loss": 2.6921, "step": 2330 }, { "epoch": 1.559936446878789, "grad_norm": 5.2634358406066895, "learning_rate": 2.761450275209887e-05, "loss": 2.8392, "step": 2331 }, { "epoch": 1.5606054271020613, "grad_norm": 6.76492977142334, "learning_rate": 2.7595134457582346e-05, "loss": 2.62, "step": 2332 }, { "epoch": 1.5612744073253335, "grad_norm": 6.453455448150635, "learning_rate": 2.7575764588346543e-05, "loss": 2.8073, "step": 2333 }, { "epoch": 1.5619433875486055, "grad_norm": 5.887423515319824, "learning_rate": 2.7556393156145032e-05, "loss": 2.8194, "step": 2334 }, { "epoch": 1.5626123677718777, "grad_norm": 6.127762794494629, "learning_rate": 2.753702017273235e-05, "loss": 2.6971, "step": 2335 }, { "epoch": 1.56328134799515, "grad_norm": 5.559381008148193, "learning_rate": 2.751764564986396e-05, "loss": 2.7486, "step": 2336 }, { "epoch": 1.563950328218422, "grad_norm": 5.403796195983887, "learning_rate": 2.749826959929625e-05, "loss": 2.7299, "step": 2337 }, { "epoch": 1.5646193084416942, "grad_norm": 4.958734035491943, "learning_rate": 2.7478892032786545e-05, "loss": 2.6112, "step": 2338 }, { "epoch": 1.5652882886649664, "grad_norm": 4.9305195808410645, "learning_rate": 2.74595129620931e-05, "loss": 2.8485, "step": 2339 }, { "epoch": 1.5659572688882384, "grad_norm": 6.793489456176758, "learning_rate": 2.7440132398975072e-05, "loss": 2.7815, "step": 2340 }, { "epoch": 1.5666262491115106, "grad_norm": 5.763112545013428, "learning_rate": 2.7420750355192516e-05, "loss": 2.806, "step": 2341 }, { "epoch": 1.5672952293347828, "grad_norm": 4.670166492462158, "learning_rate": 2.74013668425064e-05, "loss": 2.7168, "step": 2342 }, { "epoch": 1.5679642095580548, "grad_norm": 4.048379421234131, "learning_rate": 2.7381981872678575e-05, "loss": 2.5713, "step": 2343 }, { "epoch": 1.568633189781327, "grad_norm": 8.03201961517334, "learning_rate": 2.7362595457471786e-05, "loss": 2.8947, "step": 2344 }, { "epoch": 1.5693021700045993, "grad_norm": 5.185798168182373, "learning_rate": 2.7343207608649634e-05, "loss": 2.8595, "step": 2345 }, { "epoch": 1.5699711502278713, "grad_norm": 6.56360387802124, "learning_rate": 2.732381833797661e-05, "loss": 2.8782, "step": 2346 }, { "epoch": 1.5706401304511437, "grad_norm": 5.290652275085449, "learning_rate": 2.7304427657218074e-05, "loss": 2.6202, "step": 2347 }, { "epoch": 1.5713091106744157, "grad_norm": 4.337584495544434, "learning_rate": 2.7285035578140207e-05, "loss": 2.7512, "step": 2348 }, { "epoch": 1.5719780908976877, "grad_norm": 4.621699810028076, "learning_rate": 2.726564211251007e-05, "loss": 2.7962, "step": 2349 }, { "epoch": 1.5726470711209601, "grad_norm": 8.32238483428955, "learning_rate": 2.7246247272095564e-05, "loss": 2.7817, "step": 2350 }, { "epoch": 1.5733160513442321, "grad_norm": 4.816995620727539, "learning_rate": 2.722685106866541e-05, "loss": 2.7522, "step": 2351 }, { "epoch": 1.5739850315675041, "grad_norm": 4.7260518074035645, "learning_rate": 2.7207453513989155e-05, "loss": 2.5109, "step": 2352 }, { "epoch": 1.5746540117907766, "grad_norm": 5.570161819458008, "learning_rate": 2.7188054619837178e-05, "loss": 2.9705, "step": 2353 }, { "epoch": 1.5753229920140486, "grad_norm": 5.326760768890381, "learning_rate": 2.7168654397980674e-05, "loss": 2.6054, "step": 2354 }, { "epoch": 1.5759919722373206, "grad_norm": 6.1734466552734375, "learning_rate": 2.7149252860191628e-05, "loss": 2.705, "step": 2355 }, { "epoch": 1.576660952460593, "grad_norm": 5.643980979919434, "learning_rate": 2.7129850018242826e-05, "loss": 2.9079, "step": 2356 }, { "epoch": 1.577329932683865, "grad_norm": 5.271204471588135, "learning_rate": 2.7110445883907853e-05, "loss": 2.5904, "step": 2357 }, { "epoch": 1.5779989129071372, "grad_norm": 5.08945894241333, "learning_rate": 2.7091040468961087e-05, "loss": 2.7199, "step": 2358 }, { "epoch": 1.5786678931304094, "grad_norm": 6.384211540222168, "learning_rate": 2.707163378517766e-05, "loss": 2.8164, "step": 2359 }, { "epoch": 1.5793368733536814, "grad_norm": 4.800456523895264, "learning_rate": 2.7052225844333477e-05, "loss": 2.686, "step": 2360 }, { "epoch": 1.5800058535769537, "grad_norm": 5.3238959312438965, "learning_rate": 2.703281665820524e-05, "loss": 2.6533, "step": 2361 }, { "epoch": 1.5806748338002259, "grad_norm": 4.153467655181885, "learning_rate": 2.7013406238570364e-05, "loss": 2.5651, "step": 2362 }, { "epoch": 1.5813438140234979, "grad_norm": 4.877929210662842, "learning_rate": 2.699399459720703e-05, "loss": 2.6007, "step": 2363 }, { "epoch": 1.58201279424677, "grad_norm": 4.141830921173096, "learning_rate": 2.6974581745894163e-05, "loss": 2.6136, "step": 2364 }, { "epoch": 1.5826817744700423, "grad_norm": 5.560862064361572, "learning_rate": 2.6955167696411416e-05, "loss": 2.8333, "step": 2365 }, { "epoch": 1.5833507546933143, "grad_norm": 5.879995822906494, "learning_rate": 2.6935752460539175e-05, "loss": 2.8919, "step": 2366 }, { "epoch": 1.5840197349165865, "grad_norm": 4.493490219116211, "learning_rate": 2.691633605005854e-05, "loss": 2.7167, "step": 2367 }, { "epoch": 1.5846887151398588, "grad_norm": 6.017796516418457, "learning_rate": 2.689691847675133e-05, "loss": 2.9247, "step": 2368 }, { "epoch": 1.5853576953631308, "grad_norm": 4.717609405517578, "learning_rate": 2.687749975240007e-05, "loss": 2.6592, "step": 2369 }, { "epoch": 1.586026675586403, "grad_norm": 7.9234185218811035, "learning_rate": 2.6858079888787967e-05, "loss": 2.8465, "step": 2370 }, { "epoch": 1.5866956558096752, "grad_norm": 6.025661468505859, "learning_rate": 2.6838658897698944e-05, "loss": 2.8172, "step": 2371 }, { "epoch": 1.5873646360329472, "grad_norm": 5.558853626251221, "learning_rate": 2.681923679091759e-05, "loss": 2.5836, "step": 2372 }, { "epoch": 1.5880336162562194, "grad_norm": 6.307706356048584, "learning_rate": 2.6799813580229176e-05, "loss": 2.8743, "step": 2373 }, { "epoch": 1.5887025964794916, "grad_norm": 8.418734550476074, "learning_rate": 2.6780389277419653e-05, "loss": 3.1143, "step": 2374 }, { "epoch": 1.5893715767027636, "grad_norm": 4.946826457977295, "learning_rate": 2.6760963894275616e-05, "loss": 2.8108, "step": 2375 }, { "epoch": 1.5900405569260359, "grad_norm": 5.7382354736328125, "learning_rate": 2.674153744258433e-05, "loss": 2.6312, "step": 2376 }, { "epoch": 1.590709537149308, "grad_norm": 6.723291873931885, "learning_rate": 2.6722109934133705e-05, "loss": 2.8463, "step": 2377 }, { "epoch": 1.59137851737258, "grad_norm": 4.454161643981934, "learning_rate": 2.670268138071228e-05, "loss": 2.7235, "step": 2378 }, { "epoch": 1.5920474975958523, "grad_norm": 3.8878867626190186, "learning_rate": 2.6683251794109253e-05, "loss": 2.9071, "step": 2379 }, { "epoch": 1.5927164778191245, "grad_norm": 5.427379131317139, "learning_rate": 2.6663821186114434e-05, "loss": 2.776, "step": 2380 }, { "epoch": 1.5933854580423965, "grad_norm": 6.1245527267456055, "learning_rate": 2.6644389568518247e-05, "loss": 2.6846, "step": 2381 }, { "epoch": 1.5940544382656687, "grad_norm": 3.53409743309021, "learning_rate": 2.6624956953111735e-05, "loss": 2.681, "step": 2382 }, { "epoch": 1.594723418488941, "grad_norm": 6.1705002784729, "learning_rate": 2.6605523351686553e-05, "loss": 2.7252, "step": 2383 }, { "epoch": 1.595392398712213, "grad_norm": 4.879063606262207, "learning_rate": 2.658608877603494e-05, "loss": 2.8257, "step": 2384 }, { "epoch": 1.5960613789354852, "grad_norm": 5.19885778427124, "learning_rate": 2.6566653237949735e-05, "loss": 2.7644, "step": 2385 }, { "epoch": 1.5967303591587574, "grad_norm": 6.000220775604248, "learning_rate": 2.6547216749224368e-05, "loss": 2.7559, "step": 2386 }, { "epoch": 1.5973993393820294, "grad_norm": 4.362799644470215, "learning_rate": 2.652777932165284e-05, "loss": 2.6174, "step": 2387 }, { "epoch": 1.5980683196053018, "grad_norm": 6.249029636383057, "learning_rate": 2.6508340967029704e-05, "loss": 2.7535, "step": 2388 }, { "epoch": 1.5987372998285738, "grad_norm": 6.7952141761779785, "learning_rate": 2.6488901697150104e-05, "loss": 2.7719, "step": 2389 }, { "epoch": 1.5994062800518458, "grad_norm": 6.393811225891113, "learning_rate": 2.6469461523809723e-05, "loss": 2.6891, "step": 2390 }, { "epoch": 1.6000752602751183, "grad_norm": 6.876070976257324, "learning_rate": 2.6450020458804802e-05, "loss": 2.6072, "step": 2391 }, { "epoch": 1.6007442404983903, "grad_norm": 5.015986442565918, "learning_rate": 2.6430578513932108e-05, "loss": 2.4877, "step": 2392 }, { "epoch": 1.6014132207216623, "grad_norm": 5.196181774139404, "learning_rate": 2.6411135700988954e-05, "loss": 2.6933, "step": 2393 }, { "epoch": 1.6020822009449347, "grad_norm": 4.305976390838623, "learning_rate": 2.639169203177318e-05, "loss": 2.5628, "step": 2394 }, { "epoch": 1.6027511811682067, "grad_norm": 5.467509746551514, "learning_rate": 2.637224751808313e-05, "loss": 2.7082, "step": 2395 }, { "epoch": 1.603420161391479, "grad_norm": 5.646378993988037, "learning_rate": 2.635280217171769e-05, "loss": 2.5309, "step": 2396 }, { "epoch": 1.6040891416147511, "grad_norm": 4.870308876037598, "learning_rate": 2.6333356004476224e-05, "loss": 2.7018, "step": 2397 }, { "epoch": 1.6047581218380231, "grad_norm": 5.6591877937316895, "learning_rate": 2.6313909028158608e-05, "loss": 2.7843, "step": 2398 }, { "epoch": 1.6054271020612954, "grad_norm": 7.571524620056152, "learning_rate": 2.62944612545652e-05, "loss": 2.4803, "step": 2399 }, { "epoch": 1.6060960822845676, "grad_norm": 6.610879898071289, "learning_rate": 2.6275012695496847e-05, "loss": 2.904, "step": 2400 }, { "epoch": 1.6067650625078396, "grad_norm": 7.075892448425293, "learning_rate": 2.6255563362754875e-05, "loss": 2.7231, "step": 2401 }, { "epoch": 1.6074340427311118, "grad_norm": 6.549261569976807, "learning_rate": 2.6236113268141083e-05, "loss": 2.8158, "step": 2402 }, { "epoch": 1.608103022954384, "grad_norm": 5.817134380340576, "learning_rate": 2.6216662423457715e-05, "loss": 2.9392, "step": 2403 }, { "epoch": 1.608772003177656, "grad_norm": 6.896299839019775, "learning_rate": 2.6197210840507485e-05, "loss": 2.7464, "step": 2404 }, { "epoch": 1.6094409834009282, "grad_norm": 4.643448829650879, "learning_rate": 2.617775853109356e-05, "loss": 2.7464, "step": 2405 }, { "epoch": 1.6101099636242004, "grad_norm": 4.982819557189941, "learning_rate": 2.615830550701952e-05, "loss": 2.9396, "step": 2406 }, { "epoch": 1.6107789438474724, "grad_norm": 4.733968257904053, "learning_rate": 2.6138851780089413e-05, "loss": 2.7082, "step": 2407 }, { "epoch": 1.6114479240707447, "grad_norm": 6.8186421394348145, "learning_rate": 2.6119397362107694e-05, "loss": 2.7159, "step": 2408 }, { "epoch": 1.6121169042940169, "grad_norm": 5.55661153793335, "learning_rate": 2.6099942264879246e-05, "loss": 2.6459, "step": 2409 }, { "epoch": 1.6127858845172889, "grad_norm": 4.055827617645264, "learning_rate": 2.608048650020935e-05, "loss": 2.6294, "step": 2410 }, { "epoch": 1.613454864740561, "grad_norm": 5.903413772583008, "learning_rate": 2.606103007990371e-05, "loss": 2.8888, "step": 2411 }, { "epoch": 1.6141238449638333, "grad_norm": 3.653301477432251, "learning_rate": 2.604157301576841e-05, "loss": 2.6241, "step": 2412 }, { "epoch": 1.6147928251871053, "grad_norm": 4.215085506439209, "learning_rate": 2.6022115319609953e-05, "loss": 2.7616, "step": 2413 }, { "epoch": 1.6154618054103775, "grad_norm": 3.908628463745117, "learning_rate": 2.600265700323518e-05, "loss": 2.5136, "step": 2414 }, { "epoch": 1.6161307856336498, "grad_norm": 4.34727668762207, "learning_rate": 2.5983198078451355e-05, "loss": 2.5217, "step": 2415 }, { "epoch": 1.6167997658569218, "grad_norm": 3.767536163330078, "learning_rate": 2.5963738557066092e-05, "loss": 2.5968, "step": 2416 }, { "epoch": 1.617468746080194, "grad_norm": 4.341113567352295, "learning_rate": 2.594427845088735e-05, "loss": 2.534, "step": 2417 }, { "epoch": 1.6181377263034662, "grad_norm": 5.488800048828125, "learning_rate": 2.5924817771723474e-05, "loss": 2.6849, "step": 2418 }, { "epoch": 1.6188067065267382, "grad_norm": 5.409704208374023, "learning_rate": 2.5905356531383135e-05, "loss": 2.858, "step": 2419 }, { "epoch": 1.6194756867500104, "grad_norm": 4.850921154022217, "learning_rate": 2.5885894741675353e-05, "loss": 2.8339, "step": 2420 }, { "epoch": 1.6201446669732826, "grad_norm": 7.075712203979492, "learning_rate": 2.586643241440948e-05, "loss": 2.7669, "step": 2421 }, { "epoch": 1.6208136471965546, "grad_norm": 3.867203950881958, "learning_rate": 2.5846969561395196e-05, "loss": 2.6405, "step": 2422 }, { "epoch": 1.6214826274198268, "grad_norm": 5.741216659545898, "learning_rate": 2.582750619444249e-05, "loss": 2.9541, "step": 2423 }, { "epoch": 1.622151607643099, "grad_norm": 7.5297369956970215, "learning_rate": 2.580804232536168e-05, "loss": 2.8333, "step": 2424 }, { "epoch": 1.622820587866371, "grad_norm": 5.421424865722656, "learning_rate": 2.5788577965963372e-05, "loss": 2.5568, "step": 2425 }, { "epoch": 1.6234895680896435, "grad_norm": 4.421302318572998, "learning_rate": 2.5769113128058486e-05, "loss": 2.8412, "step": 2426 }, { "epoch": 1.6241585483129155, "grad_norm": 4.942965030670166, "learning_rate": 2.5749647823458218e-05, "loss": 2.7317, "step": 2427 }, { "epoch": 1.6248275285361875, "grad_norm": 5.792942047119141, "learning_rate": 2.5730182063974044e-05, "loss": 2.8114, "step": 2428 }, { "epoch": 1.62549650875946, "grad_norm": 5.889285087585449, "learning_rate": 2.571071586141774e-05, "loss": 3.1153, "step": 2429 }, { "epoch": 1.626165488982732, "grad_norm": 6.261662006378174, "learning_rate": 2.569124922760132e-05, "loss": 2.8418, "step": 2430 }, { "epoch": 1.626834469206004, "grad_norm": 5.267799377441406, "learning_rate": 2.5671782174337095e-05, "loss": 2.5839, "step": 2431 }, { "epoch": 1.6275034494292764, "grad_norm": 6.704413890838623, "learning_rate": 2.5652314713437586e-05, "loss": 2.7482, "step": 2432 }, { "epoch": 1.6281724296525484, "grad_norm": 7.405350685119629, "learning_rate": 2.5632846856715603e-05, "loss": 2.865, "step": 2433 }, { "epoch": 1.6288414098758206, "grad_norm": 4.24365234375, "learning_rate": 2.561337861598418e-05, "loss": 2.6506, "step": 2434 }, { "epoch": 1.6295103900990928, "grad_norm": 4.277185440063477, "learning_rate": 2.5593910003056576e-05, "loss": 2.8607, "step": 2435 }, { "epoch": 1.6301793703223648, "grad_norm": 6.50390625, "learning_rate": 2.557444102974628e-05, "loss": 2.8742, "step": 2436 }, { "epoch": 1.630848350545637, "grad_norm": 5.022792816162109, "learning_rate": 2.555497170786701e-05, "loss": 2.7886, "step": 2437 }, { "epoch": 1.6315173307689093, "grad_norm": 6.039017200469971, "learning_rate": 2.5535502049232684e-05, "loss": 2.9595, "step": 2438 }, { "epoch": 1.6321863109921813, "grad_norm": 4.410915851593018, "learning_rate": 2.551603206565743e-05, "loss": 2.6244, "step": 2439 }, { "epoch": 1.6328552912154535, "grad_norm": 5.923748016357422, "learning_rate": 2.5496561768955574e-05, "loss": 2.8382, "step": 2440 }, { "epoch": 1.6335242714387257, "grad_norm": 7.241328716278076, "learning_rate": 2.5477091170941626e-05, "loss": 3.0066, "step": 2441 }, { "epoch": 1.6341932516619977, "grad_norm": 5.229269504547119, "learning_rate": 2.5457620283430285e-05, "loss": 2.865, "step": 2442 }, { "epoch": 1.63486223188527, "grad_norm": 5.172996997833252, "learning_rate": 2.543814911823642e-05, "loss": 2.6209, "step": 2443 }, { "epoch": 1.6355312121085421, "grad_norm": 6.428318500518799, "learning_rate": 2.541867768717507e-05, "loss": 2.8982, "step": 2444 }, { "epoch": 1.6362001923318141, "grad_norm": 4.812302589416504, "learning_rate": 2.5399206002061443e-05, "loss": 2.5506, "step": 2445 }, { "epoch": 1.6368691725550863, "grad_norm": 6.246057510375977, "learning_rate": 2.5379734074710898e-05, "loss": 2.6993, "step": 2446 }, { "epoch": 1.6375381527783586, "grad_norm": 5.78316068649292, "learning_rate": 2.536026191693893e-05, "loss": 2.9312, "step": 2447 }, { "epoch": 1.6382071330016306, "grad_norm": 5.317570686340332, "learning_rate": 2.5340789540561183e-05, "loss": 2.9874, "step": 2448 }, { "epoch": 1.6388761132249028, "grad_norm": 5.024258613586426, "learning_rate": 2.5321316957393437e-05, "loss": 2.7485, "step": 2449 }, { "epoch": 1.639545093448175, "grad_norm": 4.547504425048828, "learning_rate": 2.5301844179251588e-05, "loss": 2.5821, "step": 2450 }, { "epoch": 1.640214073671447, "grad_norm": 6.373087406158447, "learning_rate": 2.528237121795166e-05, "loss": 2.9019, "step": 2451 }, { "epoch": 1.6408830538947192, "grad_norm": 4.163835525512695, "learning_rate": 2.5262898085309777e-05, "loss": 2.7396, "step": 2452 }, { "epoch": 1.6415520341179914, "grad_norm": 5.687963008880615, "learning_rate": 2.524342479314219e-05, "loss": 2.6711, "step": 2453 }, { "epoch": 1.6422210143412634, "grad_norm": 5.724370956420898, "learning_rate": 2.5223951353265208e-05, "loss": 2.7225, "step": 2454 }, { "epoch": 1.6428899945645357, "grad_norm": 4.851588249206543, "learning_rate": 2.5204477777495262e-05, "loss": 2.823, "step": 2455 }, { "epoch": 1.6435589747878079, "grad_norm": 8.624882698059082, "learning_rate": 2.518500407764886e-05, "loss": 2.996, "step": 2456 }, { "epoch": 1.6442279550110799, "grad_norm": 4.683487892150879, "learning_rate": 2.5165530265542576e-05, "loss": 2.5252, "step": 2457 }, { "epoch": 1.644896935234352, "grad_norm": 5.397092819213867, "learning_rate": 2.5146056352993052e-05, "loss": 3.0858, "step": 2458 }, { "epoch": 1.6455659154576243, "grad_norm": 7.78914737701416, "learning_rate": 2.5126582351817003e-05, "loss": 2.6707, "step": 2459 }, { "epoch": 1.6462348956808963, "grad_norm": 5.520780086517334, "learning_rate": 2.5107108273831194e-05, "loss": 2.7152, "step": 2460 }, { "epoch": 1.6469038759041685, "grad_norm": 5.705004692077637, "learning_rate": 2.508763413085242e-05, "loss": 2.8296, "step": 2461 }, { "epoch": 1.6475728561274408, "grad_norm": 5.346718788146973, "learning_rate": 2.506815993469754e-05, "loss": 3.0625, "step": 2462 }, { "epoch": 1.6482418363507128, "grad_norm": 5.675193786621094, "learning_rate": 2.5048685697183425e-05, "loss": 2.6163, "step": 2463 }, { "epoch": 1.6489108165739852, "grad_norm": 8.798904418945312, "learning_rate": 2.502921143012699e-05, "loss": 2.8633, "step": 2464 }, { "epoch": 1.6495797967972572, "grad_norm": 6.258532524108887, "learning_rate": 2.500973714534515e-05, "loss": 2.7428, "step": 2465 }, { "epoch": 1.6502487770205292, "grad_norm": 6.154701232910156, "learning_rate": 2.499026285465485e-05, "loss": 2.7623, "step": 2466 }, { "epoch": 1.6509177572438016, "grad_norm": 6.404314041137695, "learning_rate": 2.497078856987301e-05, "loss": 2.805, "step": 2467 }, { "epoch": 1.6515867374670736, "grad_norm": 5.8303914070129395, "learning_rate": 2.495131430281658e-05, "loss": 3.0137, "step": 2468 }, { "epoch": 1.6522557176903456, "grad_norm": 4.913717746734619, "learning_rate": 2.4931840065302463e-05, "loss": 2.7482, "step": 2469 }, { "epoch": 1.652924697913618, "grad_norm": 3.7035882472991943, "learning_rate": 2.4912365869147585e-05, "loss": 2.2987, "step": 2470 }, { "epoch": 1.65359367813689, "grad_norm": 5.4154839515686035, "learning_rate": 2.4892891726168812e-05, "loss": 3.0513, "step": 2471 }, { "epoch": 1.654262658360162, "grad_norm": 5.4690842628479, "learning_rate": 2.4873417648183003e-05, "loss": 2.4291, "step": 2472 }, { "epoch": 1.6549316385834345, "grad_norm": 4.671787261962891, "learning_rate": 2.485394364700695e-05, "loss": 2.6835, "step": 2473 }, { "epoch": 1.6556006188067065, "grad_norm": 5.292201995849609, "learning_rate": 2.4834469734457433e-05, "loss": 2.7257, "step": 2474 }, { "epoch": 1.6562695990299787, "grad_norm": 4.135709762573242, "learning_rate": 2.4814995922351146e-05, "loss": 2.5343, "step": 2475 }, { "epoch": 1.656938579253251, "grad_norm": 5.647322654724121, "learning_rate": 2.4795522222504744e-05, "loss": 2.8363, "step": 2476 }, { "epoch": 1.657607559476523, "grad_norm": 6.4661970138549805, "learning_rate": 2.4776048646734794e-05, "loss": 2.9677, "step": 2477 }, { "epoch": 1.6582765396997952, "grad_norm": 4.214534759521484, "learning_rate": 2.4756575206857817e-05, "loss": 2.5381, "step": 2478 }, { "epoch": 1.6589455199230674, "grad_norm": 5.281362533569336, "learning_rate": 2.4737101914690222e-05, "loss": 2.8645, "step": 2479 }, { "epoch": 1.6596145001463394, "grad_norm": 4.652920722961426, "learning_rate": 2.4717628782048348e-05, "loss": 2.5176, "step": 2480 }, { "epoch": 1.6602834803696116, "grad_norm": 5.855139255523682, "learning_rate": 2.4698155820748407e-05, "loss": 2.8235, "step": 2481 }, { "epoch": 1.6609524605928838, "grad_norm": 5.996157169342041, "learning_rate": 2.4678683042606565e-05, "loss": 2.8375, "step": 2482 }, { "epoch": 1.6616214408161558, "grad_norm": 5.081485271453857, "learning_rate": 2.4659210459438816e-05, "loss": 2.8121, "step": 2483 }, { "epoch": 1.662290421039428, "grad_norm": 5.23148775100708, "learning_rate": 2.4639738083061075e-05, "loss": 2.7627, "step": 2484 }, { "epoch": 1.6629594012627003, "grad_norm": 5.417628288269043, "learning_rate": 2.4620265925289098e-05, "loss": 2.7727, "step": 2485 }, { "epoch": 1.6636283814859723, "grad_norm": 7.604246139526367, "learning_rate": 2.4600793997938563e-05, "loss": 2.7401, "step": 2486 }, { "epoch": 1.6642973617092445, "grad_norm": 5.298999309539795, "learning_rate": 2.458132231282493e-05, "loss": 2.683, "step": 2487 }, { "epoch": 1.6649663419325167, "grad_norm": 7.451618194580078, "learning_rate": 2.4561850881763588e-05, "loss": 2.9445, "step": 2488 }, { "epoch": 1.6656353221557887, "grad_norm": 5.242423057556152, "learning_rate": 2.454237971656972e-05, "loss": 2.6389, "step": 2489 }, { "epoch": 1.666304302379061, "grad_norm": 5.074806213378906, "learning_rate": 2.4522908829058383e-05, "loss": 2.7597, "step": 2490 }, { "epoch": 1.6669732826023331, "grad_norm": 6.158017158508301, "learning_rate": 2.4503438231044425e-05, "loss": 2.8183, "step": 2491 }, { "epoch": 1.6676422628256051, "grad_norm": 6.835614204406738, "learning_rate": 2.4483967934342574e-05, "loss": 2.8358, "step": 2492 }, { "epoch": 1.6683112430488773, "grad_norm": 4.561537742614746, "learning_rate": 2.446449795076732e-05, "loss": 2.5523, "step": 2493 }, { "epoch": 1.6689802232721496, "grad_norm": 5.291999340057373, "learning_rate": 2.4445028292132997e-05, "loss": 2.6435, "step": 2494 }, { "epoch": 1.6696492034954216, "grad_norm": 4.522041320800781, "learning_rate": 2.442555897025372e-05, "loss": 2.5627, "step": 2495 }, { "epoch": 1.6703181837186938, "grad_norm": 3.6309473514556885, "learning_rate": 2.4406089996943433e-05, "loss": 2.6634, "step": 2496 }, { "epoch": 1.670987163941966, "grad_norm": 9.647355079650879, "learning_rate": 2.4386621384015823e-05, "loss": 2.4431, "step": 2497 }, { "epoch": 1.671656144165238, "grad_norm": 5.466341495513916, "learning_rate": 2.43671531432844e-05, "loss": 2.7964, "step": 2498 }, { "epoch": 1.6723251243885102, "grad_norm": 4.108764171600342, "learning_rate": 2.434768528656241e-05, "loss": 2.6217, "step": 2499 }, { "epoch": 1.6729941046117824, "grad_norm": 5.70421028137207, "learning_rate": 2.432821782566291e-05, "loss": 2.7245, "step": 2500 }, { "epoch": 1.6736630848350544, "grad_norm": 6.602444648742676, "learning_rate": 2.430875077239868e-05, "loss": 2.9086, "step": 2501 }, { "epoch": 1.6743320650583269, "grad_norm": 4.562328815460205, "learning_rate": 2.428928413858227e-05, "loss": 2.7498, "step": 2502 }, { "epoch": 1.6750010452815989, "grad_norm": 7.803415775299072, "learning_rate": 2.4269817936025955e-05, "loss": 2.5058, "step": 2503 }, { "epoch": 1.6756700255048709, "grad_norm": 5.827186107635498, "learning_rate": 2.4250352176541788e-05, "loss": 2.9237, "step": 2504 }, { "epoch": 1.6763390057281433, "grad_norm": 4.48921537399292, "learning_rate": 2.4230886871941517e-05, "loss": 2.6325, "step": 2505 }, { "epoch": 1.6770079859514153, "grad_norm": 5.492165565490723, "learning_rate": 2.421142203403663e-05, "loss": 2.8539, "step": 2506 }, { "epoch": 1.6776769661746873, "grad_norm": 4.681061744689941, "learning_rate": 2.4191957674638317e-05, "loss": 2.7593, "step": 2507 }, { "epoch": 1.6783459463979598, "grad_norm": 5.666809558868408, "learning_rate": 2.4172493805557515e-05, "loss": 2.9197, "step": 2508 }, { "epoch": 1.6790149266212318, "grad_norm": 5.163647174835205, "learning_rate": 2.4153030438604806e-05, "loss": 2.7285, "step": 2509 }, { "epoch": 1.6796839068445037, "grad_norm": 6.088483810424805, "learning_rate": 2.4133567585590524e-05, "loss": 2.6448, "step": 2510 }, { "epoch": 1.6803528870677762, "grad_norm": 6.308938980102539, "learning_rate": 2.411410525832465e-05, "loss": 2.8534, "step": 2511 }, { "epoch": 1.6810218672910482, "grad_norm": 8.010528564453125, "learning_rate": 2.4094643468616874e-05, "loss": 3.0351, "step": 2512 }, { "epoch": 1.6816908475143204, "grad_norm": 4.884311676025391, "learning_rate": 2.4075182228276528e-05, "loss": 2.7731, "step": 2513 }, { "epoch": 1.6823598277375926, "grad_norm": 5.070252895355225, "learning_rate": 2.4055721549112654e-05, "loss": 2.6083, "step": 2514 }, { "epoch": 1.6830288079608646, "grad_norm": 5.1491241455078125, "learning_rate": 2.4036261442933913e-05, "loss": 2.544, "step": 2515 }, { "epoch": 1.6836977881841368, "grad_norm": 5.936621189117432, "learning_rate": 2.4016801921548648e-05, "loss": 2.6634, "step": 2516 }, { "epoch": 1.684366768407409, "grad_norm": 5.348430156707764, "learning_rate": 2.3997342996764818e-05, "loss": 2.4732, "step": 2517 }, { "epoch": 1.685035748630681, "grad_norm": 5.629446983337402, "learning_rate": 2.3977884680390056e-05, "loss": 2.8167, "step": 2518 }, { "epoch": 1.6857047288539533, "grad_norm": 4.8856024742126465, "learning_rate": 2.395842698423159e-05, "loss": 2.6797, "step": 2519 }, { "epoch": 1.6863737090772255, "grad_norm": 6.2281494140625, "learning_rate": 2.39389699200963e-05, "loss": 2.5547, "step": 2520 }, { "epoch": 1.6870426893004975, "grad_norm": 5.2909836769104, "learning_rate": 2.3919513499790648e-05, "loss": 2.8618, "step": 2521 }, { "epoch": 1.6877116695237697, "grad_norm": 6.354259014129639, "learning_rate": 2.390005773512076e-05, "loss": 2.5831, "step": 2522 }, { "epoch": 1.688380649747042, "grad_norm": 3.8957390785217285, "learning_rate": 2.3880602637892305e-05, "loss": 2.5017, "step": 2523 }, { "epoch": 1.689049629970314, "grad_norm": 4.465644359588623, "learning_rate": 2.386114821991059e-05, "loss": 2.4734, "step": 2524 }, { "epoch": 1.6897186101935862, "grad_norm": 5.646853923797607, "learning_rate": 2.3841694492980476e-05, "loss": 2.7715, "step": 2525 }, { "epoch": 1.6903875904168584, "grad_norm": 4.684238433837891, "learning_rate": 2.3822241468906448e-05, "loss": 2.8278, "step": 2526 }, { "epoch": 1.6910565706401304, "grad_norm": 5.539403438568115, "learning_rate": 2.3802789159492514e-05, "loss": 2.6784, "step": 2527 }, { "epoch": 1.6917255508634026, "grad_norm": 4.852906703948975, "learning_rate": 2.378333757654229e-05, "loss": 2.6266, "step": 2528 }, { "epoch": 1.6923945310866748, "grad_norm": 6.845240116119385, "learning_rate": 2.3763886731858916e-05, "loss": 2.9784, "step": 2529 }, { "epoch": 1.6930635113099468, "grad_norm": 5.903868675231934, "learning_rate": 2.374443663724513e-05, "loss": 2.7955, "step": 2530 }, { "epoch": 1.693732491533219, "grad_norm": 6.892399311065674, "learning_rate": 2.3724987304503152e-05, "loss": 2.7213, "step": 2531 }, { "epoch": 1.6944014717564913, "grad_norm": 4.848674774169922, "learning_rate": 2.3705538745434804e-05, "loss": 2.8574, "step": 2532 }, { "epoch": 1.6950704519797632, "grad_norm": 4.339311599731445, "learning_rate": 2.3686090971841395e-05, "loss": 2.878, "step": 2533 }, { "epoch": 1.6957394322030355, "grad_norm": 7.817746639251709, "learning_rate": 2.366664399552378e-05, "loss": 2.6093, "step": 2534 }, { "epoch": 1.6964084124263077, "grad_norm": 5.182209491729736, "learning_rate": 2.364719782828231e-05, "loss": 2.8362, "step": 2535 }, { "epoch": 1.6970773926495797, "grad_norm": 5.601119518280029, "learning_rate": 2.362775248191687e-05, "loss": 2.7463, "step": 2536 }, { "epoch": 1.697746372872852, "grad_norm": 6.002682685852051, "learning_rate": 2.3608307968226826e-05, "loss": 2.8956, "step": 2537 }, { "epoch": 1.6984153530961241, "grad_norm": 7.065710067749023, "learning_rate": 2.3588864299011055e-05, "loss": 2.5411, "step": 2538 }, { "epoch": 1.6990843333193961, "grad_norm": 5.531703948974609, "learning_rate": 2.3569421486067894e-05, "loss": 2.5599, "step": 2539 }, { "epoch": 1.6997533135426683, "grad_norm": 7.126420497894287, "learning_rate": 2.3549979541195204e-05, "loss": 2.8409, "step": 2540 }, { "epoch": 1.7004222937659406, "grad_norm": 5.211402893066406, "learning_rate": 2.3530538476190276e-05, "loss": 2.695, "step": 2541 }, { "epoch": 1.7010912739892126, "grad_norm": 7.738270282745361, "learning_rate": 2.3511098302849902e-05, "loss": 2.6719, "step": 2542 }, { "epoch": 1.701760254212485, "grad_norm": 4.948130130767822, "learning_rate": 2.3491659032970295e-05, "loss": 2.6603, "step": 2543 }, { "epoch": 1.702429234435757, "grad_norm": 5.112238883972168, "learning_rate": 2.347222067834717e-05, "loss": 2.7037, "step": 2544 }, { "epoch": 1.703098214659029, "grad_norm": 4.915751934051514, "learning_rate": 2.345278325077563e-05, "loss": 2.7256, "step": 2545 }, { "epoch": 1.7037671948823014, "grad_norm": 6.279435634613037, "learning_rate": 2.3433346762050268e-05, "loss": 2.9779, "step": 2546 }, { "epoch": 1.7044361751055734, "grad_norm": 3.470745801925659, "learning_rate": 2.3413911223965062e-05, "loss": 2.4658, "step": 2547 }, { "epoch": 1.7051051553288454, "grad_norm": 4.6231160163879395, "learning_rate": 2.3394476648313453e-05, "loss": 2.7526, "step": 2548 }, { "epoch": 1.7057741355521179, "grad_norm": 6.271169185638428, "learning_rate": 2.3375043046888268e-05, "loss": 2.7204, "step": 2549 }, { "epoch": 1.7064431157753899, "grad_norm": 6.299276351928711, "learning_rate": 2.335561043148176e-05, "loss": 2.6145, "step": 2550 }, { "epoch": 1.707112095998662, "grad_norm": 7.203403949737549, "learning_rate": 2.333617881388557e-05, "loss": 2.9738, "step": 2551 }, { "epoch": 1.7077810762219343, "grad_norm": 6.858221054077148, "learning_rate": 2.3316748205890753e-05, "loss": 2.9029, "step": 2552 }, { "epoch": 1.7084500564452063, "grad_norm": 5.399645805358887, "learning_rate": 2.3297318619287716e-05, "loss": 2.6019, "step": 2553 }, { "epoch": 1.7091190366684785, "grad_norm": 4.9139204025268555, "learning_rate": 2.3277890065866304e-05, "loss": 2.5849, "step": 2554 }, { "epoch": 1.7097880168917508, "grad_norm": 6.575718402862549, "learning_rate": 2.3258462557415673e-05, "loss": 2.7456, "step": 2555 }, { "epoch": 1.7104569971150227, "grad_norm": 5.743617057800293, "learning_rate": 2.323903610572439e-05, "loss": 2.9058, "step": 2556 }, { "epoch": 1.711125977338295, "grad_norm": 5.065006256103516, "learning_rate": 2.321961072258035e-05, "loss": 2.811, "step": 2557 }, { "epoch": 1.7117949575615672, "grad_norm": 3.897226095199585, "learning_rate": 2.3200186419770826e-05, "loss": 2.5951, "step": 2558 }, { "epoch": 1.7124639377848392, "grad_norm": 4.879258632659912, "learning_rate": 2.3180763209082415e-05, "loss": 2.7645, "step": 2559 }, { "epoch": 1.7131329180081114, "grad_norm": 5.2255072593688965, "learning_rate": 2.3161341102301065e-05, "loss": 2.7062, "step": 2560 }, { "epoch": 1.7138018982313836, "grad_norm": 5.900786399841309, "learning_rate": 2.3141920111212035e-05, "loss": 2.7575, "step": 2561 }, { "epoch": 1.7144708784546556, "grad_norm": 5.447509288787842, "learning_rate": 2.312250024759994e-05, "loss": 2.7415, "step": 2562 }, { "epoch": 1.7151398586779278, "grad_norm": 6.256686687469482, "learning_rate": 2.310308152324867e-05, "loss": 3.0382, "step": 2563 }, { "epoch": 1.7158088389012, "grad_norm": 4.430761814117432, "learning_rate": 2.3083663949941463e-05, "loss": 2.7999, "step": 2564 }, { "epoch": 1.716477819124472, "grad_norm": 5.721620559692383, "learning_rate": 2.3064247539460827e-05, "loss": 2.8862, "step": 2565 }, { "epoch": 1.7171467993477443, "grad_norm": 5.0352935791015625, "learning_rate": 2.3044832303588586e-05, "loss": 2.8294, "step": 2566 }, { "epoch": 1.7178157795710165, "grad_norm": 5.007111072540283, "learning_rate": 2.302541825410584e-05, "loss": 2.7377, "step": 2567 }, { "epoch": 1.7184847597942885, "grad_norm": 5.62821102142334, "learning_rate": 2.3006005402792976e-05, "loss": 2.6374, "step": 2568 }, { "epoch": 1.7191537400175607, "grad_norm": 9.058024406433105, "learning_rate": 2.2986593761429638e-05, "loss": 2.7939, "step": 2569 }, { "epoch": 1.719822720240833, "grad_norm": 4.320191383361816, "learning_rate": 2.2967183341794768e-05, "loss": 2.831, "step": 2570 }, { "epoch": 1.720491700464105, "grad_norm": 4.51514196395874, "learning_rate": 2.294777415566652e-05, "loss": 2.7715, "step": 2571 }, { "epoch": 1.7211606806873772, "grad_norm": 7.908797740936279, "learning_rate": 2.292836621482235e-05, "loss": 2.7382, "step": 2572 }, { "epoch": 1.7218296609106494, "grad_norm": 5.191076755523682, "learning_rate": 2.290895953103892e-05, "loss": 2.753, "step": 2573 }, { "epoch": 1.7224986411339214, "grad_norm": 5.839506149291992, "learning_rate": 2.2889554116092153e-05, "loss": 2.7941, "step": 2574 }, { "epoch": 1.7231676213571936, "grad_norm": 4.96605920791626, "learning_rate": 2.2870149981757177e-05, "loss": 2.588, "step": 2575 }, { "epoch": 1.7238366015804658, "grad_norm": 5.3897833824157715, "learning_rate": 2.285074713980838e-05, "loss": 2.7056, "step": 2576 }, { "epoch": 1.7245055818037378, "grad_norm": 4.577094554901123, "learning_rate": 2.283134560201933e-05, "loss": 2.8574, "step": 2577 }, { "epoch": 1.72517456202701, "grad_norm": 8.089116096496582, "learning_rate": 2.2811945380162824e-05, "loss": 2.7452, "step": 2578 }, { "epoch": 1.7258435422502822, "grad_norm": 10.091802597045898, "learning_rate": 2.2792546486010847e-05, "loss": 2.9093, "step": 2579 }, { "epoch": 1.7265125224735542, "grad_norm": 8.744396209716797, "learning_rate": 2.2773148931334594e-05, "loss": 2.7463, "step": 2580 }, { "epoch": 1.7271815026968267, "grad_norm": 5.300685405731201, "learning_rate": 2.275375272790444e-05, "loss": 2.885, "step": 2581 }, { "epoch": 1.7278504829200987, "grad_norm": 4.582159042358398, "learning_rate": 2.2734357887489934e-05, "loss": 2.3113, "step": 2582 }, { "epoch": 1.7285194631433707, "grad_norm": 4.907561302185059, "learning_rate": 2.2714964421859792e-05, "loss": 2.7512, "step": 2583 }, { "epoch": 1.7291884433666431, "grad_norm": 4.984943389892578, "learning_rate": 2.2695572342781935e-05, "loss": 2.5355, "step": 2584 }, { "epoch": 1.7298574235899151, "grad_norm": 6.7600932121276855, "learning_rate": 2.2676181662023387e-05, "loss": 2.7291, "step": 2585 }, { "epoch": 1.7305264038131871, "grad_norm": 4.853344440460205, "learning_rate": 2.2656792391350375e-05, "loss": 2.3747, "step": 2586 }, { "epoch": 1.7311953840364596, "grad_norm": 3.686701774597168, "learning_rate": 2.2637404542528217e-05, "loss": 2.7271, "step": 2587 }, { "epoch": 1.7318643642597316, "grad_norm": 6.155206203460693, "learning_rate": 2.2618018127321427e-05, "loss": 2.6733, "step": 2588 }, { "epoch": 1.7325333444830036, "grad_norm": 7.384573936462402, "learning_rate": 2.2598633157493604e-05, "loss": 2.9799, "step": 2589 }, { "epoch": 1.733202324706276, "grad_norm": 4.645562648773193, "learning_rate": 2.257924964480749e-05, "loss": 2.6954, "step": 2590 }, { "epoch": 1.733871304929548, "grad_norm": 5.855587482452393, "learning_rate": 2.2559867601024927e-05, "loss": 2.7054, "step": 2591 }, { "epoch": 1.7345402851528202, "grad_norm": 5.897169589996338, "learning_rate": 2.2540487037906906e-05, "loss": 2.9156, "step": 2592 }, { "epoch": 1.7352092653760924, "grad_norm": 6.18589973449707, "learning_rate": 2.2521107967213454e-05, "loss": 2.5325, "step": 2593 }, { "epoch": 1.7358782455993644, "grad_norm": 7.244022369384766, "learning_rate": 2.250173040070376e-05, "loss": 3.0386, "step": 2594 }, { "epoch": 1.7365472258226367, "grad_norm": 5.8611955642700195, "learning_rate": 2.2482354350136045e-05, "loss": 3.0563, "step": 2595 }, { "epoch": 1.7372162060459089, "grad_norm": 7.672198295593262, "learning_rate": 2.246297982726765e-05, "loss": 2.8646, "step": 2596 }, { "epoch": 1.7378851862691809, "grad_norm": 4.697488307952881, "learning_rate": 2.2443606843854964e-05, "loss": 2.7142, "step": 2597 }, { "epoch": 1.738554166492453, "grad_norm": 4.74916410446167, "learning_rate": 2.2424235411653466e-05, "loss": 2.5616, "step": 2598 }, { "epoch": 1.7392231467157253, "grad_norm": 7.083244323730469, "learning_rate": 2.240486554241766e-05, "loss": 2.6612, "step": 2599 }, { "epoch": 1.7398921269389973, "grad_norm": 7.194074630737305, "learning_rate": 2.238549724790114e-05, "loss": 2.8222, "step": 2600 }, { "epoch": 1.7405611071622695, "grad_norm": 5.098743915557861, "learning_rate": 2.23661305398565e-05, "loss": 2.7619, "step": 2601 }, { "epoch": 1.7412300873855417, "grad_norm": 6.6131134033203125, "learning_rate": 2.234676543003542e-05, "loss": 3.2162, "step": 2602 }, { "epoch": 1.7418990676088137, "grad_norm": 6.013036727905273, "learning_rate": 2.232740193018858e-05, "loss": 2.8715, "step": 2603 }, { "epoch": 1.742568047832086, "grad_norm": 4.700963973999023, "learning_rate": 2.230804005206569e-05, "loss": 2.4515, "step": 2604 }, { "epoch": 1.7432370280553582, "grad_norm": 6.152329444885254, "learning_rate": 2.228867980741546e-05, "loss": 2.8436, "step": 2605 }, { "epoch": 1.7439060082786302, "grad_norm": 5.965137004852295, "learning_rate": 2.2269321207985645e-05, "loss": 2.9214, "step": 2606 }, { "epoch": 1.7445749885019024, "grad_norm": 3.820052146911621, "learning_rate": 2.2249964265522972e-05, "loss": 2.6333, "step": 2607 }, { "epoch": 1.7452439687251746, "grad_norm": 5.213476181030273, "learning_rate": 2.223060899177318e-05, "loss": 2.8786, "step": 2608 }, { "epoch": 1.7459129489484466, "grad_norm": 5.23811149597168, "learning_rate": 2.2211255398480967e-05, "loss": 2.7324, "step": 2609 }, { "epoch": 1.7465819291717188, "grad_norm": 4.518553256988525, "learning_rate": 2.2191903497390057e-05, "loss": 2.5349, "step": 2610 }, { "epoch": 1.747250909394991, "grad_norm": 5.865686893463135, "learning_rate": 2.2172553300243103e-05, "loss": 2.7363, "step": 2611 }, { "epoch": 1.747919889618263, "grad_norm": 4.556504249572754, "learning_rate": 2.2153204818781757e-05, "loss": 2.9129, "step": 2612 }, { "epoch": 1.7485888698415353, "grad_norm": 6.721836566925049, "learning_rate": 2.2133858064746592e-05, "loss": 2.8609, "step": 2613 }, { "epoch": 1.7492578500648075, "grad_norm": 7.764565944671631, "learning_rate": 2.2114513049877195e-05, "loss": 3.3474, "step": 2614 }, { "epoch": 1.7499268302880795, "grad_norm": 6.830298900604248, "learning_rate": 2.2095169785912015e-05, "loss": 2.5932, "step": 2615 }, { "epoch": 1.7505958105113517, "grad_norm": 4.948487281799316, "learning_rate": 2.2075828284588514e-05, "loss": 2.5113, "step": 2616 }, { "epoch": 1.751264790734624, "grad_norm": 4.6457839012146, "learning_rate": 2.205648855764304e-05, "loss": 2.807, "step": 2617 }, { "epoch": 1.751933770957896, "grad_norm": 5.526391506195068, "learning_rate": 2.2037150616810883e-05, "loss": 2.76, "step": 2618 }, { "epoch": 1.7526027511811684, "grad_norm": 10.367713928222656, "learning_rate": 2.2017814473826232e-05, "loss": 2.5667, "step": 2619 }, { "epoch": 1.7532717314044404, "grad_norm": 4.087628364562988, "learning_rate": 2.1998480140422214e-05, "loss": 2.9489, "step": 2620 }, { "epoch": 1.7539407116277124, "grad_norm": 6.0001959800720215, "learning_rate": 2.197914762833083e-05, "loss": 2.5951, "step": 2621 }, { "epoch": 1.7546096918509848, "grad_norm": 3.751884698867798, "learning_rate": 2.195981694928299e-05, "loss": 2.2341, "step": 2622 }, { "epoch": 1.7552786720742568, "grad_norm": 7.510499477386475, "learning_rate": 2.1940488115008475e-05, "loss": 2.8456, "step": 2623 }, { "epoch": 1.7559476522975288, "grad_norm": 5.829528331756592, "learning_rate": 2.1921161137235986e-05, "loss": 2.8387, "step": 2624 }, { "epoch": 1.7566166325208012, "grad_norm": 4.678679466247559, "learning_rate": 2.1901836027693047e-05, "loss": 2.6812, "step": 2625 }, { "epoch": 1.7572856127440732, "grad_norm": 5.86160135269165, "learning_rate": 2.188251279810609e-05, "loss": 2.946, "step": 2626 }, { "epoch": 1.7579545929673452, "grad_norm": 7.377922058105469, "learning_rate": 2.1863191460200366e-05, "loss": 2.8606, "step": 2627 }, { "epoch": 1.7586235731906177, "grad_norm": 5.642475605010986, "learning_rate": 2.184387202570003e-05, "loss": 2.6162, "step": 2628 }, { "epoch": 1.7592925534138897, "grad_norm": 5.498693943023682, "learning_rate": 2.182455450632803e-05, "loss": 2.7145, "step": 2629 }, { "epoch": 1.759961533637162, "grad_norm": 5.875025749206543, "learning_rate": 2.180523891380619e-05, "loss": 2.6928, "step": 2630 }, { "epoch": 1.7606305138604341, "grad_norm": 5.917715072631836, "learning_rate": 2.1785925259855135e-05, "loss": 2.7247, "step": 2631 }, { "epoch": 1.7612994940837061, "grad_norm": 6.21785831451416, "learning_rate": 2.1766613556194347e-05, "loss": 2.7966, "step": 2632 }, { "epoch": 1.7619684743069783, "grad_norm": 4.751443386077881, "learning_rate": 2.1747303814542087e-05, "loss": 2.6551, "step": 2633 }, { "epoch": 1.7626374545302506, "grad_norm": 6.275938510894775, "learning_rate": 2.172799604661546e-05, "loss": 2.9517, "step": 2634 }, { "epoch": 1.7633064347535226, "grad_norm": 5.280247688293457, "learning_rate": 2.1708690264130342e-05, "loss": 2.7783, "step": 2635 }, { "epoch": 1.7639754149767948, "grad_norm": 6.113962173461914, "learning_rate": 2.1689386478801438e-05, "loss": 2.9751, "step": 2636 }, { "epoch": 1.764644395200067, "grad_norm": 6.94852352142334, "learning_rate": 2.1670084702342204e-05, "loss": 2.8749, "step": 2637 }, { "epoch": 1.765313375423339, "grad_norm": 6.6532392501831055, "learning_rate": 2.165078494646491e-05, "loss": 2.6929, "step": 2638 }, { "epoch": 1.7659823556466112, "grad_norm": 5.977123737335205, "learning_rate": 2.163148722288058e-05, "loss": 2.8827, "step": 2639 }, { "epoch": 1.7666513358698834, "grad_norm": 5.5445427894592285, "learning_rate": 2.161219154329902e-05, "loss": 2.7309, "step": 2640 }, { "epoch": 1.7673203160931554, "grad_norm": 5.397950649261475, "learning_rate": 2.1592897919428765e-05, "loss": 2.9227, "step": 2641 }, { "epoch": 1.7679892963164276, "grad_norm": 5.681791305541992, "learning_rate": 2.157360636297715e-05, "loss": 2.6949, "step": 2642 }, { "epoch": 1.7686582765396999, "grad_norm": 5.597262382507324, "learning_rate": 2.155431688565021e-05, "loss": 2.784, "step": 2643 }, { "epoch": 1.7693272567629719, "grad_norm": 7.133121967315674, "learning_rate": 2.1535029499152757e-05, "loss": 2.8139, "step": 2644 }, { "epoch": 1.769996236986244, "grad_norm": 4.787456035614014, "learning_rate": 2.151574421518829e-05, "loss": 2.7211, "step": 2645 }, { "epoch": 1.7706652172095163, "grad_norm": 9.780879974365234, "learning_rate": 2.1496461045459083e-05, "loss": 2.9031, "step": 2646 }, { "epoch": 1.7713341974327883, "grad_norm": 6.123249530792236, "learning_rate": 2.1477180001666084e-05, "loss": 2.6404, "step": 2647 }, { "epoch": 1.7720031776560605, "grad_norm": 5.120070934295654, "learning_rate": 2.145790109550898e-05, "loss": 2.6474, "step": 2648 }, { "epoch": 1.7726721578793327, "grad_norm": 6.845491886138916, "learning_rate": 2.1438624338686135e-05, "loss": 2.9112, "step": 2649 }, { "epoch": 1.7733411381026047, "grad_norm": 7.510354042053223, "learning_rate": 2.1419349742894645e-05, "loss": 2.7838, "step": 2650 }, { "epoch": 1.774010118325877, "grad_norm": 4.251450061798096, "learning_rate": 2.1400077319830255e-05, "loss": 2.5678, "step": 2651 }, { "epoch": 1.7746790985491492, "grad_norm": 7.768141269683838, "learning_rate": 2.138080708118742e-05, "loss": 2.8773, "step": 2652 }, { "epoch": 1.7753480787724212, "grad_norm": 4.0545549392700195, "learning_rate": 2.1361539038659246e-05, "loss": 2.7939, "step": 2653 }, { "epoch": 1.7760170589956934, "grad_norm": 4.221911430358887, "learning_rate": 2.134227320393754e-05, "loss": 2.6346, "step": 2654 }, { "epoch": 1.7766860392189656, "grad_norm": 5.212001800537109, "learning_rate": 2.1323009588712723e-05, "loss": 2.7204, "step": 2655 }, { "epoch": 1.7773550194422376, "grad_norm": 5.01778507232666, "learning_rate": 2.130374820467392e-05, "loss": 2.8009, "step": 2656 }, { "epoch": 1.7780239996655098, "grad_norm": 5.883147716522217, "learning_rate": 2.1284489063508863e-05, "loss": 2.7216, "step": 2657 }, { "epoch": 1.778692979888782, "grad_norm": 7.345731735229492, "learning_rate": 2.126523217690394e-05, "loss": 2.928, "step": 2658 }, { "epoch": 1.779361960112054, "grad_norm": 6.369029521942139, "learning_rate": 2.1245977556544157e-05, "loss": 2.7213, "step": 2659 }, { "epoch": 1.7800309403353265, "grad_norm": 8.463759422302246, "learning_rate": 2.122672521411318e-05, "loss": 2.7376, "step": 2660 }, { "epoch": 1.7806999205585985, "grad_norm": 7.220532417297363, "learning_rate": 2.1207475161293255e-05, "loss": 2.9469, "step": 2661 }, { "epoch": 1.7813689007818705, "grad_norm": 7.100991249084473, "learning_rate": 2.1188227409765255e-05, "loss": 2.9285, "step": 2662 }, { "epoch": 1.782037881005143, "grad_norm": 6.796350002288818, "learning_rate": 2.116898197120864e-05, "loss": 2.7046, "step": 2663 }, { "epoch": 1.782706861228415, "grad_norm": 5.8627753257751465, "learning_rate": 2.1149738857301503e-05, "loss": 2.6686, "step": 2664 }, { "epoch": 1.783375841451687, "grad_norm": 4.756777763366699, "learning_rate": 2.1130498079720493e-05, "loss": 2.7317, "step": 2665 }, { "epoch": 1.7840448216749594, "grad_norm": 4.591255187988281, "learning_rate": 2.1111259650140854e-05, "loss": 2.4963, "step": 2666 }, { "epoch": 1.7847138018982314, "grad_norm": 5.408332824707031, "learning_rate": 2.1092023580236394e-05, "loss": 2.5654, "step": 2667 }, { "epoch": 1.7853827821215036, "grad_norm": 4.69365930557251, "learning_rate": 2.1072789881679514e-05, "loss": 2.4306, "step": 2668 }, { "epoch": 1.7860517623447758, "grad_norm": 5.50496244430542, "learning_rate": 2.105355856614115e-05, "loss": 2.7352, "step": 2669 }, { "epoch": 1.7867207425680478, "grad_norm": 5.13930082321167, "learning_rate": 2.1034329645290813e-05, "loss": 2.6872, "step": 2670 }, { "epoch": 1.78738972279132, "grad_norm": 6.122743129730225, "learning_rate": 2.101510313079653e-05, "loss": 2.9768, "step": 2671 }, { "epoch": 1.7880587030145922, "grad_norm": 5.980995178222656, "learning_rate": 2.0995879034324915e-05, "loss": 2.9266, "step": 2672 }, { "epoch": 1.7887276832378642, "grad_norm": 6.114793300628662, "learning_rate": 2.0976657367541068e-05, "loss": 2.8527, "step": 2673 }, { "epoch": 1.7893966634611365, "grad_norm": 4.264657497406006, "learning_rate": 2.095743814210865e-05, "loss": 2.8714, "step": 2674 }, { "epoch": 1.7900656436844087, "grad_norm": 4.827832221984863, "learning_rate": 2.0938221369689806e-05, "loss": 2.8182, "step": 2675 }, { "epoch": 1.7907346239076807, "grad_norm": 5.008242607116699, "learning_rate": 2.091900706194524e-05, "loss": 2.5002, "step": 2676 }, { "epoch": 1.791403604130953, "grad_norm": 5.181432247161865, "learning_rate": 2.0899795230534097e-05, "loss": 2.616, "step": 2677 }, { "epoch": 1.7920725843542251, "grad_norm": 7.064171314239502, "learning_rate": 2.0880585887114086e-05, "loss": 2.7894, "step": 2678 }, { "epoch": 1.7927415645774971, "grad_norm": 5.147838115692139, "learning_rate": 2.0861379043341357e-05, "loss": 2.6627, "step": 2679 }, { "epoch": 1.7934105448007693, "grad_norm": 7.886783599853516, "learning_rate": 2.0842174710870575e-05, "loss": 2.7651, "step": 2680 }, { "epoch": 1.7940795250240416, "grad_norm": 4.25205135345459, "learning_rate": 2.0822972901354844e-05, "loss": 2.7712, "step": 2681 }, { "epoch": 1.7947485052473136, "grad_norm": 5.357091426849365, "learning_rate": 2.080377362644579e-05, "loss": 2.6271, "step": 2682 }, { "epoch": 1.7954174854705858, "grad_norm": 3.9298439025878906, "learning_rate": 2.0784576897793452e-05, "loss": 2.7159, "step": 2683 }, { "epoch": 1.796086465693858, "grad_norm": 6.184408664703369, "learning_rate": 2.0765382727046353e-05, "loss": 2.6444, "step": 2684 }, { "epoch": 1.79675544591713, "grad_norm": 4.316830635070801, "learning_rate": 2.074619112585144e-05, "loss": 2.7326, "step": 2685 }, { "epoch": 1.7974244261404022, "grad_norm": 5.157546043395996, "learning_rate": 2.0727002105854136e-05, "loss": 2.558, "step": 2686 }, { "epoch": 1.7980934063636744, "grad_norm": 5.2561421394348145, "learning_rate": 2.070781567869826e-05, "loss": 2.6098, "step": 2687 }, { "epoch": 1.7987623865869464, "grad_norm": 6.838529586791992, "learning_rate": 2.0688631856026088e-05, "loss": 2.958, "step": 2688 }, { "epoch": 1.7994313668102186, "grad_norm": 6.147638320922852, "learning_rate": 2.0669450649478283e-05, "loss": 2.7633, "step": 2689 }, { "epoch": 1.8001003470334909, "grad_norm": 5.286609172821045, "learning_rate": 2.065027207069396e-05, "loss": 3.0781, "step": 2690 }, { "epoch": 1.8007693272567629, "grad_norm": 4.2134108543396, "learning_rate": 2.063109613131061e-05, "loss": 2.7279, "step": 2691 }, { "epoch": 1.801438307480035, "grad_norm": 5.509737014770508, "learning_rate": 2.0611922842964135e-05, "loss": 2.8086, "step": 2692 }, { "epoch": 1.8021072877033073, "grad_norm": 3.9975788593292236, "learning_rate": 2.059275221728881e-05, "loss": 2.8087, "step": 2693 }, { "epoch": 1.8027762679265793, "grad_norm": 5.958218097686768, "learning_rate": 2.0573584265917332e-05, "loss": 3.0113, "step": 2694 }, { "epoch": 1.8034452481498515, "grad_norm": 4.695807456970215, "learning_rate": 2.055441900048074e-05, "loss": 2.832, "step": 2695 }, { "epoch": 1.8041142283731237, "grad_norm": 6.1919121742248535, "learning_rate": 2.0535256432608464e-05, "loss": 2.9007, "step": 2696 }, { "epoch": 1.8047832085963957, "grad_norm": 6.809114456176758, "learning_rate": 2.051609657392827e-05, "loss": 2.8798, "step": 2697 }, { "epoch": 1.8054521888196682, "grad_norm": 4.31395959854126, "learning_rate": 2.0496939436066324e-05, "loss": 2.6148, "step": 2698 }, { "epoch": 1.8061211690429402, "grad_norm": 4.20595121383667, "learning_rate": 2.047778503064709e-05, "loss": 2.6138, "step": 2699 }, { "epoch": 1.8067901492662122, "grad_norm": 4.048036098480225, "learning_rate": 2.0458633369293424e-05, "loss": 2.6183, "step": 2700 }, { "epoch": 1.8074591294894846, "grad_norm": 6.035735607147217, "learning_rate": 2.0439484463626475e-05, "loss": 2.5516, "step": 2701 }, { "epoch": 1.8081281097127566, "grad_norm": 6.940889835357666, "learning_rate": 2.042033832526575e-05, "loss": 2.6495, "step": 2702 }, { "epoch": 1.8087970899360286, "grad_norm": 5.333974361419678, "learning_rate": 2.0401194965829048e-05, "loss": 2.5791, "step": 2703 }, { "epoch": 1.809466070159301, "grad_norm": 4.075145244598389, "learning_rate": 2.038205439693252e-05, "loss": 2.6834, "step": 2704 }, { "epoch": 1.810135050382573, "grad_norm": 5.112520217895508, "learning_rate": 2.0362916630190587e-05, "loss": 2.7141, "step": 2705 }, { "epoch": 1.810804030605845, "grad_norm": 6.721662521362305, "learning_rate": 2.0343781677215992e-05, "loss": 2.7941, "step": 2706 }, { "epoch": 1.8114730108291175, "grad_norm": 6.038949489593506, "learning_rate": 2.032464954961975e-05, "loss": 2.5827, "step": 2707 }, { "epoch": 1.8121419910523895, "grad_norm": 5.384756565093994, "learning_rate": 2.0305520259011195e-05, "loss": 2.9197, "step": 2708 }, { "epoch": 1.8128109712756617, "grad_norm": 4.2328081130981445, "learning_rate": 2.0286393816997905e-05, "loss": 2.7072, "step": 2709 }, { "epoch": 1.813479951498934, "grad_norm": 5.852884292602539, "learning_rate": 2.0267270235185748e-05, "loss": 2.8412, "step": 2710 }, { "epoch": 1.814148931722206, "grad_norm": 5.757684707641602, "learning_rate": 2.0248149525178846e-05, "loss": 2.8457, "step": 2711 }, { "epoch": 1.8148179119454781, "grad_norm": 3.801441192626953, "learning_rate": 2.02290316985796e-05, "loss": 2.6505, "step": 2712 }, { "epoch": 1.8154868921687504, "grad_norm": 6.909331321716309, "learning_rate": 2.0209916766988627e-05, "loss": 2.8613, "step": 2713 }, { "epoch": 1.8161558723920224, "grad_norm": 4.29110050201416, "learning_rate": 2.0190804742004823e-05, "loss": 2.7479, "step": 2714 }, { "epoch": 1.8168248526152946, "grad_norm": 5.398097038269043, "learning_rate": 2.0171695635225286e-05, "loss": 2.7035, "step": 2715 }, { "epoch": 1.8174938328385668, "grad_norm": 4.887238502502441, "learning_rate": 2.015258945824538e-05, "loss": 2.666, "step": 2716 }, { "epoch": 1.8181628130618388, "grad_norm": 4.847476959228516, "learning_rate": 2.013348622265866e-05, "loss": 2.6205, "step": 2717 }, { "epoch": 1.818831793285111, "grad_norm": 6.064548492431641, "learning_rate": 2.011438594005691e-05, "loss": 2.5477, "step": 2718 }, { "epoch": 1.8195007735083832, "grad_norm": 6.697597503662109, "learning_rate": 2.009528862203012e-05, "loss": 2.7419, "step": 2719 }, { "epoch": 1.8201697537316552, "grad_norm": 5.916878700256348, "learning_rate": 2.007619428016649e-05, "loss": 2.6181, "step": 2720 }, { "epoch": 1.8208387339549275, "grad_norm": 6.983180046081543, "learning_rate": 2.0057102926052384e-05, "loss": 2.7966, "step": 2721 }, { "epoch": 1.8215077141781997, "grad_norm": 5.324311256408691, "learning_rate": 2.00380145712724e-05, "loss": 2.5784, "step": 2722 }, { "epoch": 1.8221766944014717, "grad_norm": 4.834647178649902, "learning_rate": 2.0018929227409276e-05, "loss": 2.6259, "step": 2723 }, { "epoch": 1.822845674624744, "grad_norm": 8.242607116699219, "learning_rate": 1.9999846906043944e-05, "loss": 2.9843, "step": 2724 }, { "epoch": 1.8235146548480161, "grad_norm": 7.778139591217041, "learning_rate": 1.998076761875548e-05, "loss": 2.7287, "step": 2725 }, { "epoch": 1.8241836350712881, "grad_norm": 5.7373151779174805, "learning_rate": 1.996169137712116e-05, "loss": 2.6754, "step": 2726 }, { "epoch": 1.8248526152945603, "grad_norm": 6.458043098449707, "learning_rate": 1.9942618192716367e-05, "loss": 3.1508, "step": 2727 }, { "epoch": 1.8255215955178326, "grad_norm": 5.5279154777526855, "learning_rate": 1.9923548077114657e-05, "loss": 2.6276, "step": 2728 }, { "epoch": 1.8261905757411045, "grad_norm": 5.591763496398926, "learning_rate": 1.99044810418877e-05, "loss": 2.6509, "step": 2729 }, { "epoch": 1.8268595559643768, "grad_norm": 7.234933376312256, "learning_rate": 1.9885417098605342e-05, "loss": 2.9541, "step": 2730 }, { "epoch": 1.827528536187649, "grad_norm": 6.901897430419922, "learning_rate": 1.986635625883549e-05, "loss": 2.717, "step": 2731 }, { "epoch": 1.828197516410921, "grad_norm": 6.3411030769348145, "learning_rate": 1.9847298534144225e-05, "loss": 2.6243, "step": 2732 }, { "epoch": 1.8288664966341932, "grad_norm": 3.945781707763672, "learning_rate": 1.9828243936095696e-05, "loss": 2.7439, "step": 2733 }, { "epoch": 1.8295354768574654, "grad_norm": 4.212995529174805, "learning_rate": 1.9809192476252187e-05, "loss": 2.5473, "step": 2734 }, { "epoch": 1.8302044570807374, "grad_norm": 6.0992584228515625, "learning_rate": 1.979014416617405e-05, "loss": 2.7305, "step": 2735 }, { "epoch": 1.8308734373040099, "grad_norm": 7.614537715911865, "learning_rate": 1.9771099017419746e-05, "loss": 3.1265, "step": 2736 }, { "epoch": 1.8315424175272819, "grad_norm": 5.30592155456543, "learning_rate": 1.9752057041545803e-05, "loss": 2.6206, "step": 2737 }, { "epoch": 1.8322113977505539, "grad_norm": 8.460625648498535, "learning_rate": 1.973301825010685e-05, "loss": 2.8481, "step": 2738 }, { "epoch": 1.8328803779738263, "grad_norm": 5.965261459350586, "learning_rate": 1.9713982654655534e-05, "loss": 2.6954, "step": 2739 }, { "epoch": 1.8335493581970983, "grad_norm": 7.425351142883301, "learning_rate": 1.9694950266742622e-05, "loss": 2.7809, "step": 2740 }, { "epoch": 1.8342183384203703, "grad_norm": 6.250692844390869, "learning_rate": 1.9675921097916887e-05, "loss": 2.7152, "step": 2741 }, { "epoch": 1.8348873186436427, "grad_norm": 6.669025421142578, "learning_rate": 1.965689515972518e-05, "loss": 2.9273, "step": 2742 }, { "epoch": 1.8355562988669147, "grad_norm": 4.485771179199219, "learning_rate": 1.9637872463712365e-05, "loss": 2.562, "step": 2743 }, { "epoch": 1.8362252790901867, "grad_norm": 7.555203437805176, "learning_rate": 1.9618853021421373e-05, "loss": 2.7249, "step": 2744 }, { "epoch": 1.8368942593134592, "grad_norm": 5.925047397613525, "learning_rate": 1.9599836844393122e-05, "loss": 2.7757, "step": 2745 }, { "epoch": 1.8375632395367312, "grad_norm": 7.533424377441406, "learning_rate": 1.9580823944166583e-05, "loss": 2.8247, "step": 2746 }, { "epoch": 1.8382322197600034, "grad_norm": 7.545873641967773, "learning_rate": 1.9561814332278704e-05, "loss": 2.8188, "step": 2747 }, { "epoch": 1.8389011999832756, "grad_norm": 5.071416854858398, "learning_rate": 1.9542808020264474e-05, "loss": 2.7274, "step": 2748 }, { "epoch": 1.8395701802065476, "grad_norm": 5.147225379943848, "learning_rate": 1.9523805019656854e-05, "loss": 2.8018, "step": 2749 }, { "epoch": 1.8402391604298198, "grad_norm": 6.20609188079834, "learning_rate": 1.9504805341986812e-05, "loss": 2.7468, "step": 2750 }, { "epoch": 1.840908140653092, "grad_norm": 5.602138996124268, "learning_rate": 1.9485808998783275e-05, "loss": 2.7135, "step": 2751 }, { "epoch": 1.841577120876364, "grad_norm": 5.072507381439209, "learning_rate": 1.9466816001573183e-05, "loss": 2.6772, "step": 2752 }, { "epoch": 1.8422461010996363, "grad_norm": 5.482741832733154, "learning_rate": 1.944782636188141e-05, "loss": 2.7309, "step": 2753 }, { "epoch": 1.8429150813229085, "grad_norm": 5.543910980224609, "learning_rate": 1.942884009123082e-05, "loss": 2.9191, "step": 2754 }, { "epoch": 1.8435840615461805, "grad_norm": 6.336831569671631, "learning_rate": 1.9409857201142208e-05, "loss": 2.7211, "step": 2755 }, { "epoch": 1.8442530417694527, "grad_norm": 4.116778373718262, "learning_rate": 1.939087770313435e-05, "loss": 2.8932, "step": 2756 }, { "epoch": 1.844922021992725, "grad_norm": 5.453319072723389, "learning_rate": 1.9371901608723923e-05, "loss": 2.7212, "step": 2757 }, { "epoch": 1.845591002215997, "grad_norm": 5.0413289070129395, "learning_rate": 1.9352928929425586e-05, "loss": 2.515, "step": 2758 }, { "epoch": 1.8462599824392691, "grad_norm": 3.929534912109375, "learning_rate": 1.9333959676751863e-05, "loss": 2.5231, "step": 2759 }, { "epoch": 1.8469289626625414, "grad_norm": 5.582021713256836, "learning_rate": 1.9314993862213283e-05, "loss": 2.6033, "step": 2760 }, { "epoch": 1.8475979428858134, "grad_norm": 5.542810916900635, "learning_rate": 1.9296031497318194e-05, "loss": 2.8874, "step": 2761 }, { "epoch": 1.8482669231090856, "grad_norm": 5.410712242126465, "learning_rate": 1.9277072593572933e-05, "loss": 2.8732, "step": 2762 }, { "epoch": 1.8489359033323578, "grad_norm": 5.130572319030762, "learning_rate": 1.9258117162481686e-05, "loss": 2.5383, "step": 2763 }, { "epoch": 1.8496048835556298, "grad_norm": 4.82888650894165, "learning_rate": 1.9239165215546556e-05, "loss": 2.6045, "step": 2764 }, { "epoch": 1.850273863778902, "grad_norm": 6.0040106773376465, "learning_rate": 1.9220216764267508e-05, "loss": 2.8346, "step": 2765 }, { "epoch": 1.8509428440021742, "grad_norm": 3.895348072052002, "learning_rate": 1.9201271820142422e-05, "loss": 2.5414, "step": 2766 }, { "epoch": 1.8516118242254462, "grad_norm": 4.430002689361572, "learning_rate": 1.9182330394667016e-05, "loss": 2.7993, "step": 2767 }, { "epoch": 1.8522808044487185, "grad_norm": 6.686192512512207, "learning_rate": 1.9163392499334896e-05, "loss": 2.8709, "step": 2768 }, { "epoch": 1.8529497846719907, "grad_norm": 5.229307174682617, "learning_rate": 1.9144458145637498e-05, "loss": 2.8088, "step": 2769 }, { "epoch": 1.8536187648952627, "grad_norm": 5.271556854248047, "learning_rate": 1.9125527345064152e-05, "loss": 2.9849, "step": 2770 }, { "epoch": 1.854287745118535, "grad_norm": 3.884953737258911, "learning_rate": 1.9106600109101988e-05, "loss": 2.742, "step": 2771 }, { "epoch": 1.8549567253418071, "grad_norm": 6.681604385375977, "learning_rate": 1.9087676449236007e-05, "loss": 2.8005, "step": 2772 }, { "epoch": 1.855625705565079, "grad_norm": 6.033226013183594, "learning_rate": 1.9068756376949003e-05, "loss": 2.7764, "step": 2773 }, { "epoch": 1.8562946857883513, "grad_norm": 4.049818992614746, "learning_rate": 1.9049839903721646e-05, "loss": 2.5804, "step": 2774 }, { "epoch": 1.8569636660116235, "grad_norm": 5.595391273498535, "learning_rate": 1.903092704103236e-05, "loss": 2.7069, "step": 2775 }, { "epoch": 1.8576326462348955, "grad_norm": 6.733685493469238, "learning_rate": 1.9012017800357437e-05, "loss": 2.8359, "step": 2776 }, { "epoch": 1.858301626458168, "grad_norm": 5.3344526290893555, "learning_rate": 1.899311219317092e-05, "loss": 2.6181, "step": 2777 }, { "epoch": 1.85897060668144, "grad_norm": 6.081495761871338, "learning_rate": 1.897421023094469e-05, "loss": 2.9845, "step": 2778 }, { "epoch": 1.859639586904712, "grad_norm": 5.906316757202148, "learning_rate": 1.8955311925148387e-05, "loss": 2.5286, "step": 2779 }, { "epoch": 1.8603085671279844, "grad_norm": 5.377201557159424, "learning_rate": 1.893641728724945e-05, "loss": 2.6125, "step": 2780 }, { "epoch": 1.8609775473512564, "grad_norm": 5.031513690948486, "learning_rate": 1.891752632871306e-05, "loss": 2.6942, "step": 2781 }, { "epoch": 1.8616465275745284, "grad_norm": 5.494041919708252, "learning_rate": 1.8898639061002234e-05, "loss": 2.7891, "step": 2782 }, { "epoch": 1.8623155077978009, "grad_norm": 7.990489482879639, "learning_rate": 1.887975549557766e-05, "loss": 2.9002, "step": 2783 }, { "epoch": 1.8629844880210729, "grad_norm": 6.093759059906006, "learning_rate": 1.8860875643897864e-05, "loss": 2.9595, "step": 2784 }, { "epoch": 1.863653468244345, "grad_norm": 5.460567474365234, "learning_rate": 1.884199951741905e-05, "loss": 2.7114, "step": 2785 }, { "epoch": 1.8643224484676173, "grad_norm": 6.7065749168396, "learning_rate": 1.8823127127595207e-05, "loss": 2.9208, "step": 2786 }, { "epoch": 1.8649914286908893, "grad_norm": 6.396170139312744, "learning_rate": 1.8804258485878025e-05, "loss": 2.9138, "step": 2787 }, { "epoch": 1.8656604089141615, "grad_norm": 6.916308879852295, "learning_rate": 1.8785393603716962e-05, "loss": 3.1748, "step": 2788 }, { "epoch": 1.8663293891374337, "grad_norm": 4.9034223556518555, "learning_rate": 1.8766532492559144e-05, "loss": 2.8398, "step": 2789 }, { "epoch": 1.8669983693607057, "grad_norm": 6.835448741912842, "learning_rate": 1.8747675163849445e-05, "loss": 2.8856, "step": 2790 }, { "epoch": 1.867667349583978, "grad_norm": 5.442148685455322, "learning_rate": 1.872882162903042e-05, "loss": 2.7276, "step": 2791 }, { "epoch": 1.8683363298072502, "grad_norm": 6.819629192352295, "learning_rate": 1.8709971899542352e-05, "loss": 2.6695, "step": 2792 }, { "epoch": 1.8690053100305222, "grad_norm": 7.096249103546143, "learning_rate": 1.8691125986823182e-05, "loss": 3.0836, "step": 2793 }, { "epoch": 1.8696742902537944, "grad_norm": 6.612250328063965, "learning_rate": 1.8672283902308557e-05, "loss": 2.8811, "step": 2794 }, { "epoch": 1.8703432704770666, "grad_norm": 5.536661148071289, "learning_rate": 1.8653445657431777e-05, "loss": 2.7357, "step": 2795 }, { "epoch": 1.8710122507003386, "grad_norm": 5.26278829574585, "learning_rate": 1.863461126362386e-05, "loss": 2.3215, "step": 2796 }, { "epoch": 1.8716812309236108, "grad_norm": 4.44549036026001, "learning_rate": 1.8615780732313425e-05, "loss": 2.5687, "step": 2797 }, { "epoch": 1.872350211146883, "grad_norm": 4.844627380371094, "learning_rate": 1.85969540749268e-05, "loss": 2.6822, "step": 2798 }, { "epoch": 1.873019191370155, "grad_norm": 8.582950592041016, "learning_rate": 1.8578131302887915e-05, "loss": 2.5262, "step": 2799 }, { "epoch": 1.8736881715934273, "grad_norm": 5.140320777893066, "learning_rate": 1.8559312427618397e-05, "loss": 2.6723, "step": 2800 }, { "epoch": 1.8743571518166995, "grad_norm": 5.917047023773193, "learning_rate": 1.8540497460537466e-05, "loss": 2.6551, "step": 2801 }, { "epoch": 1.8750261320399715, "grad_norm": 5.449909210205078, "learning_rate": 1.852168641306198e-05, "loss": 2.6342, "step": 2802 }, { "epoch": 1.8756951122632437, "grad_norm": 6.341294288635254, "learning_rate": 1.8502879296606426e-05, "loss": 2.7602, "step": 2803 }, { "epoch": 1.876364092486516, "grad_norm": 6.9685750007629395, "learning_rate": 1.848407612258291e-05, "loss": 2.7687, "step": 2804 }, { "epoch": 1.877033072709788, "grad_norm": 4.044394493103027, "learning_rate": 1.8465276902401114e-05, "loss": 2.4774, "step": 2805 }, { "epoch": 1.8777020529330601, "grad_norm": 7.276406288146973, "learning_rate": 1.844648164746837e-05, "loss": 2.711, "step": 2806 }, { "epoch": 1.8783710331563324, "grad_norm": 5.422701358795166, "learning_rate": 1.8427690369189572e-05, "loss": 2.9431, "step": 2807 }, { "epoch": 1.8790400133796044, "grad_norm": 4.879177570343018, "learning_rate": 1.8408903078967202e-05, "loss": 2.445, "step": 2808 }, { "epoch": 1.8797089936028766, "grad_norm": 4.497944355010986, "learning_rate": 1.8390119788201322e-05, "loss": 2.5336, "step": 2809 }, { "epoch": 1.8803779738261488, "grad_norm": 4.598652362823486, "learning_rate": 1.8371340508289592e-05, "loss": 2.5773, "step": 2810 }, { "epoch": 1.8810469540494208, "grad_norm": 4.848037242889404, "learning_rate": 1.83525652506272e-05, "loss": 2.7507, "step": 2811 }, { "epoch": 1.881715934272693, "grad_norm": 4.2268805503845215, "learning_rate": 1.8333794026606925e-05, "loss": 2.7592, "step": 2812 }, { "epoch": 1.8823849144959652, "grad_norm": 3.9678916931152344, "learning_rate": 1.831502684761907e-05, "loss": 2.592, "step": 2813 }, { "epoch": 1.8830538947192372, "grad_norm": 6.395400524139404, "learning_rate": 1.829626372505152e-05, "loss": 2.8128, "step": 2814 }, { "epoch": 1.8837228749425097, "grad_norm": 5.863958358764648, "learning_rate": 1.8277504670289663e-05, "loss": 2.619, "step": 2815 }, { "epoch": 1.8843918551657817, "grad_norm": 5.843434810638428, "learning_rate": 1.8258749694716443e-05, "loss": 2.7406, "step": 2816 }, { "epoch": 1.8850608353890537, "grad_norm": 6.783684253692627, "learning_rate": 1.8239998809712302e-05, "loss": 2.4084, "step": 2817 }, { "epoch": 1.885729815612326, "grad_norm": 5.074789047241211, "learning_rate": 1.822125202665524e-05, "loss": 2.6946, "step": 2818 }, { "epoch": 1.886398795835598, "grad_norm": 5.649902820587158, "learning_rate": 1.8202509356920726e-05, "loss": 2.5995, "step": 2819 }, { "epoch": 1.88706777605887, "grad_norm": 6.75178861618042, "learning_rate": 1.8183770811881766e-05, "loss": 2.523, "step": 2820 }, { "epoch": 1.8877367562821425, "grad_norm": 5.733412742614746, "learning_rate": 1.816503640290883e-05, "loss": 2.6101, "step": 2821 }, { "epoch": 1.8884057365054145, "grad_norm": 5.465363025665283, "learning_rate": 1.814630614136993e-05, "loss": 2.7557, "step": 2822 }, { "epoch": 1.8890747167286865, "grad_norm": 5.500278472900391, "learning_rate": 1.8127580038630487e-05, "loss": 2.8996, "step": 2823 }, { "epoch": 1.889743696951959, "grad_norm": 4.765371799468994, "learning_rate": 1.810885810605348e-05, "loss": 2.6213, "step": 2824 }, { "epoch": 1.890412677175231, "grad_norm": 4.7066264152526855, "learning_rate": 1.8090140354999285e-05, "loss": 2.8025, "step": 2825 }, { "epoch": 1.8910816573985032, "grad_norm": 4.957799911499023, "learning_rate": 1.8071426796825797e-05, "loss": 2.6934, "step": 2826 }, { "epoch": 1.8917506376217754, "grad_norm": 4.18173360824585, "learning_rate": 1.8052717442888324e-05, "loss": 2.453, "step": 2827 }, { "epoch": 1.8924196178450474, "grad_norm": 4.871091842651367, "learning_rate": 1.8034012304539664e-05, "loss": 2.7906, "step": 2828 }, { "epoch": 1.8930885980683196, "grad_norm": 5.089080810546875, "learning_rate": 1.8015311393130014e-05, "loss": 2.5184, "step": 2829 }, { "epoch": 1.8937575782915919, "grad_norm": 5.239760875701904, "learning_rate": 1.7996614720007043e-05, "loss": 2.72, "step": 2830 }, { "epoch": 1.8944265585148639, "grad_norm": 5.64327335357666, "learning_rate": 1.7977922296515816e-05, "loss": 2.7463, "step": 2831 }, { "epoch": 1.895095538738136, "grad_norm": 5.543586254119873, "learning_rate": 1.7959234133998853e-05, "loss": 2.8013, "step": 2832 }, { "epoch": 1.8957645189614083, "grad_norm": 4.5046820640563965, "learning_rate": 1.794055024379606e-05, "loss": 2.7221, "step": 2833 }, { "epoch": 1.8964334991846803, "grad_norm": 4.879184722900391, "learning_rate": 1.792187063724477e-05, "loss": 2.7548, "step": 2834 }, { "epoch": 1.8971024794079525, "grad_norm": 5.551717281341553, "learning_rate": 1.790319532567969e-05, "loss": 2.9741, "step": 2835 }, { "epoch": 1.8977714596312247, "grad_norm": 6.884735584259033, "learning_rate": 1.7884524320432967e-05, "loss": 2.783, "step": 2836 }, { "epoch": 1.8984404398544967, "grad_norm": 5.557075023651123, "learning_rate": 1.7865857632834087e-05, "loss": 2.7484, "step": 2837 }, { "epoch": 1.899109420077769, "grad_norm": 4.404065132141113, "learning_rate": 1.7847195274209946e-05, "loss": 2.8067, "step": 2838 }, { "epoch": 1.8997784003010412, "grad_norm": 5.520792484283447, "learning_rate": 1.7828537255884793e-05, "loss": 2.8406, "step": 2839 }, { "epoch": 1.9004473805243132, "grad_norm": 7.45339822769165, "learning_rate": 1.7809883589180266e-05, "loss": 2.6373, "step": 2840 }, { "epoch": 1.9011163607475854, "grad_norm": 5.119688987731934, "learning_rate": 1.779123428541534e-05, "loss": 2.7223, "step": 2841 }, { "epoch": 1.9017853409708576, "grad_norm": 4.976303577423096, "learning_rate": 1.777258935590636e-05, "loss": 2.7959, "step": 2842 }, { "epoch": 1.9024543211941296, "grad_norm": 6.238039493560791, "learning_rate": 1.7753948811967004e-05, "loss": 2.799, "step": 2843 }, { "epoch": 1.9031233014174018, "grad_norm": 6.58221960067749, "learning_rate": 1.7735312664908306e-05, "loss": 2.6526, "step": 2844 }, { "epoch": 1.903792281640674, "grad_norm": 5.450275897979736, "learning_rate": 1.7716680926038598e-05, "loss": 2.8581, "step": 2845 }, { "epoch": 1.904461261863946, "grad_norm": 5.318499565124512, "learning_rate": 1.7698053606663585e-05, "loss": 2.5682, "step": 2846 }, { "epoch": 1.9051302420872183, "grad_norm": 5.3904194831848145, "learning_rate": 1.7679430718086243e-05, "loss": 2.5502, "step": 2847 }, { "epoch": 1.9057992223104905, "grad_norm": 6.697218418121338, "learning_rate": 1.7660812271606896e-05, "loss": 2.8885, "step": 2848 }, { "epoch": 1.9064682025337625, "grad_norm": 3.4873507022857666, "learning_rate": 1.764219827852315e-05, "loss": 2.3108, "step": 2849 }, { "epoch": 1.9071371827570347, "grad_norm": 6.173823833465576, "learning_rate": 1.762358875012992e-05, "loss": 2.8236, "step": 2850 }, { "epoch": 1.907806162980307, "grad_norm": 6.896760940551758, "learning_rate": 1.7604983697719408e-05, "loss": 2.75, "step": 2851 }, { "epoch": 1.908475143203579, "grad_norm": 4.93982458114624, "learning_rate": 1.75863831325811e-05, "loss": 2.6241, "step": 2852 }, { "epoch": 1.9091441234268514, "grad_norm": 2.976346492767334, "learning_rate": 1.7567787066001752e-05, "loss": 3.0217, "step": 2853 }, { "epoch": 1.9098131036501234, "grad_norm": 6.784980297088623, "learning_rate": 1.7549195509265408e-05, "loss": 2.7611, "step": 2854 }, { "epoch": 1.9104820838733954, "grad_norm": 4.974246025085449, "learning_rate": 1.7530608473653367e-05, "loss": 2.6508, "step": 2855 }, { "epoch": 1.9111510640966678, "grad_norm": 5.458065986633301, "learning_rate": 1.7512025970444173e-05, "loss": 2.7313, "step": 2856 }, { "epoch": 1.9118200443199398, "grad_norm": 5.316771507263184, "learning_rate": 1.7493448010913625e-05, "loss": 2.5354, "step": 2857 }, { "epoch": 1.9124890245432118, "grad_norm": 4.226136684417725, "learning_rate": 1.747487460633479e-05, "loss": 2.5317, "step": 2858 }, { "epoch": 1.9131580047664842, "grad_norm": 3.920102834701538, "learning_rate": 1.745630576797793e-05, "loss": 2.621, "step": 2859 }, { "epoch": 1.9138269849897562, "grad_norm": 4.785923480987549, "learning_rate": 1.743774150711057e-05, "loss": 2.5633, "step": 2860 }, { "epoch": 1.9144959652130282, "grad_norm": 4.950382709503174, "learning_rate": 1.7419181834997435e-05, "loss": 2.4897, "step": 2861 }, { "epoch": 1.9151649454363007, "grad_norm": 5.032035827636719, "learning_rate": 1.740062676290048e-05, "loss": 2.8504, "step": 2862 }, { "epoch": 1.9158339256595727, "grad_norm": 5.08629035949707, "learning_rate": 1.738207630207886e-05, "loss": 2.7167, "step": 2863 }, { "epoch": 1.9165029058828449, "grad_norm": 6.729982852935791, "learning_rate": 1.736353046378894e-05, "loss": 3.1568, "step": 2864 }, { "epoch": 1.917171886106117, "grad_norm": 4.999177932739258, "learning_rate": 1.7344989259284267e-05, "loss": 2.5559, "step": 2865 }, { "epoch": 1.917840866329389, "grad_norm": 5.187335014343262, "learning_rate": 1.7326452699815602e-05, "loss": 2.8355, "step": 2866 }, { "epoch": 1.9185098465526613, "grad_norm": 6.359440803527832, "learning_rate": 1.730792079663084e-05, "loss": 2.5979, "step": 2867 }, { "epoch": 1.9191788267759335, "grad_norm": 6.776169300079346, "learning_rate": 1.7289393560975113e-05, "loss": 2.8677, "step": 2868 }, { "epoch": 1.9198478069992055, "grad_norm": 8.18869400024414, "learning_rate": 1.7270871004090663e-05, "loss": 2.8048, "step": 2869 }, { "epoch": 1.9205167872224778, "grad_norm": 5.368067741394043, "learning_rate": 1.7252353137216938e-05, "loss": 2.7922, "step": 2870 }, { "epoch": 1.92118576744575, "grad_norm": 4.105527877807617, "learning_rate": 1.72338399715905e-05, "loss": 2.6717, "step": 2871 }, { "epoch": 1.921854747669022, "grad_norm": 5.149494647979736, "learning_rate": 1.7215331518445095e-05, "loss": 2.5812, "step": 2872 }, { "epoch": 1.9225237278922942, "grad_norm": 5.233162879943848, "learning_rate": 1.7196827789011585e-05, "loss": 2.6188, "step": 2873 }, { "epoch": 1.9231927081155664, "grad_norm": 5.364161014556885, "learning_rate": 1.7178328794517983e-05, "loss": 2.7196, "step": 2874 }, { "epoch": 1.9238616883388384, "grad_norm": 6.9434003829956055, "learning_rate": 1.71598345461894e-05, "loss": 2.8359, "step": 2875 }, { "epoch": 1.9245306685621106, "grad_norm": 5.348224639892578, "learning_rate": 1.7141345055248108e-05, "loss": 2.5852, "step": 2876 }, { "epoch": 1.9251996487853829, "grad_norm": 5.631481647491455, "learning_rate": 1.712286033291346e-05, "loss": 3.0714, "step": 2877 }, { "epoch": 1.9258686290086549, "grad_norm": 5.012246131896973, "learning_rate": 1.7104380390401938e-05, "loss": 2.6516, "step": 2878 }, { "epoch": 1.926537609231927, "grad_norm": 4.643505573272705, "learning_rate": 1.7085905238927085e-05, "loss": 2.8997, "step": 2879 }, { "epoch": 1.9272065894551993, "grad_norm": 5.0478057861328125, "learning_rate": 1.7067434889699598e-05, "loss": 2.5784, "step": 2880 }, { "epoch": 1.9278755696784713, "grad_norm": 4.843696117401123, "learning_rate": 1.7048969353927195e-05, "loss": 2.5817, "step": 2881 }, { "epoch": 1.9285445499017435, "grad_norm": 4.8132710456848145, "learning_rate": 1.703050864281473e-05, "loss": 2.6692, "step": 2882 }, { "epoch": 1.9292135301250157, "grad_norm": 5.918645858764648, "learning_rate": 1.701205276756408e-05, "loss": 2.5419, "step": 2883 }, { "epoch": 1.9298825103482877, "grad_norm": 5.2134833335876465, "learning_rate": 1.699360173937423e-05, "loss": 2.7145, "step": 2884 }, { "epoch": 1.93055149057156, "grad_norm": 5.388915538787842, "learning_rate": 1.6975155569441192e-05, "loss": 2.7019, "step": 2885 }, { "epoch": 1.9312204707948322, "grad_norm": 5.895727634429932, "learning_rate": 1.6956714268958058e-05, "loss": 2.503, "step": 2886 }, { "epoch": 1.9318894510181042, "grad_norm": 4.27496862411499, "learning_rate": 1.6938277849114928e-05, "loss": 2.6144, "step": 2887 }, { "epoch": 1.9325584312413764, "grad_norm": 6.133200645446777, "learning_rate": 1.6919846321098982e-05, "loss": 2.8895, "step": 2888 }, { "epoch": 1.9332274114646486, "grad_norm": 5.87167501449585, "learning_rate": 1.6901419696094396e-05, "loss": 2.9797, "step": 2889 }, { "epoch": 1.9338963916879206, "grad_norm": 7.774203777313232, "learning_rate": 1.6882997985282405e-05, "loss": 2.548, "step": 2890 }, { "epoch": 1.9345653719111928, "grad_norm": 4.060307025909424, "learning_rate": 1.6864581199841228e-05, "loss": 2.7134, "step": 2891 }, { "epoch": 1.935234352134465, "grad_norm": 5.368431568145752, "learning_rate": 1.684616935094613e-05, "loss": 2.4984, "step": 2892 }, { "epoch": 1.935903332357737, "grad_norm": 4.65740442276001, "learning_rate": 1.682776244976933e-05, "loss": 2.7124, "step": 2893 }, { "epoch": 1.9365723125810095, "grad_norm": 4.4884843826293945, "learning_rate": 1.680936050748011e-05, "loss": 2.649, "step": 2894 }, { "epoch": 1.9372412928042815, "grad_norm": 7.604023456573486, "learning_rate": 1.6790963535244698e-05, "loss": 2.4289, "step": 2895 }, { "epoch": 1.9379102730275535, "grad_norm": 5.809898376464844, "learning_rate": 1.6772571544226312e-05, "loss": 2.8822, "step": 2896 }, { "epoch": 1.938579253250826, "grad_norm": 5.104942321777344, "learning_rate": 1.6754184545585152e-05, "loss": 2.7296, "step": 2897 }, { "epoch": 1.939248233474098, "grad_norm": 3.730175495147705, "learning_rate": 1.6735802550478407e-05, "loss": 2.4977, "step": 2898 }, { "epoch": 1.93991721369737, "grad_norm": 5.256652355194092, "learning_rate": 1.6717425570060192e-05, "loss": 2.6609, "step": 2899 }, { "epoch": 1.9405861939206424, "grad_norm": 7.092987060546875, "learning_rate": 1.6699053615481616e-05, "loss": 2.7562, "step": 2900 }, { "epoch": 1.9412551741439144, "grad_norm": 6.14080810546875, "learning_rate": 1.6680686697890706e-05, "loss": 2.9019, "step": 2901 }, { "epoch": 1.9419241543671866, "grad_norm": 3.7073490619659424, "learning_rate": 1.6662324828432467e-05, "loss": 2.3301, "step": 2902 }, { "epoch": 1.9425931345904588, "grad_norm": 4.069120407104492, "learning_rate": 1.6643968018248808e-05, "loss": 2.5039, "step": 2903 }, { "epoch": 1.9432621148137308, "grad_norm": 5.932475566864014, "learning_rate": 1.662561627847859e-05, "loss": 2.5813, "step": 2904 }, { "epoch": 1.943931095037003, "grad_norm": 4.354176044464111, "learning_rate": 1.6607269620257583e-05, "loss": 2.5891, "step": 2905 }, { "epoch": 1.9446000752602752, "grad_norm": 5.194872856140137, "learning_rate": 1.6588928054718494e-05, "loss": 3.0809, "step": 2906 }, { "epoch": 1.9452690554835472, "grad_norm": 6.054746627807617, "learning_rate": 1.6570591592990913e-05, "loss": 2.9886, "step": 2907 }, { "epoch": 1.9459380357068194, "grad_norm": 3.42716383934021, "learning_rate": 1.6552260246201352e-05, "loss": 2.6991, "step": 2908 }, { "epoch": 1.9466070159300917, "grad_norm": 4.277167320251465, "learning_rate": 1.6533934025473212e-05, "loss": 2.5809, "step": 2909 }, { "epoch": 1.9472759961533637, "grad_norm": 6.264553546905518, "learning_rate": 1.6515612941926788e-05, "loss": 2.94, "step": 2910 }, { "epoch": 1.9479449763766359, "grad_norm": 6.1438517570495605, "learning_rate": 1.649729700667924e-05, "loss": 2.3862, "step": 2911 }, { "epoch": 1.948613956599908, "grad_norm": 5.2817301750183105, "learning_rate": 1.6478986230844645e-05, "loss": 2.5003, "step": 2912 }, { "epoch": 1.94928293682318, "grad_norm": 4.638598442077637, "learning_rate": 1.6460680625533904e-05, "loss": 2.4736, "step": 2913 }, { "epoch": 1.9499519170464523, "grad_norm": 3.7549145221710205, "learning_rate": 1.644238020185481e-05, "loss": 2.7409, "step": 2914 }, { "epoch": 1.9506208972697245, "grad_norm": 4.2682952880859375, "learning_rate": 1.6424084970911984e-05, "loss": 2.5337, "step": 2915 }, { "epoch": 1.9512898774929965, "grad_norm": 4.557698726654053, "learning_rate": 1.6405794943806934e-05, "loss": 2.5818, "step": 2916 }, { "epoch": 1.9519588577162688, "grad_norm": 6.678733825683594, "learning_rate": 1.6387510131637982e-05, "loss": 2.7797, "step": 2917 }, { "epoch": 1.952627837939541, "grad_norm": 5.692575931549072, "learning_rate": 1.63692305455003e-05, "loss": 2.7032, "step": 2918 }, { "epoch": 1.953296818162813, "grad_norm": 5.498258590698242, "learning_rate": 1.6350956196485856e-05, "loss": 2.7007, "step": 2919 }, { "epoch": 1.9539657983860852, "grad_norm": 4.76276159286499, "learning_rate": 1.6332687095683503e-05, "loss": 2.7379, "step": 2920 }, { "epoch": 1.9546347786093574, "grad_norm": 6.434892654418945, "learning_rate": 1.6314423254178847e-05, "loss": 2.6362, "step": 2921 }, { "epoch": 1.9553037588326294, "grad_norm": 9.089195251464844, "learning_rate": 1.6296164683054345e-05, "loss": 2.9048, "step": 2922 }, { "epoch": 1.9559727390559016, "grad_norm": 6.396719932556152, "learning_rate": 1.6277911393389218e-05, "loss": 2.7807, "step": 2923 }, { "epoch": 1.9566417192791739, "grad_norm": 4.648181915283203, "learning_rate": 1.6259663396259528e-05, "loss": 3.1269, "step": 2924 }, { "epoch": 1.9573106995024458, "grad_norm": 5.44894552230835, "learning_rate": 1.6241420702738088e-05, "loss": 2.6917, "step": 2925 }, { "epoch": 1.957979679725718, "grad_norm": 8.427546501159668, "learning_rate": 1.622318332389451e-05, "loss": 3.0648, "step": 2926 }, { "epoch": 1.9586486599489903, "grad_norm": 5.089845657348633, "learning_rate": 1.620495127079517e-05, "loss": 2.6871, "step": 2927 }, { "epoch": 1.9593176401722623, "grad_norm": 5.062190532684326, "learning_rate": 1.618672455450324e-05, "loss": 2.7424, "step": 2928 }, { "epoch": 1.9599866203955345, "grad_norm": 4.900814533233643, "learning_rate": 1.6168503186078598e-05, "loss": 2.5565, "step": 2929 }, { "epoch": 1.9606556006188067, "grad_norm": 5.936448097229004, "learning_rate": 1.6150287176577948e-05, "loss": 2.6673, "step": 2930 }, { "epoch": 1.9613245808420787, "grad_norm": 7.32464599609375, "learning_rate": 1.613207653705468e-05, "loss": 3.0643, "step": 2931 }, { "epoch": 1.9619935610653512, "grad_norm": 5.269471645355225, "learning_rate": 1.6113871278558974e-05, "loss": 2.6852, "step": 2932 }, { "epoch": 1.9626625412886232, "grad_norm": 4.94673490524292, "learning_rate": 1.6095671412137694e-05, "loss": 2.6746, "step": 2933 }, { "epoch": 1.9633315215118952, "grad_norm": 4.557250022888184, "learning_rate": 1.607747694883449e-05, "loss": 2.6577, "step": 2934 }, { "epoch": 1.9640005017351676, "grad_norm": 3.836230993270874, "learning_rate": 1.6059287899689684e-05, "loss": 2.6077, "step": 2935 }, { "epoch": 1.9646694819584396, "grad_norm": 5.7456278800964355, "learning_rate": 1.604110427574035e-05, "loss": 2.7519, "step": 2936 }, { "epoch": 1.9653384621817116, "grad_norm": 4.406337261199951, "learning_rate": 1.6022926088020228e-05, "loss": 2.7272, "step": 2937 }, { "epoch": 1.966007442404984, "grad_norm": 4.271851539611816, "learning_rate": 1.6004753347559808e-05, "loss": 2.588, "step": 2938 }, { "epoch": 1.966676422628256, "grad_norm": 4.202340602874756, "learning_rate": 1.5986586065386243e-05, "loss": 2.5318, "step": 2939 }, { "epoch": 1.967345402851528, "grad_norm": 4.201272010803223, "learning_rate": 1.5968424252523378e-05, "loss": 2.6647, "step": 2940 }, { "epoch": 1.9680143830748005, "grad_norm": 4.757193088531494, "learning_rate": 1.595026791999174e-05, "loss": 2.6625, "step": 2941 }, { "epoch": 1.9686833632980725, "grad_norm": 4.6923112869262695, "learning_rate": 1.5932117078808544e-05, "loss": 2.7788, "step": 2942 }, { "epoch": 1.9693523435213447, "grad_norm": 5.724450588226318, "learning_rate": 1.5913971739987655e-05, "loss": 2.7372, "step": 2943 }, { "epoch": 1.970021323744617, "grad_norm": 6.370497703552246, "learning_rate": 1.5895831914539616e-05, "loss": 2.6868, "step": 2944 }, { "epoch": 1.970690303967889, "grad_norm": 5.791628360748291, "learning_rate": 1.587769761347159e-05, "loss": 2.8062, "step": 2945 }, { "epoch": 1.9713592841911611, "grad_norm": 5.352672576904297, "learning_rate": 1.585956884778745e-05, "loss": 2.7399, "step": 2946 }, { "epoch": 1.9720282644144334, "grad_norm": 8.139093399047852, "learning_rate": 1.584144562848764e-05, "loss": 3.0253, "step": 2947 }, { "epoch": 1.9726972446377053, "grad_norm": 6.4604411125183105, "learning_rate": 1.582332796656929e-05, "loss": 2.879, "step": 2948 }, { "epoch": 1.9733662248609776, "grad_norm": 4.963821887969971, "learning_rate": 1.5805215873026125e-05, "loss": 2.6953, "step": 2949 }, { "epoch": 1.9740352050842498, "grad_norm": 4.356338024139404, "learning_rate": 1.5787109358848528e-05, "loss": 2.6261, "step": 2950 }, { "epoch": 1.9747041853075218, "grad_norm": 4.943756103515625, "learning_rate": 1.5769008435023446e-05, "loss": 2.6896, "step": 2951 }, { "epoch": 1.975373165530794, "grad_norm": 7.068074703216553, "learning_rate": 1.575091311253448e-05, "loss": 2.886, "step": 2952 }, { "epoch": 1.9760421457540662, "grad_norm": 4.81316614151001, "learning_rate": 1.573282340236181e-05, "loss": 2.5914, "step": 2953 }, { "epoch": 1.9767111259773382, "grad_norm": 5.508342742919922, "learning_rate": 1.571473931548222e-05, "loss": 2.5439, "step": 2954 }, { "epoch": 1.9773801062006104, "grad_norm": 7.063830375671387, "learning_rate": 1.5696660862869057e-05, "loss": 2.738, "step": 2955 }, { "epoch": 1.9780490864238827, "grad_norm": 7.030398845672607, "learning_rate": 1.567858805549229e-05, "loss": 2.8256, "step": 2956 }, { "epoch": 1.9787180666471547, "grad_norm": 5.586159706115723, "learning_rate": 1.5660520904318424e-05, "loss": 2.6369, "step": 2957 }, { "epoch": 1.9793870468704269, "grad_norm": 5.897188663482666, "learning_rate": 1.564245942031056e-05, "loss": 2.6623, "step": 2958 }, { "epoch": 1.980056027093699, "grad_norm": 5.793785095214844, "learning_rate": 1.5624403614428336e-05, "loss": 2.8337, "step": 2959 }, { "epoch": 1.980725007316971, "grad_norm": 6.084945201873779, "learning_rate": 1.5606353497627972e-05, "loss": 2.88, "step": 2960 }, { "epoch": 1.9813939875402433, "grad_norm": 7.513318061828613, "learning_rate": 1.5588309080862216e-05, "loss": 2.883, "step": 2961 }, { "epoch": 1.9820629677635155, "grad_norm": 5.256368637084961, "learning_rate": 1.5570270375080362e-05, "loss": 2.8528, "step": 2962 }, { "epoch": 1.9827319479867875, "grad_norm": 5.060674667358398, "learning_rate": 1.5552237391228226e-05, "loss": 2.6451, "step": 2963 }, { "epoch": 1.9834009282100598, "grad_norm": 4.594638347625732, "learning_rate": 1.553421014024819e-05, "loss": 2.7623, "step": 2964 }, { "epoch": 1.984069908433332, "grad_norm": 4.919723033905029, "learning_rate": 1.551618863307911e-05, "loss": 2.6493, "step": 2965 }, { "epoch": 1.984738888656604, "grad_norm": 5.47368049621582, "learning_rate": 1.549817288065639e-05, "loss": 2.8822, "step": 2966 }, { "epoch": 1.9854078688798762, "grad_norm": 5.195687770843506, "learning_rate": 1.5480162893911922e-05, "loss": 2.6127, "step": 2967 }, { "epoch": 1.9860768491031484, "grad_norm": 4.345044136047363, "learning_rate": 1.5462158683774124e-05, "loss": 2.6003, "step": 2968 }, { "epoch": 1.9867458293264204, "grad_norm": 6.155688285827637, "learning_rate": 1.544416026116788e-05, "loss": 2.831, "step": 2969 }, { "epoch": 1.9874148095496929, "grad_norm": 5.565713405609131, "learning_rate": 1.5426167637014582e-05, "loss": 2.5097, "step": 2970 }, { "epoch": 1.9880837897729648, "grad_norm": 6.924679279327393, "learning_rate": 1.5408180822232088e-05, "loss": 3.001, "step": 2971 }, { "epoch": 1.9887527699962368, "grad_norm": 5.707123279571533, "learning_rate": 1.5390199827734746e-05, "loss": 2.6601, "step": 2972 }, { "epoch": 1.9894217502195093, "grad_norm": 3.676952838897705, "learning_rate": 1.537222466443336e-05, "loss": 2.5597, "step": 2973 }, { "epoch": 1.9900907304427813, "grad_norm": 7.1246867179870605, "learning_rate": 1.5354255343235216e-05, "loss": 2.8476, "step": 2974 }, { "epoch": 1.9907597106660533, "grad_norm": 5.087657451629639, "learning_rate": 1.5336291875044025e-05, "loss": 2.7412, "step": 2975 }, { "epoch": 1.9914286908893257, "grad_norm": 6.350460529327393, "learning_rate": 1.5318334270759972e-05, "loss": 2.6847, "step": 2976 }, { "epoch": 1.9920976711125977, "grad_norm": 5.982015132904053, "learning_rate": 1.5300382541279658e-05, "loss": 2.8279, "step": 2977 }, { "epoch": 1.9927666513358697, "grad_norm": 4.596728801727295, "learning_rate": 1.528243669749616e-05, "loss": 2.6684, "step": 2978 }, { "epoch": 1.9934356315591422, "grad_norm": 4.809084415435791, "learning_rate": 1.526449675029894e-05, "loss": 2.8206, "step": 2979 }, { "epoch": 1.9941046117824142, "grad_norm": 5.641161918640137, "learning_rate": 1.5246562710573908e-05, "loss": 2.9301, "step": 2980 }, { "epoch": 1.9947735920056864, "grad_norm": 9.615150451660156, "learning_rate": 1.5228634589203367e-05, "loss": 3.0272, "step": 2981 }, { "epoch": 1.9954425722289586, "grad_norm": 5.482935905456543, "learning_rate": 1.521071239706607e-05, "loss": 2.9605, "step": 2982 }, { "epoch": 1.9961115524522306, "grad_norm": 6.808692932128906, "learning_rate": 1.5192796145037125e-05, "loss": 2.6895, "step": 2983 }, { "epoch": 1.9967805326755028, "grad_norm": 5.924152851104736, "learning_rate": 1.5174885843988069e-05, "loss": 2.6848, "step": 2984 }, { "epoch": 1.997449512898775, "grad_norm": 5.037985324859619, "learning_rate": 1.5156981504786798e-05, "loss": 2.6335, "step": 2985 }, { "epoch": 1.998118493122047, "grad_norm": 7.344898700714111, "learning_rate": 1.5139083138297633e-05, "loss": 3.1455, "step": 2986 }, { "epoch": 1.9987874733453193, "grad_norm": 5.326211929321289, "learning_rate": 1.512119075538122e-05, "loss": 2.7672, "step": 2987 }, { "epoch": 1.9994564535685915, "grad_norm": 4.72623348236084, "learning_rate": 1.5103304366894622e-05, "loss": 2.7863, "step": 2988 }, { "epoch": 2.000668980223272, "grad_norm": 12.768083572387695, "learning_rate": 1.508542398369122e-05, "loss": 5.1041, "step": 2989 }, { "epoch": 2.0013379604465444, "grad_norm": 6.635680675506592, "learning_rate": 1.506754961662079e-05, "loss": 2.663, "step": 2990 }, { "epoch": 2.0020069406698164, "grad_norm": 6.694709300994873, "learning_rate": 1.5049681276529437e-05, "loss": 2.4626, "step": 2991 }, { "epoch": 2.0026759208930884, "grad_norm": 4.822940349578857, "learning_rate": 1.503181897425961e-05, "loss": 2.4768, "step": 2992 }, { "epoch": 2.003344901116361, "grad_norm": 5.400036334991455, "learning_rate": 1.5013962720650095e-05, "loss": 2.3476, "step": 2993 }, { "epoch": 2.004013881339633, "grad_norm": 6.009514331817627, "learning_rate": 1.4996112526536019e-05, "loss": 2.4499, "step": 2994 }, { "epoch": 2.004682861562905, "grad_norm": 5.883105754852295, "learning_rate": 1.4978268402748802e-05, "loss": 2.5064, "step": 2995 }, { "epoch": 2.0053518417861773, "grad_norm": 5.894524574279785, "learning_rate": 1.4960430360116229e-05, "loss": 2.4421, "step": 2996 }, { "epoch": 2.0060208220094493, "grad_norm": 5.011678218841553, "learning_rate": 1.4942598409462343e-05, "loss": 2.2602, "step": 2997 }, { "epoch": 2.0066898022327213, "grad_norm": 3.9589545726776123, "learning_rate": 1.4924772561607537e-05, "loss": 2.3687, "step": 2998 }, { "epoch": 2.0073587824559938, "grad_norm": 5.826911449432373, "learning_rate": 1.4906952827368447e-05, "loss": 2.766, "step": 2999 }, { "epoch": 2.0080277626792657, "grad_norm": 6.351194858551025, "learning_rate": 1.4889139217558066e-05, "loss": 2.0096, "step": 3000 } ], "logging_steps": 1, "max_steps": 4482, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.817208589346351e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }