diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,31408 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9994564535685915, + "eval_steps": 500, + "global_step": 4482, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006689802232721495, + "grad_norm": 1.7879213094711304, + "learning_rate": 1.11358574610245e-07, + "loss": 3.5222, + "step": 1 + }, + { + "epoch": 0.001337960446544299, + "grad_norm": 1.4433249235153198, + "learning_rate": 2.2271714922049e-07, + "loss": 3.6449, + "step": 2 + }, + { + "epoch": 0.0020069406698164484, + "grad_norm": 1.370274543762207, + "learning_rate": 3.34075723830735e-07, + "loss": 3.4489, + "step": 3 + }, + { + "epoch": 0.002675920893088598, + "grad_norm": 1.2992745637893677, + "learning_rate": 4.4543429844098e-07, + "loss": 3.623, + "step": 4 + }, + { + "epoch": 0.0033449011163607474, + "grad_norm": 1.18953275680542, + "learning_rate": 5.567928730512249e-07, + "loss": 3.2437, + "step": 5 + }, + { + "epoch": 0.004013881339632897, + "grad_norm": 1.7558338642120361, + "learning_rate": 6.6815144766147e-07, + "loss": 3.8406, + "step": 6 + }, + { + "epoch": 0.004682861562905046, + "grad_norm": 1.383160948753357, + "learning_rate": 7.79510022271715e-07, + "loss": 3.4969, + "step": 7 + }, + { + "epoch": 0.005351841786177196, + "grad_norm": 1.0541542768478394, + "learning_rate": 8.9086859688196e-07, + "loss": 3.3137, + "step": 8 + }, + { + "epoch": 0.006020822009449345, + "grad_norm": 1.486899733543396, + "learning_rate": 1.002227171492205e-06, + "loss": 3.6094, + "step": 9 + }, + { + "epoch": 0.006689802232721495, + "grad_norm": 1.3455109596252441, + "learning_rate": 1.1135857461024499e-06, + "loss": 3.3945, + "step": 10 + }, + { + "epoch": 0.007358782455993645, + "grad_norm": 1.5229488611221313, + "learning_rate": 1.224944320712695e-06, + "loss": 3.379, + "step": 11 + }, + { + "epoch": 0.008027762679265794, + "grad_norm": 1.6204516887664795, + "learning_rate": 1.33630289532294e-06, + "loss": 3.4507, + "step": 12 + }, + { + "epoch": 0.008696742902537944, + "grad_norm": 1.4001444578170776, + "learning_rate": 1.447661469933185e-06, + "loss": 3.3992, + "step": 13 + }, + { + "epoch": 0.009365723125810093, + "grad_norm": 1.6883145570755005, + "learning_rate": 1.55902004454343e-06, + "loss": 3.6524, + "step": 14 + }, + { + "epoch": 0.010034703349082243, + "grad_norm": 1.1602007150650024, + "learning_rate": 1.670378619153675e-06, + "loss": 3.4021, + "step": 15 + }, + { + "epoch": 0.010703683572354392, + "grad_norm": 1.2480249404907227, + "learning_rate": 1.78173719376392e-06, + "loss": 3.4262, + "step": 16 + }, + { + "epoch": 0.011372663795626542, + "grad_norm": 1.0981214046478271, + "learning_rate": 1.893095768374165e-06, + "loss": 3.3911, + "step": 17 + }, + { + "epoch": 0.01204164401889869, + "grad_norm": 1.5630245208740234, + "learning_rate": 2.00445434298441e-06, + "loss": 3.7733, + "step": 18 + }, + { + "epoch": 0.012710624242170841, + "grad_norm": 1.4163099527359009, + "learning_rate": 2.1158129175946547e-06, + "loss": 3.544, + "step": 19 + }, + { + "epoch": 0.01337960446544299, + "grad_norm": 1.4298584461212158, + "learning_rate": 2.2271714922048998e-06, + "loss": 3.4907, + "step": 20 + }, + { + "epoch": 0.01404858468871514, + "grad_norm": 1.3225711584091187, + "learning_rate": 2.338530066815145e-06, + "loss": 3.3728, + "step": 21 + }, + { + "epoch": 0.01471756491198729, + "grad_norm": 1.684406042098999, + "learning_rate": 2.44988864142539e-06, + "loss": 3.835, + "step": 22 + }, + { + "epoch": 0.015386545135259439, + "grad_norm": 1.498872995376587, + "learning_rate": 2.561247216035635e-06, + "loss": 3.4506, + "step": 23 + }, + { + "epoch": 0.016055525358531587, + "grad_norm": 1.7191704511642456, + "learning_rate": 2.67260579064588e-06, + "loss": 3.7095, + "step": 24 + }, + { + "epoch": 0.016724505581803738, + "grad_norm": 1.579593539237976, + "learning_rate": 2.783964365256125e-06, + "loss": 3.5933, + "step": 25 + }, + { + "epoch": 0.017393485805075888, + "grad_norm": 1.3628127574920654, + "learning_rate": 2.89532293986637e-06, + "loss": 3.199, + "step": 26 + }, + { + "epoch": 0.01806246602834804, + "grad_norm": 1.6564180850982666, + "learning_rate": 3.006681514476615e-06, + "loss": 3.725, + "step": 27 + }, + { + "epoch": 0.018731446251620185, + "grad_norm": 1.3412981033325195, + "learning_rate": 3.11804008908686e-06, + "loss": 3.5123, + "step": 28 + }, + { + "epoch": 0.019400426474892336, + "grad_norm": 0.9989301562309265, + "learning_rate": 3.229398663697105e-06, + "loss": 2.9673, + "step": 29 + }, + { + "epoch": 0.020069406698164486, + "grad_norm": 1.3338531255722046, + "learning_rate": 3.34075723830735e-06, + "loss": 3.299, + "step": 30 + }, + { + "epoch": 0.020738386921436636, + "grad_norm": 1.562309741973877, + "learning_rate": 3.4521158129175947e-06, + "loss": 3.5201, + "step": 31 + }, + { + "epoch": 0.021407367144708783, + "grad_norm": 1.2774256467819214, + "learning_rate": 3.56347438752784e-06, + "loss": 3.2878, + "step": 32 + }, + { + "epoch": 0.022076347367980934, + "grad_norm": 1.455329418182373, + "learning_rate": 3.674832962138085e-06, + "loss": 3.4257, + "step": 33 + }, + { + "epoch": 0.022745327591253084, + "grad_norm": 1.2751973867416382, + "learning_rate": 3.78619153674833e-06, + "loss": 3.3183, + "step": 34 + }, + { + "epoch": 0.023414307814525234, + "grad_norm": 1.400235652923584, + "learning_rate": 3.897550111358575e-06, + "loss": 3.5279, + "step": 35 + }, + { + "epoch": 0.02408328803779738, + "grad_norm": 1.029577612876892, + "learning_rate": 4.00890868596882e-06, + "loss": 2.774, + "step": 36 + }, + { + "epoch": 0.02475226826106953, + "grad_norm": 1.280173897743225, + "learning_rate": 4.120267260579064e-06, + "loss": 3.4011, + "step": 37 + }, + { + "epoch": 0.025421248484341682, + "grad_norm": 1.3390440940856934, + "learning_rate": 4.231625835189309e-06, + "loss": 3.4696, + "step": 38 + }, + { + "epoch": 0.026090228707613832, + "grad_norm": 1.429360270500183, + "learning_rate": 4.3429844097995545e-06, + "loss": 3.5177, + "step": 39 + }, + { + "epoch": 0.02675920893088598, + "grad_norm": 1.375282645225525, + "learning_rate": 4.4543429844097995e-06, + "loss": 3.5709, + "step": 40 + }, + { + "epoch": 0.02742818915415813, + "grad_norm": 1.241412878036499, + "learning_rate": 4.565701559020045e-06, + "loss": 3.2464, + "step": 41 + }, + { + "epoch": 0.02809716937743028, + "grad_norm": 1.5399926900863647, + "learning_rate": 4.67706013363029e-06, + "loss": 3.2398, + "step": 42 + }, + { + "epoch": 0.02876614960070243, + "grad_norm": 1.683517575263977, + "learning_rate": 4.788418708240535e-06, + "loss": 3.6636, + "step": 43 + }, + { + "epoch": 0.02943512982397458, + "grad_norm": 1.3960689306259155, + "learning_rate": 4.89977728285078e-06, + "loss": 3.3246, + "step": 44 + }, + { + "epoch": 0.030104110047246727, + "grad_norm": 1.0281037092208862, + "learning_rate": 5.011135857461025e-06, + "loss": 3.2595, + "step": 45 + }, + { + "epoch": 0.030773090270518878, + "grad_norm": 1.204046607017517, + "learning_rate": 5.12249443207127e-06, + "loss": 3.1779, + "step": 46 + }, + { + "epoch": 0.031442070493791024, + "grad_norm": 1.2449531555175781, + "learning_rate": 5.233853006681515e-06, + "loss": 3.1104, + "step": 47 + }, + { + "epoch": 0.032111050717063175, + "grad_norm": 1.1790910959243774, + "learning_rate": 5.34521158129176e-06, + "loss": 3.2203, + "step": 48 + }, + { + "epoch": 0.032780030940335325, + "grad_norm": 1.266721248626709, + "learning_rate": 5.456570155902005e-06, + "loss": 3.4279, + "step": 49 + }, + { + "epoch": 0.033449011163607476, + "grad_norm": 1.5345031023025513, + "learning_rate": 5.56792873051225e-06, + "loss": 3.4272, + "step": 50 + }, + { + "epoch": 0.034117991386879626, + "grad_norm": 1.250928521156311, + "learning_rate": 5.6792873051224945e-06, + "loss": 3.124, + "step": 51 + }, + { + "epoch": 0.034786971610151776, + "grad_norm": 1.2841166257858276, + "learning_rate": 5.79064587973274e-06, + "loss": 3.1168, + "step": 52 + }, + { + "epoch": 0.03545595183342393, + "grad_norm": 1.311871886253357, + "learning_rate": 5.902004454342985e-06, + "loss": 3.1856, + "step": 53 + }, + { + "epoch": 0.03612493205669608, + "grad_norm": 1.2422196865081787, + "learning_rate": 6.01336302895323e-06, + "loss": 3.2101, + "step": 54 + }, + { + "epoch": 0.03679391227996822, + "grad_norm": 1.347784399986267, + "learning_rate": 6.124721603563475e-06, + "loss": 3.1658, + "step": 55 + }, + { + "epoch": 0.03746289250324037, + "grad_norm": 1.3649243116378784, + "learning_rate": 6.23608017817372e-06, + "loss": 3.0376, + "step": 56 + }, + { + "epoch": 0.03813187272651252, + "grad_norm": 1.2381495237350464, + "learning_rate": 6.347438752783964e-06, + "loss": 2.7926, + "step": 57 + }, + { + "epoch": 0.03880085294978467, + "grad_norm": 1.3895397186279297, + "learning_rate": 6.45879732739421e-06, + "loss": 3.194, + "step": 58 + }, + { + "epoch": 0.03946983317305682, + "grad_norm": 1.2116481065750122, + "learning_rate": 6.570155902004454e-06, + "loss": 3.1445, + "step": 59 + }, + { + "epoch": 0.04013881339632897, + "grad_norm": 1.3978331089019775, + "learning_rate": 6.6815144766147e-06, + "loss": 3.218, + "step": 60 + }, + { + "epoch": 0.04080779361960112, + "grad_norm": 1.3602263927459717, + "learning_rate": 6.792873051224944e-06, + "loss": 3.2545, + "step": 61 + }, + { + "epoch": 0.04147677384287327, + "grad_norm": 1.4267244338989258, + "learning_rate": 6.9042316258351895e-06, + "loss": 3.353, + "step": 62 + }, + { + "epoch": 0.04214575406614542, + "grad_norm": 1.3509756326675415, + "learning_rate": 7.0155902004454345e-06, + "loss": 3.1955, + "step": 63 + }, + { + "epoch": 0.042814734289417566, + "grad_norm": 1.8327754735946655, + "learning_rate": 7.12694877505568e-06, + "loss": 3.408, + "step": 64 + }, + { + "epoch": 0.04348371451268972, + "grad_norm": 1.5196959972381592, + "learning_rate": 7.238307349665925e-06, + "loss": 3.2611, + "step": 65 + }, + { + "epoch": 0.04415269473596187, + "grad_norm": 1.2716466188430786, + "learning_rate": 7.34966592427617e-06, + "loss": 3.0723, + "step": 66 + }, + { + "epoch": 0.04482167495923402, + "grad_norm": 1.5642995834350586, + "learning_rate": 7.461024498886416e-06, + "loss": 3.2954, + "step": 67 + }, + { + "epoch": 0.04549065518250617, + "grad_norm": 1.3303098678588867, + "learning_rate": 7.57238307349666e-06, + "loss": 3.2496, + "step": 68 + }, + { + "epoch": 0.04615963540577832, + "grad_norm": 1.5286786556243896, + "learning_rate": 7.683741648106903e-06, + "loss": 3.1971, + "step": 69 + }, + { + "epoch": 0.04682861562905047, + "grad_norm": 1.5279144048690796, + "learning_rate": 7.79510022271715e-06, + "loss": 3.0903, + "step": 70 + }, + { + "epoch": 0.04749759585232262, + "grad_norm": 1.0978755950927734, + "learning_rate": 7.906458797327395e-06, + "loss": 2.8862, + "step": 71 + }, + { + "epoch": 0.04816657607559476, + "grad_norm": 1.305240511894226, + "learning_rate": 8.01781737193764e-06, + "loss": 2.9631, + "step": 72 + }, + { + "epoch": 0.04883555629886691, + "grad_norm": 1.7665414810180664, + "learning_rate": 8.129175946547885e-06, + "loss": 3.0723, + "step": 73 + }, + { + "epoch": 0.04950453652213906, + "grad_norm": 1.4403198957443237, + "learning_rate": 8.240534521158129e-06, + "loss": 2.921, + "step": 74 + }, + { + "epoch": 0.05017351674541121, + "grad_norm": 1.5842571258544922, + "learning_rate": 8.351893095768375e-06, + "loss": 3.2412, + "step": 75 + }, + { + "epoch": 0.050842496968683364, + "grad_norm": 1.4527440071105957, + "learning_rate": 8.463251670378619e-06, + "loss": 3.0784, + "step": 76 + }, + { + "epoch": 0.051511477191955514, + "grad_norm": 1.566912055015564, + "learning_rate": 8.574610244988866e-06, + "loss": 3.3487, + "step": 77 + }, + { + "epoch": 0.052180457415227664, + "grad_norm": 1.4142961502075195, + "learning_rate": 8.685968819599109e-06, + "loss": 3.165, + "step": 78 + }, + { + "epoch": 0.052849437638499815, + "grad_norm": 1.6616283655166626, + "learning_rate": 8.797327394209356e-06, + "loss": 2.9588, + "step": 79 + }, + { + "epoch": 0.05351841786177196, + "grad_norm": 1.506828784942627, + "learning_rate": 8.908685968819599e-06, + "loss": 3.0466, + "step": 80 + }, + { + "epoch": 0.05418739808504411, + "grad_norm": 1.2490448951721191, + "learning_rate": 9.020044543429844e-06, + "loss": 2.8133, + "step": 81 + }, + { + "epoch": 0.05485637830831626, + "grad_norm": 1.604492425918579, + "learning_rate": 9.13140311804009e-06, + "loss": 3.0556, + "step": 82 + }, + { + "epoch": 0.05552535853158841, + "grad_norm": 1.5094462633132935, + "learning_rate": 9.242761692650334e-06, + "loss": 3.0852, + "step": 83 + }, + { + "epoch": 0.05619433875486056, + "grad_norm": 1.6594133377075195, + "learning_rate": 9.35412026726058e-06, + "loss": 3.1974, + "step": 84 + }, + { + "epoch": 0.05686331897813271, + "grad_norm": 1.156653881072998, + "learning_rate": 9.465478841870824e-06, + "loss": 2.8347, + "step": 85 + }, + { + "epoch": 0.05753229920140486, + "grad_norm": 1.2661216259002686, + "learning_rate": 9.57683741648107e-06, + "loss": 2.8568, + "step": 86 + }, + { + "epoch": 0.05820127942467701, + "grad_norm": 1.5680220127105713, + "learning_rate": 9.688195991091315e-06, + "loss": 3.3677, + "step": 87 + }, + { + "epoch": 0.05887025964794916, + "grad_norm": 1.2300074100494385, + "learning_rate": 9.79955456570156e-06, + "loss": 2.9659, + "step": 88 + }, + { + "epoch": 0.059539239871221304, + "grad_norm": 1.4609466791152954, + "learning_rate": 9.910913140311805e-06, + "loss": 3.2218, + "step": 89 + }, + { + "epoch": 0.060208220094493455, + "grad_norm": 1.5475412607192993, + "learning_rate": 1.002227171492205e-05, + "loss": 3.0252, + "step": 90 + }, + { + "epoch": 0.060877200317765605, + "grad_norm": 1.712816834449768, + "learning_rate": 1.0133630289532295e-05, + "loss": 3.1354, + "step": 91 + }, + { + "epoch": 0.061546180541037755, + "grad_norm": 1.9765368700027466, + "learning_rate": 1.024498886414254e-05, + "loss": 3.2223, + "step": 92 + }, + { + "epoch": 0.062215160764309906, + "grad_norm": 1.6176987886428833, + "learning_rate": 1.0356347438752785e-05, + "loss": 3.1179, + "step": 93 + }, + { + "epoch": 0.06288414098758205, + "grad_norm": 7.536865711212158, + "learning_rate": 1.046770601336303e-05, + "loss": 3.4484, + "step": 94 + }, + { + "epoch": 0.0635531212108542, + "grad_norm": 1.5932583808898926, + "learning_rate": 1.0579064587973274e-05, + "loss": 2.9496, + "step": 95 + }, + { + "epoch": 0.06422210143412635, + "grad_norm": 1.488038420677185, + "learning_rate": 1.069042316258352e-05, + "loss": 3.119, + "step": 96 + }, + { + "epoch": 0.0648910816573985, + "grad_norm": 1.9568957090377808, + "learning_rate": 1.0801781737193764e-05, + "loss": 3.3372, + "step": 97 + }, + { + "epoch": 0.06556006188067065, + "grad_norm": 1.7145527601242065, + "learning_rate": 1.091314031180401e-05, + "loss": 3.0294, + "step": 98 + }, + { + "epoch": 0.0662290421039428, + "grad_norm": 1.7908884286880493, + "learning_rate": 1.1024498886414254e-05, + "loss": 3.241, + "step": 99 + }, + { + "epoch": 0.06689802232721495, + "grad_norm": 1.2711106538772583, + "learning_rate": 1.11358574610245e-05, + "loss": 2.9724, + "step": 100 + }, + { + "epoch": 0.0675670025504871, + "grad_norm": 1.7003251314163208, + "learning_rate": 1.1247216035634744e-05, + "loss": 3.0808, + "step": 101 + }, + { + "epoch": 0.06823598277375925, + "grad_norm": 1.5021196603775024, + "learning_rate": 1.1358574610244989e-05, + "loss": 3.0445, + "step": 102 + }, + { + "epoch": 0.0689049629970314, + "grad_norm": 1.330795168876648, + "learning_rate": 1.1469933184855234e-05, + "loss": 3.0629, + "step": 103 + }, + { + "epoch": 0.06957394322030355, + "grad_norm": 1.3143346309661865, + "learning_rate": 1.158129175946548e-05, + "loss": 3.0155, + "step": 104 + }, + { + "epoch": 0.0702429234435757, + "grad_norm": 1.519888162612915, + "learning_rate": 1.1692650334075724e-05, + "loss": 3.2906, + "step": 105 + }, + { + "epoch": 0.07091190366684785, + "grad_norm": 1.4895938634872437, + "learning_rate": 1.180400890868597e-05, + "loss": 3.0162, + "step": 106 + }, + { + "epoch": 0.07158088389012, + "grad_norm": 1.589227318763733, + "learning_rate": 1.1915367483296214e-05, + "loss": 2.9903, + "step": 107 + }, + { + "epoch": 0.07224986411339215, + "grad_norm": 1.701695203781128, + "learning_rate": 1.202672605790646e-05, + "loss": 3.1095, + "step": 108 + }, + { + "epoch": 0.0729188443366643, + "grad_norm": 1.981400489807129, + "learning_rate": 1.2138084632516705e-05, + "loss": 3.2036, + "step": 109 + }, + { + "epoch": 0.07358782455993644, + "grad_norm": 1.8499116897583008, + "learning_rate": 1.224944320712695e-05, + "loss": 3.0118, + "step": 110 + }, + { + "epoch": 0.07425680478320859, + "grad_norm": 1.604082465171814, + "learning_rate": 1.2360801781737195e-05, + "loss": 3.2236, + "step": 111 + }, + { + "epoch": 0.07492578500648074, + "grad_norm": 2.0322389602661133, + "learning_rate": 1.247216035634744e-05, + "loss": 3.3046, + "step": 112 + }, + { + "epoch": 0.07559476522975289, + "grad_norm": 2.7553274631500244, + "learning_rate": 1.2583518930957685e-05, + "loss": 3.1316, + "step": 113 + }, + { + "epoch": 0.07626374545302504, + "grad_norm": 1.6986571550369263, + "learning_rate": 1.2694877505567928e-05, + "loss": 3.182, + "step": 114 + }, + { + "epoch": 0.07693272567629719, + "grad_norm": 1.5125391483306885, + "learning_rate": 1.2806236080178175e-05, + "loss": 3.1322, + "step": 115 + }, + { + "epoch": 0.07760170589956934, + "grad_norm": 1.5707584619522095, + "learning_rate": 1.291759465478842e-05, + "loss": 3.045, + "step": 116 + }, + { + "epoch": 0.07827068612284149, + "grad_norm": 1.6684489250183105, + "learning_rate": 1.3028953229398663e-05, + "loss": 3.0713, + "step": 117 + }, + { + "epoch": 0.07893966634611364, + "grad_norm": 1.5122387409210205, + "learning_rate": 1.3140311804008909e-05, + "loss": 3.022, + "step": 118 + }, + { + "epoch": 0.0796086465693858, + "grad_norm": 2.1538209915161133, + "learning_rate": 1.3251670378619155e-05, + "loss": 3.2607, + "step": 119 + }, + { + "epoch": 0.08027762679265794, + "grad_norm": 1.4033334255218506, + "learning_rate": 1.33630289532294e-05, + "loss": 3.0916, + "step": 120 + }, + { + "epoch": 0.0809466070159301, + "grad_norm": 1.8540390729904175, + "learning_rate": 1.3474387527839644e-05, + "loss": 3.1077, + "step": 121 + }, + { + "epoch": 0.08161558723920224, + "grad_norm": 1.456629753112793, + "learning_rate": 1.3585746102449889e-05, + "loss": 2.9815, + "step": 122 + }, + { + "epoch": 0.0822845674624744, + "grad_norm": 1.5381335020065308, + "learning_rate": 1.3697104677060136e-05, + "loss": 2.9814, + "step": 123 + }, + { + "epoch": 0.08295354768574655, + "grad_norm": 1.2347851991653442, + "learning_rate": 1.3808463251670379e-05, + "loss": 2.7295, + "step": 124 + }, + { + "epoch": 0.0836225279090187, + "grad_norm": 1.2295702695846558, + "learning_rate": 1.3919821826280624e-05, + "loss": 2.8827, + "step": 125 + }, + { + "epoch": 0.08429150813229085, + "grad_norm": 1.7871609926223755, + "learning_rate": 1.4031180400890869e-05, + "loss": 3.0303, + "step": 126 + }, + { + "epoch": 0.08496048835556298, + "grad_norm": 1.6743766069412231, + "learning_rate": 1.4142538975501116e-05, + "loss": 2.9467, + "step": 127 + }, + { + "epoch": 0.08562946857883513, + "grad_norm": 1.4504770040512085, + "learning_rate": 1.425389755011136e-05, + "loss": 3.3573, + "step": 128 + }, + { + "epoch": 0.08629844880210728, + "grad_norm": 1.6345608234405518, + "learning_rate": 1.4365256124721604e-05, + "loss": 2.9953, + "step": 129 + }, + { + "epoch": 0.08696742902537943, + "grad_norm": 1.3987749814987183, + "learning_rate": 1.447661469933185e-05, + "loss": 2.7236, + "step": 130 + }, + { + "epoch": 0.08763640924865158, + "grad_norm": 1.359086275100708, + "learning_rate": 1.4587973273942093e-05, + "loss": 2.8055, + "step": 131 + }, + { + "epoch": 0.08830538947192373, + "grad_norm": 1.4174124002456665, + "learning_rate": 1.469933184855234e-05, + "loss": 2.9624, + "step": 132 + }, + { + "epoch": 0.08897436969519588, + "grad_norm": 1.8663897514343262, + "learning_rate": 1.4810690423162585e-05, + "loss": 2.9814, + "step": 133 + }, + { + "epoch": 0.08964334991846803, + "grad_norm": 2.121668577194214, + "learning_rate": 1.4922048997772831e-05, + "loss": 3.1841, + "step": 134 + }, + { + "epoch": 0.09031233014174019, + "grad_norm": 1.5990877151489258, + "learning_rate": 1.5033407572383073e-05, + "loss": 2.9322, + "step": 135 + }, + { + "epoch": 0.09098131036501234, + "grad_norm": 2.1767890453338623, + "learning_rate": 1.514476614699332e-05, + "loss": 3.1197, + "step": 136 + }, + { + "epoch": 0.09165029058828449, + "grad_norm": 1.5318809747695923, + "learning_rate": 1.5256124721603565e-05, + "loss": 2.9157, + "step": 137 + }, + { + "epoch": 0.09231927081155664, + "grad_norm": 2.0057833194732666, + "learning_rate": 1.5367483296213807e-05, + "loss": 3.0975, + "step": 138 + }, + { + "epoch": 0.09298825103482879, + "grad_norm": 1.485335350036621, + "learning_rate": 1.5478841870824053e-05, + "loss": 2.9477, + "step": 139 + }, + { + "epoch": 0.09365723125810094, + "grad_norm": 1.8702194690704346, + "learning_rate": 1.55902004454343e-05, + "loss": 3.0644, + "step": 140 + }, + { + "epoch": 0.09432621148137309, + "grad_norm": 1.8341530561447144, + "learning_rate": 1.5701559020044543e-05, + "loss": 3.0453, + "step": 141 + }, + { + "epoch": 0.09499519170464524, + "grad_norm": 1.6797006130218506, + "learning_rate": 1.581291759465479e-05, + "loss": 3.0143, + "step": 142 + }, + { + "epoch": 0.09566417192791737, + "grad_norm": 1.846571445465088, + "learning_rate": 1.5924276169265034e-05, + "loss": 3.18, + "step": 143 + }, + { + "epoch": 0.09633315215118952, + "grad_norm": 1.3143115043640137, + "learning_rate": 1.603563474387528e-05, + "loss": 2.6912, + "step": 144 + }, + { + "epoch": 0.09700213237446167, + "grad_norm": 1.8494148254394531, + "learning_rate": 1.6146993318485524e-05, + "loss": 3.0117, + "step": 145 + }, + { + "epoch": 0.09767111259773383, + "grad_norm": 1.6968753337860107, + "learning_rate": 1.625835189309577e-05, + "loss": 3.0856, + "step": 146 + }, + { + "epoch": 0.09834009282100598, + "grad_norm": 1.684114933013916, + "learning_rate": 1.6369710467706014e-05, + "loss": 3.1856, + "step": 147 + }, + { + "epoch": 0.09900907304427813, + "grad_norm": 1.4782801866531372, + "learning_rate": 1.6481069042316257e-05, + "loss": 2.9094, + "step": 148 + }, + { + "epoch": 0.09967805326755028, + "grad_norm": 1.9199340343475342, + "learning_rate": 1.6592427616926504e-05, + "loss": 2.9601, + "step": 149 + }, + { + "epoch": 0.10034703349082243, + "grad_norm": 1.7519869804382324, + "learning_rate": 1.670378619153675e-05, + "loss": 3.0994, + "step": 150 + }, + { + "epoch": 0.10101601371409458, + "grad_norm": 2.247997760772705, + "learning_rate": 1.6815144766146994e-05, + "loss": 3.277, + "step": 151 + }, + { + "epoch": 0.10168499393736673, + "grad_norm": 1.676098108291626, + "learning_rate": 1.6926503340757238e-05, + "loss": 3.0809, + "step": 152 + }, + { + "epoch": 0.10235397416063888, + "grad_norm": 1.3067691326141357, + "learning_rate": 1.7037861915367484e-05, + "loss": 2.8895, + "step": 153 + }, + { + "epoch": 0.10302295438391103, + "grad_norm": 1.6931229829788208, + "learning_rate": 1.714922048997773e-05, + "loss": 3.0187, + "step": 154 + }, + { + "epoch": 0.10369193460718318, + "grad_norm": 2.1117000579833984, + "learning_rate": 1.726057906458797e-05, + "loss": 3.2156, + "step": 155 + }, + { + "epoch": 0.10436091483045533, + "grad_norm": 1.5815112590789795, + "learning_rate": 1.7371937639198218e-05, + "loss": 2.8911, + "step": 156 + }, + { + "epoch": 0.10502989505372748, + "grad_norm": 1.5297530889511108, + "learning_rate": 1.7483296213808465e-05, + "loss": 3.0298, + "step": 157 + }, + { + "epoch": 0.10569887527699963, + "grad_norm": 1.5161750316619873, + "learning_rate": 1.759465478841871e-05, + "loss": 3.0969, + "step": 158 + }, + { + "epoch": 0.10636785550027178, + "grad_norm": 1.600009560585022, + "learning_rate": 1.7706013363028955e-05, + "loss": 2.9769, + "step": 159 + }, + { + "epoch": 0.10703683572354392, + "grad_norm": 1.8170793056488037, + "learning_rate": 1.7817371937639198e-05, + "loss": 2.9447, + "step": 160 + }, + { + "epoch": 0.10770581594681607, + "grad_norm": 1.4646176099777222, + "learning_rate": 1.7928730512249445e-05, + "loss": 2.9214, + "step": 161 + }, + { + "epoch": 0.10837479617008822, + "grad_norm": 1.8447264432907104, + "learning_rate": 1.804008908685969e-05, + "loss": 3.1705, + "step": 162 + }, + { + "epoch": 0.10904377639336037, + "grad_norm": 2.4299709796905518, + "learning_rate": 1.8151447661469935e-05, + "loss": 3.016, + "step": 163 + }, + { + "epoch": 0.10971275661663252, + "grad_norm": 1.999182105064392, + "learning_rate": 1.826280623608018e-05, + "loss": 2.98, + "step": 164 + }, + { + "epoch": 0.11038173683990467, + "grad_norm": 2.1972455978393555, + "learning_rate": 1.8374164810690425e-05, + "loss": 3.1834, + "step": 165 + }, + { + "epoch": 0.11105071706317682, + "grad_norm": 2.052306652069092, + "learning_rate": 1.848552338530067e-05, + "loss": 2.9967, + "step": 166 + }, + { + "epoch": 0.11171969728644897, + "grad_norm": 1.6119320392608643, + "learning_rate": 1.8596881959910915e-05, + "loss": 2.9699, + "step": 167 + }, + { + "epoch": 0.11238867750972112, + "grad_norm": 2.123548746109009, + "learning_rate": 1.870824053452116e-05, + "loss": 3.2142, + "step": 168 + }, + { + "epoch": 0.11305765773299327, + "grad_norm": 2.0712902545928955, + "learning_rate": 1.8819599109131402e-05, + "loss": 2.9206, + "step": 169 + }, + { + "epoch": 0.11372663795626542, + "grad_norm": 1.5530469417572021, + "learning_rate": 1.893095768374165e-05, + "loss": 2.9556, + "step": 170 + }, + { + "epoch": 0.11439561817953757, + "grad_norm": 1.4171209335327148, + "learning_rate": 1.9042316258351896e-05, + "loss": 2.9874, + "step": 171 + }, + { + "epoch": 0.11506459840280972, + "grad_norm": 1.9125628471374512, + "learning_rate": 1.915367483296214e-05, + "loss": 3.0581, + "step": 172 + }, + { + "epoch": 0.11573357862608187, + "grad_norm": 1.7964285612106323, + "learning_rate": 1.9265033407572382e-05, + "loss": 3.1749, + "step": 173 + }, + { + "epoch": 0.11640255884935402, + "grad_norm": 1.7148891687393188, + "learning_rate": 1.937639198218263e-05, + "loss": 2.7974, + "step": 174 + }, + { + "epoch": 0.11707153907262617, + "grad_norm": 1.8488072156906128, + "learning_rate": 1.9487750556792876e-05, + "loss": 3.0808, + "step": 175 + }, + { + "epoch": 0.11774051929589832, + "grad_norm": 1.799553632736206, + "learning_rate": 1.959910913140312e-05, + "loss": 3.0121, + "step": 176 + }, + { + "epoch": 0.11840949951917046, + "grad_norm": 1.8497250080108643, + "learning_rate": 1.9710467706013363e-05, + "loss": 3.0084, + "step": 177 + }, + { + "epoch": 0.11907847974244261, + "grad_norm": 1.743905782699585, + "learning_rate": 1.982182628062361e-05, + "loss": 2.9881, + "step": 178 + }, + { + "epoch": 0.11974745996571476, + "grad_norm": 2.140427827835083, + "learning_rate": 1.9933184855233856e-05, + "loss": 2.9783, + "step": 179 + }, + { + "epoch": 0.12041644018898691, + "grad_norm": 1.3036153316497803, + "learning_rate": 2.00445434298441e-05, + "loss": 2.8561, + "step": 180 + }, + { + "epoch": 0.12108542041225906, + "grad_norm": 1.707446813583374, + "learning_rate": 2.0155902004454343e-05, + "loss": 3.0634, + "step": 181 + }, + { + "epoch": 0.12175440063553121, + "grad_norm": 1.7313611507415771, + "learning_rate": 2.026726057906459e-05, + "loss": 3.0231, + "step": 182 + }, + { + "epoch": 0.12242338085880336, + "grad_norm": 1.5043818950653076, + "learning_rate": 2.0378619153674833e-05, + "loss": 2.7388, + "step": 183 + }, + { + "epoch": 0.12309236108207551, + "grad_norm": 1.794662356376648, + "learning_rate": 2.048997772828508e-05, + "loss": 3.0349, + "step": 184 + }, + { + "epoch": 0.12376134130534766, + "grad_norm": 1.933425784111023, + "learning_rate": 2.0601336302895323e-05, + "loss": 3.107, + "step": 185 + }, + { + "epoch": 0.12443032152861981, + "grad_norm": 2.4808244705200195, + "learning_rate": 2.071269487750557e-05, + "loss": 3.0081, + "step": 186 + }, + { + "epoch": 0.12509930175189196, + "grad_norm": 1.7459521293640137, + "learning_rate": 2.0824053452115813e-05, + "loss": 2.9181, + "step": 187 + }, + { + "epoch": 0.1257682819751641, + "grad_norm": 2.0204484462738037, + "learning_rate": 2.093541202672606e-05, + "loss": 3.2941, + "step": 188 + }, + { + "epoch": 0.12643726219843626, + "grad_norm": 1.8456350564956665, + "learning_rate": 2.1046770601336304e-05, + "loss": 3.0197, + "step": 189 + }, + { + "epoch": 0.1271062424217084, + "grad_norm": 1.5815969705581665, + "learning_rate": 2.1158129175946547e-05, + "loss": 2.949, + "step": 190 + }, + { + "epoch": 0.12777522264498056, + "grad_norm": 2.190483570098877, + "learning_rate": 2.1269487750556794e-05, + "loss": 3.0315, + "step": 191 + }, + { + "epoch": 0.1284442028682527, + "grad_norm": 1.8807663917541504, + "learning_rate": 2.138084632516704e-05, + "loss": 2.8794, + "step": 192 + }, + { + "epoch": 0.12911318309152486, + "grad_norm": 1.9881584644317627, + "learning_rate": 2.1492204899777284e-05, + "loss": 2.9104, + "step": 193 + }, + { + "epoch": 0.129782163314797, + "grad_norm": 2.114427328109741, + "learning_rate": 2.1603563474387527e-05, + "loss": 2.8785, + "step": 194 + }, + { + "epoch": 0.13045114353806916, + "grad_norm": 2.0160298347473145, + "learning_rate": 2.1714922048997774e-05, + "loss": 3.0206, + "step": 195 + }, + { + "epoch": 0.1311201237613413, + "grad_norm": 1.7591657638549805, + "learning_rate": 2.182628062360802e-05, + "loss": 3.0766, + "step": 196 + }, + { + "epoch": 0.13178910398461346, + "grad_norm": 2.251678705215454, + "learning_rate": 2.1937639198218264e-05, + "loss": 3.0232, + "step": 197 + }, + { + "epoch": 0.1324580842078856, + "grad_norm": 2.139174699783325, + "learning_rate": 2.2048997772828508e-05, + "loss": 2.9806, + "step": 198 + }, + { + "epoch": 0.13312706443115777, + "grad_norm": 2.0383951663970947, + "learning_rate": 2.2160356347438754e-05, + "loss": 3.3346, + "step": 199 + }, + { + "epoch": 0.1337960446544299, + "grad_norm": 2.2171547412872314, + "learning_rate": 2.2271714922049e-05, + "loss": 2.8831, + "step": 200 + }, + { + "epoch": 0.13446502487770207, + "grad_norm": 2.0654296875, + "learning_rate": 2.2383073496659245e-05, + "loss": 2.7744, + "step": 201 + }, + { + "epoch": 0.1351340051009742, + "grad_norm": 1.9895446300506592, + "learning_rate": 2.2494432071269488e-05, + "loss": 3.0192, + "step": 202 + }, + { + "epoch": 0.13580298532424634, + "grad_norm": 2.3785881996154785, + "learning_rate": 2.2605790645879735e-05, + "loss": 3.1008, + "step": 203 + }, + { + "epoch": 0.1364719655475185, + "grad_norm": 2.2556653022766113, + "learning_rate": 2.2717149220489978e-05, + "loss": 3.2238, + "step": 204 + }, + { + "epoch": 0.13714094577079064, + "grad_norm": 2.219691038131714, + "learning_rate": 2.2828507795100225e-05, + "loss": 2.8803, + "step": 205 + }, + { + "epoch": 0.1378099259940628, + "grad_norm": 1.679511308670044, + "learning_rate": 2.2939866369710468e-05, + "loss": 2.7962, + "step": 206 + }, + { + "epoch": 0.13847890621733494, + "grad_norm": 1.844044804573059, + "learning_rate": 2.3051224944320715e-05, + "loss": 3.0686, + "step": 207 + }, + { + "epoch": 0.1391478864406071, + "grad_norm": 1.6532880067825317, + "learning_rate": 2.316258351893096e-05, + "loss": 3.0698, + "step": 208 + }, + { + "epoch": 0.13981686666387924, + "grad_norm": 2.2440009117126465, + "learning_rate": 2.3273942093541205e-05, + "loss": 3.0008, + "step": 209 + }, + { + "epoch": 0.1404858468871514, + "grad_norm": 1.1852771043777466, + "learning_rate": 2.338530066815145e-05, + "loss": 2.6482, + "step": 210 + }, + { + "epoch": 0.14115482711042354, + "grad_norm": 1.9627621173858643, + "learning_rate": 2.3496659242761692e-05, + "loss": 2.9175, + "step": 211 + }, + { + "epoch": 0.1418238073336957, + "grad_norm": 1.5972543954849243, + "learning_rate": 2.360801781737194e-05, + "loss": 2.874, + "step": 212 + }, + { + "epoch": 0.14249278755696784, + "grad_norm": 2.07195782661438, + "learning_rate": 2.3719376391982185e-05, + "loss": 2.8547, + "step": 213 + }, + { + "epoch": 0.14316176778024, + "grad_norm": 2.280430555343628, + "learning_rate": 2.383073496659243e-05, + "loss": 2.9573, + "step": 214 + }, + { + "epoch": 0.14383074800351214, + "grad_norm": 1.7816762924194336, + "learning_rate": 2.3942093541202672e-05, + "loss": 3.0404, + "step": 215 + }, + { + "epoch": 0.1444997282267843, + "grad_norm": 1.383423089981079, + "learning_rate": 2.405345211581292e-05, + "loss": 2.8613, + "step": 216 + }, + { + "epoch": 0.14516870845005644, + "grad_norm": 1.8071893453598022, + "learning_rate": 2.4164810690423166e-05, + "loss": 3.0848, + "step": 217 + }, + { + "epoch": 0.1458376886733286, + "grad_norm": 1.5270397663116455, + "learning_rate": 2.427616926503341e-05, + "loss": 2.8631, + "step": 218 + }, + { + "epoch": 0.14650666889660074, + "grad_norm": 1.8604637384414673, + "learning_rate": 2.4387527839643652e-05, + "loss": 2.9685, + "step": 219 + }, + { + "epoch": 0.14717564911987288, + "grad_norm": 1.9972350597381592, + "learning_rate": 2.44988864142539e-05, + "loss": 2.7886, + "step": 220 + }, + { + "epoch": 0.14784462934314505, + "grad_norm": 1.5495383739471436, + "learning_rate": 2.4610244988864146e-05, + "loss": 2.8556, + "step": 221 + }, + { + "epoch": 0.14851360956641718, + "grad_norm": 1.7119460105895996, + "learning_rate": 2.472160356347439e-05, + "loss": 2.971, + "step": 222 + }, + { + "epoch": 0.14918258978968935, + "grad_norm": 1.7634843587875366, + "learning_rate": 2.4832962138084633e-05, + "loss": 2.791, + "step": 223 + }, + { + "epoch": 0.14985157001296148, + "grad_norm": 1.7723678350448608, + "learning_rate": 2.494432071269488e-05, + "loss": 2.8888, + "step": 224 + }, + { + "epoch": 0.15052055023623365, + "grad_norm": 2.0935842990875244, + "learning_rate": 2.5055679287305123e-05, + "loss": 3.0744, + "step": 225 + }, + { + "epoch": 0.15118953045950578, + "grad_norm": 2.2163760662078857, + "learning_rate": 2.516703786191537e-05, + "loss": 3.0321, + "step": 226 + }, + { + "epoch": 0.15185851068277795, + "grad_norm": 2.2750024795532227, + "learning_rate": 2.5278396436525613e-05, + "loss": 3.055, + "step": 227 + }, + { + "epoch": 0.15252749090605008, + "grad_norm": 2.189126968383789, + "learning_rate": 2.5389755011135856e-05, + "loss": 3.0768, + "step": 228 + }, + { + "epoch": 0.15319647112932225, + "grad_norm": 2.0731709003448486, + "learning_rate": 2.5501113585746107e-05, + "loss": 2.8946, + "step": 229 + }, + { + "epoch": 0.15386545135259438, + "grad_norm": 2.6770660877227783, + "learning_rate": 2.561247216035635e-05, + "loss": 3.2448, + "step": 230 + }, + { + "epoch": 0.15453443157586655, + "grad_norm": 2.252547264099121, + "learning_rate": 2.5723830734966593e-05, + "loss": 3.1992, + "step": 231 + }, + { + "epoch": 0.15520341179913869, + "grad_norm": 2.1851205825805664, + "learning_rate": 2.583518930957684e-05, + "loss": 2.9443, + "step": 232 + }, + { + "epoch": 0.15587239202241085, + "grad_norm": 1.8990890979766846, + "learning_rate": 2.5946547884187083e-05, + "loss": 2.7778, + "step": 233 + }, + { + "epoch": 0.15654137224568299, + "grad_norm": 1.4797788858413696, + "learning_rate": 2.6057906458797327e-05, + "loss": 2.9676, + "step": 234 + }, + { + "epoch": 0.15721035246895515, + "grad_norm": 2.029318332672119, + "learning_rate": 2.6169265033407574e-05, + "loss": 3.1086, + "step": 235 + }, + { + "epoch": 0.1578793326922273, + "grad_norm": 2.179896354675293, + "learning_rate": 2.6280623608017817e-05, + "loss": 2.9045, + "step": 236 + }, + { + "epoch": 0.15854831291549942, + "grad_norm": 1.391133189201355, + "learning_rate": 2.639198218262806e-05, + "loss": 2.694, + "step": 237 + }, + { + "epoch": 0.1592172931387716, + "grad_norm": 2.4943625926971436, + "learning_rate": 2.650334075723831e-05, + "loss": 3.1102, + "step": 238 + }, + { + "epoch": 0.15988627336204372, + "grad_norm": 2.750905752182007, + "learning_rate": 2.6614699331848554e-05, + "loss": 3.0298, + "step": 239 + }, + { + "epoch": 0.1605552535853159, + "grad_norm": 1.7482820749282837, + "learning_rate": 2.67260579064588e-05, + "loss": 2.83, + "step": 240 + }, + { + "epoch": 0.16122423380858802, + "grad_norm": 2.7642734050750732, + "learning_rate": 2.6837416481069044e-05, + "loss": 3.1332, + "step": 241 + }, + { + "epoch": 0.1618932140318602, + "grad_norm": 2.404057025909424, + "learning_rate": 2.6948775055679287e-05, + "loss": 3.1092, + "step": 242 + }, + { + "epoch": 0.16256219425513233, + "grad_norm": 1.8724385499954224, + "learning_rate": 2.7060133630289534e-05, + "loss": 3.0315, + "step": 243 + }, + { + "epoch": 0.1632311744784045, + "grad_norm": 1.7511506080627441, + "learning_rate": 2.7171492204899778e-05, + "loss": 2.7904, + "step": 244 + }, + { + "epoch": 0.16390015470167663, + "grad_norm": 1.8246021270751953, + "learning_rate": 2.728285077951002e-05, + "loss": 3.0182, + "step": 245 + }, + { + "epoch": 0.1645691349249488, + "grad_norm": 1.8768839836120605, + "learning_rate": 2.739420935412027e-05, + "loss": 2.9269, + "step": 246 + }, + { + "epoch": 0.16523811514822093, + "grad_norm": 2.0913779735565186, + "learning_rate": 2.7505567928730515e-05, + "loss": 2.8438, + "step": 247 + }, + { + "epoch": 0.1659070953714931, + "grad_norm": 2.4028401374816895, + "learning_rate": 2.7616926503340758e-05, + "loss": 2.8489, + "step": 248 + }, + { + "epoch": 0.16657607559476523, + "grad_norm": 1.8068379163742065, + "learning_rate": 2.7728285077951005e-05, + "loss": 2.9883, + "step": 249 + }, + { + "epoch": 0.1672450558180374, + "grad_norm": 2.4506468772888184, + "learning_rate": 2.7839643652561248e-05, + "loss": 2.8825, + "step": 250 + }, + { + "epoch": 0.16791403604130953, + "grad_norm": 1.8266322612762451, + "learning_rate": 2.795100222717149e-05, + "loss": 3.1515, + "step": 251 + }, + { + "epoch": 0.1685830162645817, + "grad_norm": 3.492474317550659, + "learning_rate": 2.8062360801781738e-05, + "loss": 3.055, + "step": 252 + }, + { + "epoch": 0.16925199648785383, + "grad_norm": 2.0248913764953613, + "learning_rate": 2.817371937639198e-05, + "loss": 2.8643, + "step": 253 + }, + { + "epoch": 0.16992097671112597, + "grad_norm": 2.751221179962158, + "learning_rate": 2.8285077951002232e-05, + "loss": 2.9571, + "step": 254 + }, + { + "epoch": 0.17058995693439813, + "grad_norm": 1.8173011541366577, + "learning_rate": 2.8396436525612475e-05, + "loss": 2.98, + "step": 255 + }, + { + "epoch": 0.17125893715767027, + "grad_norm": 2.3821542263031006, + "learning_rate": 2.850779510022272e-05, + "loss": 3.0517, + "step": 256 + }, + { + "epoch": 0.17192791738094243, + "grad_norm": 2.088780403137207, + "learning_rate": 2.8619153674832965e-05, + "loss": 2.9371, + "step": 257 + }, + { + "epoch": 0.17259689760421457, + "grad_norm": 1.652597188949585, + "learning_rate": 2.873051224944321e-05, + "loss": 3.0313, + "step": 258 + }, + { + "epoch": 0.17326587782748673, + "grad_norm": 1.7598010301589966, + "learning_rate": 2.8841870824053452e-05, + "loss": 2.8893, + "step": 259 + }, + { + "epoch": 0.17393485805075887, + "grad_norm": 1.3597828149795532, + "learning_rate": 2.89532293986637e-05, + "loss": 2.6477, + "step": 260 + }, + { + "epoch": 0.17460383827403103, + "grad_norm": 1.9741261005401611, + "learning_rate": 2.9064587973273942e-05, + "loss": 3.0815, + "step": 261 + }, + { + "epoch": 0.17527281849730317, + "grad_norm": 2.220665693283081, + "learning_rate": 2.9175946547884186e-05, + "loss": 3.0102, + "step": 262 + }, + { + "epoch": 0.17594179872057533, + "grad_norm": 1.7396165132522583, + "learning_rate": 2.9287305122494436e-05, + "loss": 3.0626, + "step": 263 + }, + { + "epoch": 0.17661077894384747, + "grad_norm": 1.7730218172073364, + "learning_rate": 2.939866369710468e-05, + "loss": 2.9754, + "step": 264 + }, + { + "epoch": 0.17727975916711963, + "grad_norm": 2.6833832263946533, + "learning_rate": 2.9510022271714922e-05, + "loss": 3.2264, + "step": 265 + }, + { + "epoch": 0.17794873939039177, + "grad_norm": 1.9124982357025146, + "learning_rate": 2.962138084632517e-05, + "loss": 2.9205, + "step": 266 + }, + { + "epoch": 0.17861771961366393, + "grad_norm": 2.111903190612793, + "learning_rate": 2.9732739420935413e-05, + "loss": 2.981, + "step": 267 + }, + { + "epoch": 0.17928669983693607, + "grad_norm": 2.5708305835723877, + "learning_rate": 2.9844097995545663e-05, + "loss": 3.0503, + "step": 268 + }, + { + "epoch": 0.1799556800602082, + "grad_norm": 2.538877010345459, + "learning_rate": 2.9955456570155903e-05, + "loss": 3.1285, + "step": 269 + }, + { + "epoch": 0.18062466028348037, + "grad_norm": 2.1339826583862305, + "learning_rate": 3.0066815144766146e-05, + "loss": 2.9144, + "step": 270 + }, + { + "epoch": 0.1812936405067525, + "grad_norm": 1.9419770240783691, + "learning_rate": 3.0178173719376396e-05, + "loss": 2.8248, + "step": 271 + }, + { + "epoch": 0.18196262073002467, + "grad_norm": 1.6631495952606201, + "learning_rate": 3.028953229398664e-05, + "loss": 2.8192, + "step": 272 + }, + { + "epoch": 0.1826316009532968, + "grad_norm": 2.0016918182373047, + "learning_rate": 3.0400890868596883e-05, + "loss": 2.7806, + "step": 273 + }, + { + "epoch": 0.18330058117656897, + "grad_norm": 2.2896358966827393, + "learning_rate": 3.051224944320713e-05, + "loss": 3.047, + "step": 274 + }, + { + "epoch": 0.1839695613998411, + "grad_norm": 2.4204511642456055, + "learning_rate": 3.0623608017817377e-05, + "loss": 3.0408, + "step": 275 + }, + { + "epoch": 0.18463854162311327, + "grad_norm": 2.3816628456115723, + "learning_rate": 3.073496659242761e-05, + "loss": 2.9486, + "step": 276 + }, + { + "epoch": 0.1853075218463854, + "grad_norm": 1.8626177310943604, + "learning_rate": 3.084632516703786e-05, + "loss": 3.0183, + "step": 277 + }, + { + "epoch": 0.18597650206965757, + "grad_norm": 2.2854461669921875, + "learning_rate": 3.095768374164811e-05, + "loss": 3.0818, + "step": 278 + }, + { + "epoch": 0.1866454822929297, + "grad_norm": 2.1244637966156006, + "learning_rate": 3.106904231625835e-05, + "loss": 3.0196, + "step": 279 + }, + { + "epoch": 0.18731446251620187, + "grad_norm": 1.7232093811035156, + "learning_rate": 3.11804008908686e-05, + "loss": 2.8402, + "step": 280 + }, + { + "epoch": 0.187983442739474, + "grad_norm": 1.849948763847351, + "learning_rate": 3.1291759465478844e-05, + "loss": 2.9892, + "step": 281 + }, + { + "epoch": 0.18865242296274617, + "grad_norm": 1.760595440864563, + "learning_rate": 3.140311804008909e-05, + "loss": 2.6975, + "step": 282 + }, + { + "epoch": 0.1893214031860183, + "grad_norm": 2.229926824569702, + "learning_rate": 3.151447661469934e-05, + "loss": 3.0247, + "step": 283 + }, + { + "epoch": 0.18999038340929048, + "grad_norm": 2.4357099533081055, + "learning_rate": 3.162583518930958e-05, + "loss": 2.8133, + "step": 284 + }, + { + "epoch": 0.1906593636325626, + "grad_norm": 1.6331199407577515, + "learning_rate": 3.1737193763919824e-05, + "loss": 2.8615, + "step": 285 + }, + { + "epoch": 0.19132834385583475, + "grad_norm": 2.13657546043396, + "learning_rate": 3.184855233853007e-05, + "loss": 3.0742, + "step": 286 + }, + { + "epoch": 0.1919973240791069, + "grad_norm": 2.2347702980041504, + "learning_rate": 3.195991091314031e-05, + "loss": 2.9647, + "step": 287 + }, + { + "epoch": 0.19266630430237905, + "grad_norm": 2.1436219215393066, + "learning_rate": 3.207126948775056e-05, + "loss": 2.9529, + "step": 288 + }, + { + "epoch": 0.1933352845256512, + "grad_norm": 1.7989563941955566, + "learning_rate": 3.2182628062360804e-05, + "loss": 2.9052, + "step": 289 + }, + { + "epoch": 0.19400426474892335, + "grad_norm": 2.0161685943603516, + "learning_rate": 3.229398663697105e-05, + "loss": 2.9663, + "step": 290 + }, + { + "epoch": 0.1946732449721955, + "grad_norm": 1.6002366542816162, + "learning_rate": 3.24053452115813e-05, + "loss": 2.929, + "step": 291 + }, + { + "epoch": 0.19534222519546765, + "grad_norm": 1.668652057647705, + "learning_rate": 3.251670378619154e-05, + "loss": 2.8479, + "step": 292 + }, + { + "epoch": 0.19601120541873981, + "grad_norm": 1.8546702861785889, + "learning_rate": 3.262806236080178e-05, + "loss": 2.9774, + "step": 293 + }, + { + "epoch": 0.19668018564201195, + "grad_norm": 2.1030890941619873, + "learning_rate": 3.273942093541203e-05, + "loss": 2.8901, + "step": 294 + }, + { + "epoch": 0.19734916586528412, + "grad_norm": 1.4736385345458984, + "learning_rate": 3.285077951002227e-05, + "loss": 2.963, + "step": 295 + }, + { + "epoch": 0.19801814608855625, + "grad_norm": 1.8257207870483398, + "learning_rate": 3.2962138084632515e-05, + "loss": 2.98, + "step": 296 + }, + { + "epoch": 0.19868712631182842, + "grad_norm": 2.160999298095703, + "learning_rate": 3.3073496659242765e-05, + "loss": 2.7556, + "step": 297 + }, + { + "epoch": 0.19935610653510055, + "grad_norm": 2.267620801925659, + "learning_rate": 3.318485523385301e-05, + "loss": 2.798, + "step": 298 + }, + { + "epoch": 0.20002508675837272, + "grad_norm": 2.425196409225464, + "learning_rate": 3.329621380846326e-05, + "loss": 3.0942, + "step": 299 + }, + { + "epoch": 0.20069406698164485, + "grad_norm": 2.4903581142425537, + "learning_rate": 3.34075723830735e-05, + "loss": 2.9861, + "step": 300 + }, + { + "epoch": 0.20136304720491702, + "grad_norm": 1.8119566440582275, + "learning_rate": 3.3518930957683745e-05, + "loss": 2.8985, + "step": 301 + }, + { + "epoch": 0.20203202742818915, + "grad_norm": 3.619180202484131, + "learning_rate": 3.363028953229399e-05, + "loss": 3.0096, + "step": 302 + }, + { + "epoch": 0.2027010076514613, + "grad_norm": 2.1625945568084717, + "learning_rate": 3.374164810690423e-05, + "loss": 2.871, + "step": 303 + }, + { + "epoch": 0.20336998787473345, + "grad_norm": 2.3259880542755127, + "learning_rate": 3.3853006681514475e-05, + "loss": 3.1425, + "step": 304 + }, + { + "epoch": 0.2040389680980056, + "grad_norm": 2.2823097705841064, + "learning_rate": 3.3964365256124725e-05, + "loss": 2.9197, + "step": 305 + }, + { + "epoch": 0.20470794832127776, + "grad_norm": 2.460858106613159, + "learning_rate": 3.407572383073497e-05, + "loss": 2.9594, + "step": 306 + }, + { + "epoch": 0.2053769285445499, + "grad_norm": 2.1677207946777344, + "learning_rate": 3.418708240534521e-05, + "loss": 3.0082, + "step": 307 + }, + { + "epoch": 0.20604590876782206, + "grad_norm": 2.5266077518463135, + "learning_rate": 3.429844097995546e-05, + "loss": 3.2755, + "step": 308 + }, + { + "epoch": 0.2067148889910942, + "grad_norm": 2.598078727722168, + "learning_rate": 3.4409799554565706e-05, + "loss": 3.0439, + "step": 309 + }, + { + "epoch": 0.20738386921436636, + "grad_norm": 2.6901001930236816, + "learning_rate": 3.452115812917594e-05, + "loss": 3.2016, + "step": 310 + }, + { + "epoch": 0.2080528494376385, + "grad_norm": 2.208012342453003, + "learning_rate": 3.463251670378619e-05, + "loss": 3.0823, + "step": 311 + }, + { + "epoch": 0.20872182966091066, + "grad_norm": 2.5381200313568115, + "learning_rate": 3.4743875278396436e-05, + "loss": 2.8945, + "step": 312 + }, + { + "epoch": 0.2093908098841828, + "grad_norm": 2.7676663398742676, + "learning_rate": 3.4855233853006686e-05, + "loss": 3.2512, + "step": 313 + }, + { + "epoch": 0.21005979010745496, + "grad_norm": 1.880428433418274, + "learning_rate": 3.496659242761693e-05, + "loss": 2.8545, + "step": 314 + }, + { + "epoch": 0.2107287703307271, + "grad_norm": 1.7739355564117432, + "learning_rate": 3.507795100222717e-05, + "loss": 2.8965, + "step": 315 + }, + { + "epoch": 0.21139775055399926, + "grad_norm": 2.162645101547241, + "learning_rate": 3.518930957683742e-05, + "loss": 3.005, + "step": 316 + }, + { + "epoch": 0.2120667307772714, + "grad_norm": 1.821035385131836, + "learning_rate": 3.5300668151447666e-05, + "loss": 3.0064, + "step": 317 + }, + { + "epoch": 0.21273571100054356, + "grad_norm": 3.1734619140625, + "learning_rate": 3.541202672605791e-05, + "loss": 3.1068, + "step": 318 + }, + { + "epoch": 0.2134046912238157, + "grad_norm": 3.1917405128479004, + "learning_rate": 3.552338530066815e-05, + "loss": 3.0866, + "step": 319 + }, + { + "epoch": 0.21407367144708783, + "grad_norm": 2.3374900817871094, + "learning_rate": 3.5634743875278396e-05, + "loss": 2.9519, + "step": 320 + }, + { + "epoch": 0.21474265167036, + "grad_norm": 2.6081695556640625, + "learning_rate": 3.574610244988864e-05, + "loss": 3.0563, + "step": 321 + }, + { + "epoch": 0.21541163189363213, + "grad_norm": 2.195274591445923, + "learning_rate": 3.585746102449889e-05, + "loss": 2.9423, + "step": 322 + }, + { + "epoch": 0.2160806121169043, + "grad_norm": 3.0954089164733887, + "learning_rate": 3.596881959910913e-05, + "loss": 3.0645, + "step": 323 + }, + { + "epoch": 0.21674959234017643, + "grad_norm": 2.7243683338165283, + "learning_rate": 3.608017817371938e-05, + "loss": 3.0141, + "step": 324 + }, + { + "epoch": 0.2174185725634486, + "grad_norm": 2.720536947250366, + "learning_rate": 3.619153674832963e-05, + "loss": 3.0356, + "step": 325 + }, + { + "epoch": 0.21808755278672073, + "grad_norm": 2.3073344230651855, + "learning_rate": 3.630289532293987e-05, + "loss": 2.8578, + "step": 326 + }, + { + "epoch": 0.2187565330099929, + "grad_norm": 3.312396287918091, + "learning_rate": 3.6414253897550114e-05, + "loss": 3.1004, + "step": 327 + }, + { + "epoch": 0.21942551323326503, + "grad_norm": 2.946176528930664, + "learning_rate": 3.652561247216036e-05, + "loss": 2.9902, + "step": 328 + }, + { + "epoch": 0.2200944934565372, + "grad_norm": 2.6662216186523438, + "learning_rate": 3.66369710467706e-05, + "loss": 2.9431, + "step": 329 + }, + { + "epoch": 0.22076347367980934, + "grad_norm": 2.286525249481201, + "learning_rate": 3.674832962138085e-05, + "loss": 2.9168, + "step": 330 + }, + { + "epoch": 0.2214324539030815, + "grad_norm": 2.14633846282959, + "learning_rate": 3.6859688195991094e-05, + "loss": 2.9081, + "step": 331 + }, + { + "epoch": 0.22210143412635364, + "grad_norm": 2.405035972595215, + "learning_rate": 3.697104677060134e-05, + "loss": 3.1149, + "step": 332 + }, + { + "epoch": 0.2227704143496258, + "grad_norm": 2.870598554611206, + "learning_rate": 3.708240534521159e-05, + "loss": 2.9908, + "step": 333 + }, + { + "epoch": 0.22343939457289794, + "grad_norm": 3.6809451580047607, + "learning_rate": 3.719376391982183e-05, + "loss": 2.8389, + "step": 334 + }, + { + "epoch": 0.2241083747961701, + "grad_norm": 4.133260726928711, + "learning_rate": 3.7305122494432074e-05, + "loss": 2.9818, + "step": 335 + }, + { + "epoch": 0.22477735501944224, + "grad_norm": 2.9487850666046143, + "learning_rate": 3.741648106904232e-05, + "loss": 3.086, + "step": 336 + }, + { + "epoch": 0.22544633524271437, + "grad_norm": 2.474928617477417, + "learning_rate": 3.752783964365256e-05, + "loss": 2.9965, + "step": 337 + }, + { + "epoch": 0.22611531546598654, + "grad_norm": 2.069495439529419, + "learning_rate": 3.7639198218262804e-05, + "loss": 2.8583, + "step": 338 + }, + { + "epoch": 0.22678429568925867, + "grad_norm": 2.0604686737060547, + "learning_rate": 3.7750556792873054e-05, + "loss": 2.7425, + "step": 339 + }, + { + "epoch": 0.22745327591253084, + "grad_norm": 2.515392541885376, + "learning_rate": 3.78619153674833e-05, + "loss": 3.0675, + "step": 340 + }, + { + "epoch": 0.22812225613580298, + "grad_norm": 2.4841203689575195, + "learning_rate": 3.797327394209355e-05, + "loss": 3.0104, + "step": 341 + }, + { + "epoch": 0.22879123635907514, + "grad_norm": 3.003702163696289, + "learning_rate": 3.808463251670379e-05, + "loss": 3.0008, + "step": 342 + }, + { + "epoch": 0.22946021658234728, + "grad_norm": 2.7392759323120117, + "learning_rate": 3.8195991091314035e-05, + "loss": 2.8839, + "step": 343 + }, + { + "epoch": 0.23012919680561944, + "grad_norm": 2.9119346141815186, + "learning_rate": 3.830734966592428e-05, + "loss": 2.6593, + "step": 344 + }, + { + "epoch": 0.23079817702889158, + "grad_norm": 3.402265787124634, + "learning_rate": 3.841870824053452e-05, + "loss": 2.978, + "step": 345 + }, + { + "epoch": 0.23146715725216374, + "grad_norm": 2.0154426097869873, + "learning_rate": 3.8530066815144765e-05, + "loss": 2.8954, + "step": 346 + }, + { + "epoch": 0.23213613747543588, + "grad_norm": 2.149036169052124, + "learning_rate": 3.8641425389755015e-05, + "loss": 2.8539, + "step": 347 + }, + { + "epoch": 0.23280511769870804, + "grad_norm": 2.9926230907440186, + "learning_rate": 3.875278396436526e-05, + "loss": 3.0115, + "step": 348 + }, + { + "epoch": 0.23347409792198018, + "grad_norm": 2.1109957695007324, + "learning_rate": 3.88641425389755e-05, + "loss": 2.7915, + "step": 349 + }, + { + "epoch": 0.23414307814525234, + "grad_norm": 1.8408353328704834, + "learning_rate": 3.897550111358575e-05, + "loss": 2.6129, + "step": 350 + }, + { + "epoch": 0.23481205836852448, + "grad_norm": 4.683795928955078, + "learning_rate": 3.9086859688195995e-05, + "loss": 2.9513, + "step": 351 + }, + { + "epoch": 0.23548103859179664, + "grad_norm": 2.911135673522949, + "learning_rate": 3.919821826280624e-05, + "loss": 2.9199, + "step": 352 + }, + { + "epoch": 0.23615001881506878, + "grad_norm": 3.0055723190307617, + "learning_rate": 3.930957683741648e-05, + "loss": 2.7776, + "step": 353 + }, + { + "epoch": 0.23681899903834092, + "grad_norm": 2.214090347290039, + "learning_rate": 3.9420935412026726e-05, + "loss": 2.8616, + "step": 354 + }, + { + "epoch": 0.23748797926161308, + "grad_norm": 4.9794135093688965, + "learning_rate": 3.9532293986636976e-05, + "loss": 2.9461, + "step": 355 + }, + { + "epoch": 0.23815695948488522, + "grad_norm": 2.91196870803833, + "learning_rate": 3.964365256124722e-05, + "loss": 3.1177, + "step": 356 + }, + { + "epoch": 0.23882593970815738, + "grad_norm": 3.165623664855957, + "learning_rate": 3.975501113585746e-05, + "loss": 2.8003, + "step": 357 + }, + { + "epoch": 0.23949491993142952, + "grad_norm": 2.178732395172119, + "learning_rate": 3.986636971046771e-05, + "loss": 3.1017, + "step": 358 + }, + { + "epoch": 0.24016390015470168, + "grad_norm": 2.2613189220428467, + "learning_rate": 3.9977728285077956e-05, + "loss": 2.9624, + "step": 359 + }, + { + "epoch": 0.24083288037797382, + "grad_norm": 3.2467665672302246, + "learning_rate": 4.00890868596882e-05, + "loss": 2.7477, + "step": 360 + }, + { + "epoch": 0.24150186060124598, + "grad_norm": 4.1236467361450195, + "learning_rate": 4.020044543429844e-05, + "loss": 2.8327, + "step": 361 + }, + { + "epoch": 0.24217084082451812, + "grad_norm": 6.381893157958984, + "learning_rate": 4.0311804008908686e-05, + "loss": 2.9811, + "step": 362 + }, + { + "epoch": 0.24283982104779028, + "grad_norm": 4.767080783843994, + "learning_rate": 4.042316258351893e-05, + "loss": 2.7984, + "step": 363 + }, + { + "epoch": 0.24350880127106242, + "grad_norm": 3.955322504043579, + "learning_rate": 4.053452115812918e-05, + "loss": 3.0451, + "step": 364 + }, + { + "epoch": 0.24417778149433458, + "grad_norm": 2.52062726020813, + "learning_rate": 4.064587973273942e-05, + "loss": 3.0299, + "step": 365 + }, + { + "epoch": 0.24484676171760672, + "grad_norm": 2.5843591690063477, + "learning_rate": 4.0757238307349666e-05, + "loss": 2.9065, + "step": 366 + }, + { + "epoch": 0.24551574194087888, + "grad_norm": 2.6871824264526367, + "learning_rate": 4.0868596881959917e-05, + "loss": 2.9752, + "step": 367 + }, + { + "epoch": 0.24618472216415102, + "grad_norm": 2.8572964668273926, + "learning_rate": 4.097995545657016e-05, + "loss": 3.0047, + "step": 368 + }, + { + "epoch": 0.24685370238742319, + "grad_norm": 3.1573598384857178, + "learning_rate": 4.10913140311804e-05, + "loss": 2.8892, + "step": 369 + }, + { + "epoch": 0.24752268261069532, + "grad_norm": 4.083068370819092, + "learning_rate": 4.120267260579065e-05, + "loss": 2.5709, + "step": 370 + }, + { + "epoch": 0.24819166283396746, + "grad_norm": 4.6753387451171875, + "learning_rate": 4.131403118040089e-05, + "loss": 3.1801, + "step": 371 + }, + { + "epoch": 0.24886064305723962, + "grad_norm": 3.2260594367980957, + "learning_rate": 4.142538975501114e-05, + "loss": 3.1373, + "step": 372 + }, + { + "epoch": 0.24952962328051176, + "grad_norm": 3.0378241539001465, + "learning_rate": 4.1536748329621384e-05, + "loss": 3.0976, + "step": 373 + }, + { + "epoch": 0.2501986035037839, + "grad_norm": 3.077693462371826, + "learning_rate": 4.164810690423163e-05, + "loss": 3.0883, + "step": 374 + }, + { + "epoch": 0.2508675837270561, + "grad_norm": 4.166459083557129, + "learning_rate": 4.175946547884188e-05, + "loss": 3.178, + "step": 375 + }, + { + "epoch": 0.2515365639503282, + "grad_norm": 3.4417877197265625, + "learning_rate": 4.187082405345212e-05, + "loss": 3.028, + "step": 376 + }, + { + "epoch": 0.25220554417360036, + "grad_norm": 3.2813799381256104, + "learning_rate": 4.1982182628062364e-05, + "loss": 3.0955, + "step": 377 + }, + { + "epoch": 0.2528745243968725, + "grad_norm": 3.2578415870666504, + "learning_rate": 4.209354120267261e-05, + "loss": 3.0973, + "step": 378 + }, + { + "epoch": 0.2535435046201447, + "grad_norm": 7.314671516418457, + "learning_rate": 4.220489977728285e-05, + "loss": 3.1364, + "step": 379 + }, + { + "epoch": 0.2542124848434168, + "grad_norm": 4.096634387969971, + "learning_rate": 4.2316258351893094e-05, + "loss": 2.9937, + "step": 380 + }, + { + "epoch": 0.25488146506668896, + "grad_norm": 8.772887229919434, + "learning_rate": 4.2427616926503344e-05, + "loss": 2.9784, + "step": 381 + }, + { + "epoch": 0.2555504452899611, + "grad_norm": 3.3955204486846924, + "learning_rate": 4.253897550111359e-05, + "loss": 2.9533, + "step": 382 + }, + { + "epoch": 0.2562194255132333, + "grad_norm": 2.985086679458618, + "learning_rate": 4.265033407572383e-05, + "loss": 3.0271, + "step": 383 + }, + { + "epoch": 0.2568884057365054, + "grad_norm": 3.7264626026153564, + "learning_rate": 4.276169265033408e-05, + "loss": 3.0873, + "step": 384 + }, + { + "epoch": 0.25755738595977756, + "grad_norm": 3.565138101577759, + "learning_rate": 4.2873051224944324e-05, + "loss": 2.9025, + "step": 385 + }, + { + "epoch": 0.2582263661830497, + "grad_norm": 2.661137342453003, + "learning_rate": 4.298440979955457e-05, + "loss": 3.1241, + "step": 386 + }, + { + "epoch": 0.25889534640632184, + "grad_norm": 2.084932804107666, + "learning_rate": 4.309576837416481e-05, + "loss": 2.911, + "step": 387 + }, + { + "epoch": 0.259564326629594, + "grad_norm": 2.9260430335998535, + "learning_rate": 4.3207126948775055e-05, + "loss": 3.0158, + "step": 388 + }, + { + "epoch": 0.26023330685286616, + "grad_norm": 2.6525022983551025, + "learning_rate": 4.3318485523385305e-05, + "loss": 2.8838, + "step": 389 + }, + { + "epoch": 0.26090228707613833, + "grad_norm": 2.613525629043579, + "learning_rate": 4.342984409799555e-05, + "loss": 2.9201, + "step": 390 + }, + { + "epoch": 0.26157126729941044, + "grad_norm": 3.924321413040161, + "learning_rate": 4.354120267260579e-05, + "loss": 2.9793, + "step": 391 + }, + { + "epoch": 0.2622402475226826, + "grad_norm": 5.627641201019287, + "learning_rate": 4.365256124721604e-05, + "loss": 3.081, + "step": 392 + }, + { + "epoch": 0.26290922774595477, + "grad_norm": 3.294105052947998, + "learning_rate": 4.3763919821826285e-05, + "loss": 2.8867, + "step": 393 + }, + { + "epoch": 0.26357820796922693, + "grad_norm": 2.795708179473877, + "learning_rate": 4.387527839643653e-05, + "loss": 2.8974, + "step": 394 + }, + { + "epoch": 0.26424718819249904, + "grad_norm": 5.553576946258545, + "learning_rate": 4.398663697104677e-05, + "loss": 3.1988, + "step": 395 + }, + { + "epoch": 0.2649161684157712, + "grad_norm": 3.2928688526153564, + "learning_rate": 4.4097995545657015e-05, + "loss": 2.8335, + "step": 396 + }, + { + "epoch": 0.26558514863904337, + "grad_norm": 5.719513893127441, + "learning_rate": 4.420935412026726e-05, + "loss": 3.0254, + "step": 397 + }, + { + "epoch": 0.26625412886231553, + "grad_norm": 3.231088161468506, + "learning_rate": 4.432071269487751e-05, + "loss": 3.1652, + "step": 398 + }, + { + "epoch": 0.26692310908558764, + "grad_norm": 4.428014278411865, + "learning_rate": 4.443207126948775e-05, + "loss": 3.1403, + "step": 399 + }, + { + "epoch": 0.2675920893088598, + "grad_norm": 2.7273247241973877, + "learning_rate": 4.4543429844098e-05, + "loss": 2.9933, + "step": 400 + }, + { + "epoch": 0.26826106953213197, + "grad_norm": 3.1555709838867188, + "learning_rate": 4.4654788418708246e-05, + "loss": 2.7692, + "step": 401 + }, + { + "epoch": 0.26893004975540413, + "grad_norm": 4.41472053527832, + "learning_rate": 4.476614699331849e-05, + "loss": 3.1536, + "step": 402 + }, + { + "epoch": 0.26959902997867624, + "grad_norm": 2.652088165283203, + "learning_rate": 4.487750556792873e-05, + "loss": 2.7263, + "step": 403 + }, + { + "epoch": 0.2702680102019484, + "grad_norm": 2.2178761959075928, + "learning_rate": 4.4988864142538976e-05, + "loss": 2.7703, + "step": 404 + }, + { + "epoch": 0.27093699042522057, + "grad_norm": 3.2548203468322754, + "learning_rate": 4.510022271714922e-05, + "loss": 3.0788, + "step": 405 + }, + { + "epoch": 0.2716059706484927, + "grad_norm": 3.69232177734375, + "learning_rate": 4.521158129175947e-05, + "loss": 3.1247, + "step": 406 + }, + { + "epoch": 0.27227495087176484, + "grad_norm": 2.2741658687591553, + "learning_rate": 4.532293986636971e-05, + "loss": 2.9171, + "step": 407 + }, + { + "epoch": 0.272943931095037, + "grad_norm": 3.9409332275390625, + "learning_rate": 4.5434298440979956e-05, + "loss": 3.1141, + "step": 408 + }, + { + "epoch": 0.27361291131830917, + "grad_norm": 4.8423004150390625, + "learning_rate": 4.5545657015590206e-05, + "loss": 3.0009, + "step": 409 + }, + { + "epoch": 0.2742818915415813, + "grad_norm": 4.420136451721191, + "learning_rate": 4.565701559020045e-05, + "loss": 3.2071, + "step": 410 + }, + { + "epoch": 0.27495087176485344, + "grad_norm": 2.833836317062378, + "learning_rate": 4.576837416481069e-05, + "loss": 2.8665, + "step": 411 + }, + { + "epoch": 0.2756198519881256, + "grad_norm": 4.68223237991333, + "learning_rate": 4.5879732739420936e-05, + "loss": 3.0481, + "step": 412 + }, + { + "epoch": 0.2762888322113978, + "grad_norm": 1.9087761640548706, + "learning_rate": 4.599109131403118e-05, + "loss": 2.8015, + "step": 413 + }, + { + "epoch": 0.2769578124346699, + "grad_norm": 5.1107177734375, + "learning_rate": 4.610244988864143e-05, + "loss": 2.9523, + "step": 414 + }, + { + "epoch": 0.27762679265794205, + "grad_norm": 5.10582160949707, + "learning_rate": 4.621380846325167e-05, + "loss": 2.6587, + "step": 415 + }, + { + "epoch": 0.2782957728812142, + "grad_norm": 5.031997203826904, + "learning_rate": 4.632516703786192e-05, + "loss": 2.7179, + "step": 416 + }, + { + "epoch": 0.2789647531044864, + "grad_norm": 3.6552696228027344, + "learning_rate": 4.643652561247217e-05, + "loss": 2.8385, + "step": 417 + }, + { + "epoch": 0.2796337333277585, + "grad_norm": 4.179525852203369, + "learning_rate": 4.654788418708241e-05, + "loss": 3.0226, + "step": 418 + }, + { + "epoch": 0.28030271355103065, + "grad_norm": 5.424846649169922, + "learning_rate": 4.6659242761692654e-05, + "loss": 3.0232, + "step": 419 + }, + { + "epoch": 0.2809716937743028, + "grad_norm": 3.7056894302368164, + "learning_rate": 4.67706013363029e-05, + "loss": 2.9397, + "step": 420 + }, + { + "epoch": 0.2816406739975749, + "grad_norm": 3.52085542678833, + "learning_rate": 4.688195991091314e-05, + "loss": 3.1203, + "step": 421 + }, + { + "epoch": 0.2823096542208471, + "grad_norm": 4.416035175323486, + "learning_rate": 4.6993318485523384e-05, + "loss": 3.1685, + "step": 422 + }, + { + "epoch": 0.28297863444411925, + "grad_norm": 4.974792957305908, + "learning_rate": 4.7104677060133634e-05, + "loss": 2.9907, + "step": 423 + }, + { + "epoch": 0.2836476146673914, + "grad_norm": 2.7833263874053955, + "learning_rate": 4.721603563474388e-05, + "loss": 2.9569, + "step": 424 + }, + { + "epoch": 0.2843165948906635, + "grad_norm": 4.347465991973877, + "learning_rate": 4.732739420935412e-05, + "loss": 2.9085, + "step": 425 + }, + { + "epoch": 0.2849855751139357, + "grad_norm": 4.685590744018555, + "learning_rate": 4.743875278396437e-05, + "loss": 2.945, + "step": 426 + }, + { + "epoch": 0.28565455533720785, + "grad_norm": 2.592012643814087, + "learning_rate": 4.7550111358574614e-05, + "loss": 3.0512, + "step": 427 + }, + { + "epoch": 0.28632353556048, + "grad_norm": 2.9887845516204834, + "learning_rate": 4.766146993318486e-05, + "loss": 3.0683, + "step": 428 + }, + { + "epoch": 0.2869925157837521, + "grad_norm": 2.813981294631958, + "learning_rate": 4.77728285077951e-05, + "loss": 2.9379, + "step": 429 + }, + { + "epoch": 0.2876614960070243, + "grad_norm": 4.186840057373047, + "learning_rate": 4.7884187082405344e-05, + "loss": 2.8466, + "step": 430 + }, + { + "epoch": 0.28833047623029645, + "grad_norm": 2.8538403511047363, + "learning_rate": 4.7995545657015594e-05, + "loss": 2.9927, + "step": 431 + }, + { + "epoch": 0.2889994564535686, + "grad_norm": 2.1021909713745117, + "learning_rate": 4.810690423162584e-05, + "loss": 2.7849, + "step": 432 + }, + { + "epoch": 0.2896684366768407, + "grad_norm": 1.9388480186462402, + "learning_rate": 4.821826280623608e-05, + "loss": 3.2563, + "step": 433 + }, + { + "epoch": 0.2903374169001129, + "grad_norm": 4.667293548583984, + "learning_rate": 4.832962138084633e-05, + "loss": 2.9185, + "step": 434 + }, + { + "epoch": 0.29100639712338505, + "grad_norm": 3.824524164199829, + "learning_rate": 4.8440979955456575e-05, + "loss": 2.9848, + "step": 435 + }, + { + "epoch": 0.2916753773466572, + "grad_norm": 2.4440345764160156, + "learning_rate": 4.855233853006682e-05, + "loss": 2.8454, + "step": 436 + }, + { + "epoch": 0.2923443575699293, + "grad_norm": 7.678300380706787, + "learning_rate": 4.866369710467706e-05, + "loss": 2.9164, + "step": 437 + }, + { + "epoch": 0.2930133377932015, + "grad_norm": 3.7858011722564697, + "learning_rate": 4.8775055679287305e-05, + "loss": 2.9595, + "step": 438 + }, + { + "epoch": 0.29368231801647365, + "grad_norm": 4.288517951965332, + "learning_rate": 4.888641425389755e-05, + "loss": 2.8055, + "step": 439 + }, + { + "epoch": 0.29435129823974576, + "grad_norm": 5.512247085571289, + "learning_rate": 4.89977728285078e-05, + "loss": 3.0377, + "step": 440 + }, + { + "epoch": 0.2950202784630179, + "grad_norm": 4.056970596313477, + "learning_rate": 4.910913140311804e-05, + "loss": 3.2016, + "step": 441 + }, + { + "epoch": 0.2956892586862901, + "grad_norm": 3.7022035121917725, + "learning_rate": 4.922048997772829e-05, + "loss": 3.1512, + "step": 442 + }, + { + "epoch": 0.29635823890956225, + "grad_norm": 2.9726853370666504, + "learning_rate": 4.9331848552338535e-05, + "loss": 3.1247, + "step": 443 + }, + { + "epoch": 0.29702721913283436, + "grad_norm": 2.665365219116211, + "learning_rate": 4.944320712694878e-05, + "loss": 2.871, + "step": 444 + }, + { + "epoch": 0.29769619935610653, + "grad_norm": 2.9526169300079346, + "learning_rate": 4.955456570155902e-05, + "loss": 2.8726, + "step": 445 + }, + { + "epoch": 0.2983651795793787, + "grad_norm": 3.251380681991577, + "learning_rate": 4.9665924276169265e-05, + "loss": 3.0167, + "step": 446 + }, + { + "epoch": 0.29903415980265086, + "grad_norm": 3.6996798515319824, + "learning_rate": 4.977728285077951e-05, + "loss": 3.0624, + "step": 447 + }, + { + "epoch": 0.29970314002592296, + "grad_norm": 3.7724545001983643, + "learning_rate": 4.988864142538976e-05, + "loss": 2.8873, + "step": 448 + }, + { + "epoch": 0.30037212024919513, + "grad_norm": 3.4690496921539307, + "learning_rate": 5e-05, + "loss": 3.0608, + "step": 449 + }, + { + "epoch": 0.3010411004724673, + "grad_norm": 2.773916482925415, + "learning_rate": 4.999999241504004e-05, + "loss": 2.9176, + "step": 450 + }, + { + "epoch": 0.30171008069573946, + "grad_norm": 5.8442816734313965, + "learning_rate": 4.999996966016478e-05, + "loss": 3.1301, + "step": 451 + }, + { + "epoch": 0.30237906091901157, + "grad_norm": 5.579975605010986, + "learning_rate": 4.9999931735387995e-05, + "loss": 3.0301, + "step": 452 + }, + { + "epoch": 0.30304804114228373, + "grad_norm": 5.85215425491333, + "learning_rate": 4.999987864073273e-05, + "loss": 3.0296, + "step": 453 + }, + { + "epoch": 0.3037170213655559, + "grad_norm": 3.585082530975342, + "learning_rate": 4.999981037623118e-05, + "loss": 2.9487, + "step": 454 + }, + { + "epoch": 0.304386001588828, + "grad_norm": 3.605626344680786, + "learning_rate": 4.999972694192479e-05, + "loss": 3.0024, + "step": 455 + }, + { + "epoch": 0.30505498181210017, + "grad_norm": 6.199455738067627, + "learning_rate": 4.999962833786417e-05, + "loss": 3.354, + "step": 456 + }, + { + "epoch": 0.30572396203537233, + "grad_norm": 4.334702014923096, + "learning_rate": 4.999951456410915e-05, + "loss": 3.1339, + "step": 457 + }, + { + "epoch": 0.3063929422586445, + "grad_norm": 4.66254186630249, + "learning_rate": 4.9999385620728776e-05, + "loss": 3.1656, + "step": 458 + }, + { + "epoch": 0.3070619224819166, + "grad_norm": 3.108375072479248, + "learning_rate": 4.99992415078013e-05, + "loss": 2.8593, + "step": 459 + }, + { + "epoch": 0.30773090270518877, + "grad_norm": 4.193397521972656, + "learning_rate": 4.9999082225414154e-05, + "loss": 3.0536, + "step": 460 + }, + { + "epoch": 0.30839988292846093, + "grad_norm": 3.317269802093506, + "learning_rate": 4.9998907773663996e-05, + "loss": 3.0542, + "step": 461 + }, + { + "epoch": 0.3090688631517331, + "grad_norm": 3.166783332824707, + "learning_rate": 4.9998718152656684e-05, + "loss": 2.902, + "step": 462 + }, + { + "epoch": 0.3097378433750052, + "grad_norm": 2.3117363452911377, + "learning_rate": 4.9998513362507274e-05, + "loss": 2.7614, + "step": 463 + }, + { + "epoch": 0.31040682359827737, + "grad_norm": 4.291511058807373, + "learning_rate": 4.999829340334003e-05, + "loss": 3.2065, + "step": 464 + }, + { + "epoch": 0.31107580382154953, + "grad_norm": 4.51165246963501, + "learning_rate": 4.9998058275288435e-05, + "loss": 2.9523, + "step": 465 + }, + { + "epoch": 0.3117447840448217, + "grad_norm": 3.7631454467773438, + "learning_rate": 4.9997807978495154e-05, + "loss": 2.9064, + "step": 466 + }, + { + "epoch": 0.3124137642680938, + "grad_norm": 4.295648574829102, + "learning_rate": 4.999754251311207e-05, + "loss": 2.8078, + "step": 467 + }, + { + "epoch": 0.31308274449136597, + "grad_norm": 4.3560004234313965, + "learning_rate": 4.9997261879300264e-05, + "loss": 3.2831, + "step": 468 + }, + { + "epoch": 0.31375172471463814, + "grad_norm": 4.275026798248291, + "learning_rate": 4.999696607723003e-05, + "loss": 2.8916, + "step": 469 + }, + { + "epoch": 0.3144207049379103, + "grad_norm": 2.104069232940674, + "learning_rate": 4.999665510708085e-05, + "loss": 2.8333, + "step": 470 + }, + { + "epoch": 0.3150896851611824, + "grad_norm": 3.4207301139831543, + "learning_rate": 4.999632896904143e-05, + "loss": 2.9654, + "step": 471 + }, + { + "epoch": 0.3157586653844546, + "grad_norm": 4.362611293792725, + "learning_rate": 4.9995987663309656e-05, + "loss": 2.9717, + "step": 472 + }, + { + "epoch": 0.31642764560772674, + "grad_norm": 3.473769426345825, + "learning_rate": 4.999563119009264e-05, + "loss": 3.0075, + "step": 473 + }, + { + "epoch": 0.31709662583099885, + "grad_norm": 2.608717441558838, + "learning_rate": 4.99952595496067e-05, + "loss": 2.7839, + "step": 474 + }, + { + "epoch": 0.317765606054271, + "grad_norm": 3.1809604167938232, + "learning_rate": 4.9994872742077327e-05, + "loss": 2.9022, + "step": 475 + }, + { + "epoch": 0.3184345862775432, + "grad_norm": 3.7251923084259033, + "learning_rate": 4.999447076773924e-05, + "loss": 3.1133, + "step": 476 + }, + { + "epoch": 0.31910356650081534, + "grad_norm": 2.886922597885132, + "learning_rate": 4.999405362683636e-05, + "loss": 3.0508, + "step": 477 + }, + { + "epoch": 0.31977254672408745, + "grad_norm": 6.421143054962158, + "learning_rate": 4.9993621319621804e-05, + "loss": 3.0336, + "step": 478 + }, + { + "epoch": 0.3204415269473596, + "grad_norm": 5.60941219329834, + "learning_rate": 4.9993173846357896e-05, + "loss": 3.0946, + "step": 479 + }, + { + "epoch": 0.3211105071706318, + "grad_norm": 1.9423307180404663, + "learning_rate": 4.9992711207316156e-05, + "loss": 2.6631, + "step": 480 + }, + { + "epoch": 0.32177948739390394, + "grad_norm": 3.3870861530303955, + "learning_rate": 4.999223340277732e-05, + "loss": 2.754, + "step": 481 + }, + { + "epoch": 0.32244846761717605, + "grad_norm": 2.7448318004608154, + "learning_rate": 4.999174043303132e-05, + "loss": 2.9059, + "step": 482 + }, + { + "epoch": 0.3231174478404482, + "grad_norm": 3.669349431991577, + "learning_rate": 4.999123229837728e-05, + "loss": 3.1572, + "step": 483 + }, + { + "epoch": 0.3237864280637204, + "grad_norm": 5.015190601348877, + "learning_rate": 4.999070899912353e-05, + "loss": 3.1667, + "step": 484 + }, + { + "epoch": 0.32445540828699254, + "grad_norm": 4.793431282043457, + "learning_rate": 4.999017053558762e-05, + "loss": 3.0626, + "step": 485 + }, + { + "epoch": 0.32512438851026465, + "grad_norm": 2.3480074405670166, + "learning_rate": 4.998961690809628e-05, + "loss": 3.1293, + "step": 486 + }, + { + "epoch": 0.3257933687335368, + "grad_norm": 3.8708577156066895, + "learning_rate": 4.998904811698545e-05, + "loss": 3.2092, + "step": 487 + }, + { + "epoch": 0.326462348956809, + "grad_norm": 3.8278703689575195, + "learning_rate": 4.998846416260028e-05, + "loss": 2.8935, + "step": 488 + }, + { + "epoch": 0.3271313291800811, + "grad_norm": 4.190570831298828, + "learning_rate": 4.99878650452951e-05, + "loss": 3.0671, + "step": 489 + }, + { + "epoch": 0.32780030940335325, + "grad_norm": 3.4385035037994385, + "learning_rate": 4.998725076543345e-05, + "loss": 2.8071, + "step": 490 + }, + { + "epoch": 0.3284692896266254, + "grad_norm": 3.1834115982055664, + "learning_rate": 4.998662132338808e-05, + "loss": 2.9962, + "step": 491 + }, + { + "epoch": 0.3291382698498976, + "grad_norm": 4.123552322387695, + "learning_rate": 4.9985976719540936e-05, + "loss": 2.932, + "step": 492 + }, + { + "epoch": 0.3298072500731697, + "grad_norm": 3.3126766681671143, + "learning_rate": 4.998531695428316e-05, + "loss": 2.968, + "step": 493 + }, + { + "epoch": 0.33047623029644185, + "grad_norm": 4.15322732925415, + "learning_rate": 4.99846420280151e-05, + "loss": 3.1475, + "step": 494 + }, + { + "epoch": 0.331145210519714, + "grad_norm": 3.0752909183502197, + "learning_rate": 4.998395194114628e-05, + "loss": 2.9903, + "step": 495 + }, + { + "epoch": 0.3318141907429862, + "grad_norm": 4.031482696533203, + "learning_rate": 4.9983246694095455e-05, + "loss": 3.0917, + "step": 496 + }, + { + "epoch": 0.3324831709662583, + "grad_norm": 2.93766450881958, + "learning_rate": 4.998252628729058e-05, + "loss": 2.8263, + "step": 497 + }, + { + "epoch": 0.33315215118953045, + "grad_norm": 2.928539752960205, + "learning_rate": 4.9981790721168767e-05, + "loss": 3.0096, + "step": 498 + }, + { + "epoch": 0.3338211314128026, + "grad_norm": 4.941032409667969, + "learning_rate": 4.9981039996176375e-05, + "loss": 2.9287, + "step": 499 + }, + { + "epoch": 0.3344901116360748, + "grad_norm": 4.886608600616455, + "learning_rate": 4.998027411276894e-05, + "loss": 3.0736, + "step": 500 + }, + { + "epoch": 0.3351590918593469, + "grad_norm": 4.183567047119141, + "learning_rate": 4.997949307141119e-05, + "loss": 2.8601, + "step": 501 + }, + { + "epoch": 0.33582807208261906, + "grad_norm": 4.0794243812561035, + "learning_rate": 4.997869687257707e-05, + "loss": 3.002, + "step": 502 + }, + { + "epoch": 0.3364970523058912, + "grad_norm": 6.189320087432861, + "learning_rate": 4.99778855167497e-05, + "loss": 2.928, + "step": 503 + }, + { + "epoch": 0.3371660325291634, + "grad_norm": 3.896519184112549, + "learning_rate": 4.997705900442141e-05, + "loss": 3.0234, + "step": 504 + }, + { + "epoch": 0.3378350127524355, + "grad_norm": 8.249415397644043, + "learning_rate": 4.9976217336093726e-05, + "loss": 2.8567, + "step": 505 + }, + { + "epoch": 0.33850399297570766, + "grad_norm": 5.5367937088012695, + "learning_rate": 4.997536051227738e-05, + "loss": 2.9441, + "step": 506 + }, + { + "epoch": 0.3391729731989798, + "grad_norm": 5.139379978179932, + "learning_rate": 4.997448853349227e-05, + "loss": 3.1405, + "step": 507 + }, + { + "epoch": 0.33984195342225193, + "grad_norm": 4.751509189605713, + "learning_rate": 4.997360140026752e-05, + "loss": 2.8483, + "step": 508 + }, + { + "epoch": 0.3405109336455241, + "grad_norm": 5.006605625152588, + "learning_rate": 4.997269911314145e-05, + "loss": 2.9661, + "step": 509 + }, + { + "epoch": 0.34117991386879626, + "grad_norm": 3.2533388137817383, + "learning_rate": 4.997178167266155e-05, + "loss": 2.891, + "step": 510 + }, + { + "epoch": 0.3418488940920684, + "grad_norm": 5.7418670654296875, + "learning_rate": 4.9970849079384524e-05, + "loss": 2.8222, + "step": 511 + }, + { + "epoch": 0.34251787431534053, + "grad_norm": 5.2950005531311035, + "learning_rate": 4.9969901333876264e-05, + "loss": 2.9353, + "step": 512 + }, + { + "epoch": 0.3431868545386127, + "grad_norm": 2.9090206623077393, + "learning_rate": 4.996893843671187e-05, + "loss": 2.7708, + "step": 513 + }, + { + "epoch": 0.34385583476188486, + "grad_norm": 5.055557727813721, + "learning_rate": 4.996796038847561e-05, + "loss": 2.897, + "step": 514 + }, + { + "epoch": 0.344524814985157, + "grad_norm": 5.402209281921387, + "learning_rate": 4.996696718976098e-05, + "loss": 3.1266, + "step": 515 + }, + { + "epoch": 0.34519379520842913, + "grad_norm": 3.475196599960327, + "learning_rate": 4.9965958841170626e-05, + "loss": 2.911, + "step": 516 + }, + { + "epoch": 0.3458627754317013, + "grad_norm": 5.288271427154541, + "learning_rate": 4.9964935343316435e-05, + "loss": 2.9801, + "step": 517 + }, + { + "epoch": 0.34653175565497346, + "grad_norm": 5.355285167694092, + "learning_rate": 4.9963896696819433e-05, + "loss": 3.0884, + "step": 518 + }, + { + "epoch": 0.3472007358782456, + "grad_norm": 6.395686149597168, + "learning_rate": 4.99628429023099e-05, + "loss": 3.1425, + "step": 519 + }, + { + "epoch": 0.34786971610151773, + "grad_norm": 5.087702751159668, + "learning_rate": 4.9961773960427246e-05, + "loss": 2.9261, + "step": 520 + }, + { + "epoch": 0.3485386963247899, + "grad_norm": 2.206958532333374, + "learning_rate": 4.996068987182012e-05, + "loss": 2.6969, + "step": 521 + }, + { + "epoch": 0.34920767654806206, + "grad_norm": 3.5213186740875244, + "learning_rate": 4.995959063714634e-05, + "loss": 2.6573, + "step": 522 + }, + { + "epoch": 0.34987665677133417, + "grad_norm": 5.5672807693481445, + "learning_rate": 4.9958476257072914e-05, + "loss": 3.1532, + "step": 523 + }, + { + "epoch": 0.35054563699460634, + "grad_norm": 3.522601842880249, + "learning_rate": 4.995734673227605e-05, + "loss": 2.826, + "step": 524 + }, + { + "epoch": 0.3512146172178785, + "grad_norm": 9.747411727905273, + "learning_rate": 4.9956202063441135e-05, + "loss": 3.0387, + "step": 525 + }, + { + "epoch": 0.35188359744115066, + "grad_norm": 4.434753894805908, + "learning_rate": 4.995504225126275e-05, + "loss": 3.0938, + "step": 526 + }, + { + "epoch": 0.3525525776644228, + "grad_norm": 4.533684253692627, + "learning_rate": 4.9953867296444665e-05, + "loss": 2.9057, + "step": 527 + }, + { + "epoch": 0.35322155788769494, + "grad_norm": 3.0737051963806152, + "learning_rate": 4.9952677199699846e-05, + "loss": 2.9397, + "step": 528 + }, + { + "epoch": 0.3538905381109671, + "grad_norm": 4.464580535888672, + "learning_rate": 4.995147196175044e-05, + "loss": 2.8025, + "step": 529 + }, + { + "epoch": 0.35455951833423927, + "grad_norm": 7.038727760314941, + "learning_rate": 4.9950251583327767e-05, + "loss": 3.2705, + "step": 530 + }, + { + "epoch": 0.3552284985575114, + "grad_norm": 3.4099175930023193, + "learning_rate": 4.994901606517236e-05, + "loss": 2.9136, + "step": 531 + }, + { + "epoch": 0.35589747878078354, + "grad_norm": 5.16508674621582, + "learning_rate": 4.994776540803393e-05, + "loss": 2.9786, + "step": 532 + }, + { + "epoch": 0.3565664590040557, + "grad_norm": 3.2725794315338135, + "learning_rate": 4.994649961267136e-05, + "loss": 2.8186, + "step": 533 + }, + { + "epoch": 0.35723543922732787, + "grad_norm": 2.896885633468628, + "learning_rate": 4.994521867985275e-05, + "loss": 3.0125, + "step": 534 + }, + { + "epoch": 0.3579044194506, + "grad_norm": 6.460264682769775, + "learning_rate": 4.994392261035534e-05, + "loss": 3.0882, + "step": 535 + }, + { + "epoch": 0.35857339967387214, + "grad_norm": 3.8949625492095947, + "learning_rate": 4.994261140496561e-05, + "loss": 2.8898, + "step": 536 + }, + { + "epoch": 0.3592423798971443, + "grad_norm": 4.002209186553955, + "learning_rate": 4.9941285064479165e-05, + "loss": 3.2028, + "step": 537 + }, + { + "epoch": 0.3599113601204164, + "grad_norm": 3.564258575439453, + "learning_rate": 4.9939943589700845e-05, + "loss": 3.1001, + "step": 538 + }, + { + "epoch": 0.3605803403436886, + "grad_norm": 2.9036061763763428, + "learning_rate": 4.9938586981444647e-05, + "loss": 2.9339, + "step": 539 + }, + { + "epoch": 0.36124932056696074, + "grad_norm": 5.254669666290283, + "learning_rate": 4.9937215240533757e-05, + "loss": 2.8677, + "step": 540 + }, + { + "epoch": 0.3619183007902329, + "grad_norm": 3.252579927444458, + "learning_rate": 4.9935828367800544e-05, + "loss": 3.0247, + "step": 541 + }, + { + "epoch": 0.362587281013505, + "grad_norm": 4.90103006362915, + "learning_rate": 4.9934426364086554e-05, + "loss": 3.0836, + "step": 542 + }, + { + "epoch": 0.3632562612367772, + "grad_norm": 4.381466865539551, + "learning_rate": 4.9933009230242524e-05, + "loss": 3.1326, + "step": 543 + }, + { + "epoch": 0.36392524146004934, + "grad_norm": 2.9965758323669434, + "learning_rate": 4.993157696712836e-05, + "loss": 2.8662, + "step": 544 + }, + { + "epoch": 0.3645942216833215, + "grad_norm": 3.2848966121673584, + "learning_rate": 4.9930129575613156e-05, + "loss": 2.8297, + "step": 545 + }, + { + "epoch": 0.3652632019065936, + "grad_norm": 5.529489994049072, + "learning_rate": 4.9928667056575185e-05, + "loss": 2.9997, + "step": 546 + }, + { + "epoch": 0.3659321821298658, + "grad_norm": 4.317118167877197, + "learning_rate": 4.9927189410901905e-05, + "loss": 3.0713, + "step": 547 + }, + { + "epoch": 0.36660116235313794, + "grad_norm": 3.1297154426574707, + "learning_rate": 4.992569663948994e-05, + "loss": 2.6562, + "step": 548 + }, + { + "epoch": 0.3672701425764101, + "grad_norm": 3.4097821712493896, + "learning_rate": 4.992418874324509e-05, + "loss": 2.7812, + "step": 549 + }, + { + "epoch": 0.3679391227996822, + "grad_norm": 3.653111696243286, + "learning_rate": 4.992266572308237e-05, + "loss": 2.9268, + "step": 550 + }, + { + "epoch": 0.3686081030229544, + "grad_norm": 4.445350170135498, + "learning_rate": 4.992112757992591e-05, + "loss": 3.0955, + "step": 551 + }, + { + "epoch": 0.36927708324622655, + "grad_norm": 5.325762748718262, + "learning_rate": 4.991957431470908e-05, + "loss": 2.9877, + "step": 552 + }, + { + "epoch": 0.3699460634694987, + "grad_norm": 3.6300137042999268, + "learning_rate": 4.991800592837438e-05, + "loss": 2.9609, + "step": 553 + }, + { + "epoch": 0.3706150436927708, + "grad_norm": 3.8826160430908203, + "learning_rate": 4.9916422421873496e-05, + "loss": 2.9752, + "step": 554 + }, + { + "epoch": 0.371284023916043, + "grad_norm": 3.5641696453094482, + "learning_rate": 4.991482379616731e-05, + "loss": 2.7834, + "step": 555 + }, + { + "epoch": 0.37195300413931515, + "grad_norm": 3.369718074798584, + "learning_rate": 4.991321005222585e-05, + "loss": 2.9023, + "step": 556 + }, + { + "epoch": 0.37262198436258726, + "grad_norm": 3.8720474243164062, + "learning_rate": 4.991158119102834e-05, + "loss": 3.0678, + "step": 557 + }, + { + "epoch": 0.3732909645858594, + "grad_norm": 3.0319697856903076, + "learning_rate": 4.9909937213563165e-05, + "loss": 2.8119, + "step": 558 + }, + { + "epoch": 0.3739599448091316, + "grad_norm": 5.15587043762207, + "learning_rate": 4.9908278120827886e-05, + "loss": 2.9111, + "step": 559 + }, + { + "epoch": 0.37462892503240375, + "grad_norm": 3.232497215270996, + "learning_rate": 4.990660391382923e-05, + "loss": 2.9465, + "step": 560 + }, + { + "epoch": 0.37529790525567586, + "grad_norm": 4.08522891998291, + "learning_rate": 4.990491459358311e-05, + "loss": 3.0208, + "step": 561 + }, + { + "epoch": 0.375966885478948, + "grad_norm": 3.3792853355407715, + "learning_rate": 4.990321016111459e-05, + "loss": 2.92, + "step": 562 + }, + { + "epoch": 0.3766358657022202, + "grad_norm": 3.6902658939361572, + "learning_rate": 4.990149061745791e-05, + "loss": 3.0366, + "step": 563 + }, + { + "epoch": 0.37730484592549235, + "grad_norm": 4.229647636413574, + "learning_rate": 4.9899755963656506e-05, + "loss": 3.0959, + "step": 564 + }, + { + "epoch": 0.37797382614876446, + "grad_norm": 5.087015151977539, + "learning_rate": 4.989800620076295e-05, + "loss": 2.8492, + "step": 565 + }, + { + "epoch": 0.3786428063720366, + "grad_norm": 6.532529354095459, + "learning_rate": 4.989624132983898e-05, + "loss": 2.9678, + "step": 566 + }, + { + "epoch": 0.3793117865953088, + "grad_norm": 2.5950136184692383, + "learning_rate": 4.989446135195553e-05, + "loss": 2.9949, + "step": 567 + }, + { + "epoch": 0.37998076681858095, + "grad_norm": 4.946874618530273, + "learning_rate": 4.989266626819266e-05, + "loss": 2.8929, + "step": 568 + }, + { + "epoch": 0.38064974704185306, + "grad_norm": 3.0101711750030518, + "learning_rate": 4.989085607963965e-05, + "loss": 2.8305, + "step": 569 + }, + { + "epoch": 0.3813187272651252, + "grad_norm": 3.235175848007202, + "learning_rate": 4.988903078739491e-05, + "loss": 2.9382, + "step": 570 + }, + { + "epoch": 0.3819877074883974, + "grad_norm": 5.849483966827393, + "learning_rate": 4.988719039256601e-05, + "loss": 3.1833, + "step": 571 + }, + { + "epoch": 0.3826566877116695, + "grad_norm": 3.72119140625, + "learning_rate": 4.9885334896269707e-05, + "loss": 2.8637, + "step": 572 + }, + { + "epoch": 0.38332566793494166, + "grad_norm": 5.516154766082764, + "learning_rate": 4.988346429963191e-05, + "loss": 3.1849, + "step": 573 + }, + { + "epoch": 0.3839946481582138, + "grad_norm": 4.963644981384277, + "learning_rate": 4.9881578603787684e-05, + "loss": 3.0707, + "step": 574 + }, + { + "epoch": 0.384663628381486, + "grad_norm": 4.417355060577393, + "learning_rate": 4.987967780988126e-05, + "loss": 3.0208, + "step": 575 + }, + { + "epoch": 0.3853326086047581, + "grad_norm": 5.166300296783447, + "learning_rate": 4.9877761919066044e-05, + "loss": 3.1407, + "step": 576 + }, + { + "epoch": 0.38600158882803026, + "grad_norm": 5.213954925537109, + "learning_rate": 4.98758309325046e-05, + "loss": 3.0561, + "step": 577 + }, + { + "epoch": 0.3866705690513024, + "grad_norm": 4.354947566986084, + "learning_rate": 4.987388485136861e-05, + "loss": 2.8773, + "step": 578 + }, + { + "epoch": 0.3873395492745746, + "grad_norm": 3.1227710247039795, + "learning_rate": 4.9871923676838985e-05, + "loss": 2.8522, + "step": 579 + }, + { + "epoch": 0.3880085294978467, + "grad_norm": 4.111440658569336, + "learning_rate": 4.986994741010576e-05, + "loss": 3.0179, + "step": 580 + }, + { + "epoch": 0.38867750972111886, + "grad_norm": 6.039926052093506, + "learning_rate": 4.9867956052368094e-05, + "loss": 3.0055, + "step": 581 + }, + { + "epoch": 0.389346489944391, + "grad_norm": 3.7227399349212646, + "learning_rate": 4.986594960483436e-05, + "loss": 3.043, + "step": 582 + }, + { + "epoch": 0.3900154701676632, + "grad_norm": 4.943892002105713, + "learning_rate": 4.9863928068722065e-05, + "loss": 2.8667, + "step": 583 + }, + { + "epoch": 0.3906844503909353, + "grad_norm": 4.038351535797119, + "learning_rate": 4.986189144525787e-05, + "loss": 2.9695, + "step": 584 + }, + { + "epoch": 0.39135343061420746, + "grad_norm": 4.159579277038574, + "learning_rate": 4.9859839735677585e-05, + "loss": 3.0379, + "step": 585 + }, + { + "epoch": 0.39202241083747963, + "grad_norm": 3.6894772052764893, + "learning_rate": 4.9857772941226174e-05, + "loss": 2.8926, + "step": 586 + }, + { + "epoch": 0.3926913910607518, + "grad_norm": 3.6522088050842285, + "learning_rate": 4.9855691063157785e-05, + "loss": 3.0332, + "step": 587 + }, + { + "epoch": 0.3933603712840239, + "grad_norm": 5.894712448120117, + "learning_rate": 4.9853594102735674e-05, + "loss": 3.0254, + "step": 588 + }, + { + "epoch": 0.39402935150729607, + "grad_norm": 4.821498870849609, + "learning_rate": 4.985148206123228e-05, + "loss": 3.232, + "step": 589 + }, + { + "epoch": 0.39469833173056823, + "grad_norm": 4.994937419891357, + "learning_rate": 4.9849354939929177e-05, + "loss": 3.1466, + "step": 590 + }, + { + "epoch": 0.39536731195384034, + "grad_norm": 4.068504333496094, + "learning_rate": 4.98472127401171e-05, + "loss": 2.9745, + "step": 591 + }, + { + "epoch": 0.3960362921771125, + "grad_norm": 2.8930795192718506, + "learning_rate": 4.984505546309592e-05, + "loss": 3.0703, + "step": 592 + }, + { + "epoch": 0.39670527240038467, + "grad_norm": 4.69880485534668, + "learning_rate": 4.984288311017469e-05, + "loss": 2.9232, + "step": 593 + }, + { + "epoch": 0.39737425262365683, + "grad_norm": 2.9983294010162354, + "learning_rate": 4.9840695682671555e-05, + "loss": 2.9689, + "step": 594 + }, + { + "epoch": 0.39804323284692894, + "grad_norm": 2.860313653945923, + "learning_rate": 4.983849318191386e-05, + "loss": 2.9432, + "step": 595 + }, + { + "epoch": 0.3987122130702011, + "grad_norm": 2.924636125564575, + "learning_rate": 4.983627560923807e-05, + "loss": 2.8894, + "step": 596 + }, + { + "epoch": 0.39938119329347327, + "grad_norm": 4.200955867767334, + "learning_rate": 4.983404296598979e-05, + "loss": 2.9354, + "step": 597 + }, + { + "epoch": 0.40005017351674543, + "grad_norm": 3.3520474433898926, + "learning_rate": 4.9831795253523804e-05, + "loss": 2.8986, + "step": 598 + }, + { + "epoch": 0.40071915374001754, + "grad_norm": 3.76237416267395, + "learning_rate": 4.9829532473204e-05, + "loss": 2.9264, + "step": 599 + }, + { + "epoch": 0.4013881339632897, + "grad_norm": 4.730940818786621, + "learning_rate": 4.9827254626403433e-05, + "loss": 3.2129, + "step": 600 + }, + { + "epoch": 0.40205711418656187, + "grad_norm": 3.7802793979644775, + "learning_rate": 4.982496171450428e-05, + "loss": 2.8529, + "step": 601 + }, + { + "epoch": 0.40272609440983403, + "grad_norm": 3.456836462020874, + "learning_rate": 4.982265373889789e-05, + "loss": 2.9196, + "step": 602 + }, + { + "epoch": 0.40339507463310614, + "grad_norm": 3.481623411178589, + "learning_rate": 4.982033070098472e-05, + "loss": 2.7904, + "step": 603 + }, + { + "epoch": 0.4040640548563783, + "grad_norm": 3.897667169570923, + "learning_rate": 4.981799260217439e-05, + "loss": 3.1093, + "step": 604 + }, + { + "epoch": 0.40473303507965047, + "grad_norm": 3.733328104019165, + "learning_rate": 4.9815639443885656e-05, + "loss": 3.0469, + "step": 605 + }, + { + "epoch": 0.4054020153029226, + "grad_norm": 2.8006529808044434, + "learning_rate": 4.9813271227546396e-05, + "loss": 2.7381, + "step": 606 + }, + { + "epoch": 0.40607099552619474, + "grad_norm": 3.6003527641296387, + "learning_rate": 4.9810887954593655e-05, + "loss": 2.8827, + "step": 607 + }, + { + "epoch": 0.4067399757494669, + "grad_norm": 4.127907752990723, + "learning_rate": 4.980848962647356e-05, + "loss": 2.5829, + "step": 608 + }, + { + "epoch": 0.4074089559727391, + "grad_norm": 5.597202301025391, + "learning_rate": 4.980607624464145e-05, + "loss": 2.97, + "step": 609 + }, + { + "epoch": 0.4080779361960112, + "grad_norm": 3.340439558029175, + "learning_rate": 4.9803647810561735e-05, + "loss": 3.037, + "step": 610 + }, + { + "epoch": 0.40874691641928335, + "grad_norm": 3.1540212631225586, + "learning_rate": 4.980120432570797e-05, + "loss": 2.9054, + "step": 611 + }, + { + "epoch": 0.4094158966425555, + "grad_norm": 4.93973445892334, + "learning_rate": 4.9798745791562885e-05, + "loss": 2.9211, + "step": 612 + }, + { + "epoch": 0.4100848768658277, + "grad_norm": 2.2099862098693848, + "learning_rate": 4.9796272209618286e-05, + "loss": 2.6368, + "step": 613 + }, + { + "epoch": 0.4107538570890998, + "grad_norm": 4.601409912109375, + "learning_rate": 4.979378358137514e-05, + "loss": 2.8993, + "step": 614 + }, + { + "epoch": 0.41142283731237195, + "grad_norm": 5.1786675453186035, + "learning_rate": 4.979127990834354e-05, + "loss": 3.0772, + "step": 615 + }, + { + "epoch": 0.4120918175356441, + "grad_norm": 3.350377321243286, + "learning_rate": 4.978876119204271e-05, + "loss": 2.8451, + "step": 616 + }, + { + "epoch": 0.4127607977589163, + "grad_norm": 5.280319690704346, + "learning_rate": 4.9786227434000995e-05, + "loss": 3.1549, + "step": 617 + }, + { + "epoch": 0.4134297779821884, + "grad_norm": 4.424160003662109, + "learning_rate": 4.978367863575587e-05, + "loss": 2.9093, + "step": 618 + }, + { + "epoch": 0.41409875820546055, + "grad_norm": 7.253220558166504, + "learning_rate": 4.9781114798853945e-05, + "loss": 3.2784, + "step": 619 + }, + { + "epoch": 0.4147677384287327, + "grad_norm": 2.733386278152466, + "learning_rate": 4.977853592485094e-05, + "loss": 2.7615, + "step": 620 + }, + { + "epoch": 0.4154367186520049, + "grad_norm": 5.424954414367676, + "learning_rate": 4.9775942015311713e-05, + "loss": 3.0018, + "step": 621 + }, + { + "epoch": 0.416105698875277, + "grad_norm": 2.8420839309692383, + "learning_rate": 4.9773333071810244e-05, + "loss": 3.0407, + "step": 622 + }, + { + "epoch": 0.41677467909854915, + "grad_norm": 3.767392635345459, + "learning_rate": 4.977070909592961e-05, + "loss": 2.8276, + "step": 623 + }, + { + "epoch": 0.4174436593218213, + "grad_norm": 5.107291221618652, + "learning_rate": 4.976807008926206e-05, + "loss": 3.0319, + "step": 624 + }, + { + "epoch": 0.4181126395450934, + "grad_norm": 3.7375893592834473, + "learning_rate": 4.976541605340892e-05, + "loss": 2.961, + "step": 625 + }, + { + "epoch": 0.4187816197683656, + "grad_norm": 8.229865074157715, + "learning_rate": 4.976274698998065e-05, + "loss": 2.9438, + "step": 626 + }, + { + "epoch": 0.41945059999163775, + "grad_norm": 4.140076160430908, + "learning_rate": 4.9760062900596825e-05, + "loss": 3.0025, + "step": 627 + }, + { + "epoch": 0.4201195802149099, + "grad_norm": 4.103034496307373, + "learning_rate": 4.9757363786886145e-05, + "loss": 2.864, + "step": 628 + }, + { + "epoch": 0.420788560438182, + "grad_norm": 3.0549144744873047, + "learning_rate": 4.975464965048644e-05, + "loss": 2.9363, + "step": 629 + }, + { + "epoch": 0.4214575406614542, + "grad_norm": 5.202071189880371, + "learning_rate": 4.975192049304461e-05, + "loss": 2.9154, + "step": 630 + }, + { + "epoch": 0.42212652088472635, + "grad_norm": 3.9111521244049072, + "learning_rate": 4.974917631621673e-05, + "loss": 2.7795, + "step": 631 + }, + { + "epoch": 0.4227955011079985, + "grad_norm": 5.800964832305908, + "learning_rate": 4.974641712166793e-05, + "loss": 2.85, + "step": 632 + }, + { + "epoch": 0.4234644813312706, + "grad_norm": 5.897298336029053, + "learning_rate": 4.97436429110725e-05, + "loss": 2.5939, + "step": 633 + }, + { + "epoch": 0.4241334615545428, + "grad_norm": 3.8999950885772705, + "learning_rate": 4.974085368611381e-05, + "loss": 2.8274, + "step": 634 + }, + { + "epoch": 0.42480244177781495, + "grad_norm": 3.5932559967041016, + "learning_rate": 4.973804944848437e-05, + "loss": 2.7909, + "step": 635 + }, + { + "epoch": 0.4254714220010871, + "grad_norm": 5.648451328277588, + "learning_rate": 4.973523019988575e-05, + "loss": 3.0426, + "step": 636 + }, + { + "epoch": 0.4261404022243592, + "grad_norm": 4.077558517456055, + "learning_rate": 4.973239594202869e-05, + "loss": 2.8152, + "step": 637 + }, + { + "epoch": 0.4268093824476314, + "grad_norm": 3.34771728515625, + "learning_rate": 4.9729546676633e-05, + "loss": 2.7797, + "step": 638 + }, + { + "epoch": 0.42747836267090356, + "grad_norm": 4.4564690589904785, + "learning_rate": 4.972668240542761e-05, + "loss": 2.7314, + "step": 639 + }, + { + "epoch": 0.42814734289417566, + "grad_norm": 4.631012916564941, + "learning_rate": 4.972380313015054e-05, + "loss": 3.1524, + "step": 640 + }, + { + "epoch": 0.42881632311744783, + "grad_norm": 10.560107231140137, + "learning_rate": 4.972090885254893e-05, + "loss": 3.1603, + "step": 641 + }, + { + "epoch": 0.42948530334072, + "grad_norm": 3.9643378257751465, + "learning_rate": 4.9717999574379016e-05, + "loss": 2.7712, + "step": 642 + }, + { + "epoch": 0.43015428356399216, + "grad_norm": 3.4705636501312256, + "learning_rate": 4.971507529740614e-05, + "loss": 2.8451, + "step": 643 + }, + { + "epoch": 0.43082326378726427, + "grad_norm": 4.629807949066162, + "learning_rate": 4.971213602340475e-05, + "loss": 2.7529, + "step": 644 + }, + { + "epoch": 0.43149224401053643, + "grad_norm": 4.530109405517578, + "learning_rate": 4.970918175415838e-05, + "loss": 2.9651, + "step": 645 + }, + { + "epoch": 0.4321612242338086, + "grad_norm": 4.763259410858154, + "learning_rate": 4.9706212491459685e-05, + "loss": 2.8411, + "step": 646 + }, + { + "epoch": 0.43283020445708076, + "grad_norm": 3.4377224445343018, + "learning_rate": 4.9703228237110386e-05, + "loss": 2.9072, + "step": 647 + }, + { + "epoch": 0.43349918468035287, + "grad_norm": 9.420544624328613, + "learning_rate": 4.970022899292133e-05, + "loss": 3.0431, + "step": 648 + }, + { + "epoch": 0.43416816490362503, + "grad_norm": 5.205924034118652, + "learning_rate": 4.969721476071244e-05, + "loss": 2.9704, + "step": 649 + }, + { + "epoch": 0.4348371451268972, + "grad_norm": 3.097839832305908, + "learning_rate": 4.9694185542312745e-05, + "loss": 2.8079, + "step": 650 + }, + { + "epoch": 0.43550612535016936, + "grad_norm": 4.139601707458496, + "learning_rate": 4.969114133956037e-05, + "loss": 2.8865, + "step": 651 + }, + { + "epoch": 0.43617510557344147, + "grad_norm": 6.326453685760498, + "learning_rate": 4.968808215430253e-05, + "loss": 2.8658, + "step": 652 + }, + { + "epoch": 0.43684408579671363, + "grad_norm": 3.709042549133301, + "learning_rate": 4.968500798839552e-05, + "loss": 2.9125, + "step": 653 + }, + { + "epoch": 0.4375130660199858, + "grad_norm": 4.608620643615723, + "learning_rate": 4.968191884370474e-05, + "loss": 2.859, + "step": 654 + }, + { + "epoch": 0.43818204624325796, + "grad_norm": 5.535547733306885, + "learning_rate": 4.967881472210467e-05, + "loss": 2.9336, + "step": 655 + }, + { + "epoch": 0.43885102646653007, + "grad_norm": 4.740244388580322, + "learning_rate": 4.967569562547888e-05, + "loss": 3.0713, + "step": 656 + }, + { + "epoch": 0.43952000668980223, + "grad_norm": 4.269866466522217, + "learning_rate": 4.967256155572003e-05, + "loss": 3.065, + "step": 657 + }, + { + "epoch": 0.4401889869130744, + "grad_norm": 4.712810516357422, + "learning_rate": 4.966941251472986e-05, + "loss": 3.1094, + "step": 658 + }, + { + "epoch": 0.4408579671363465, + "grad_norm": 4.342670917510986, + "learning_rate": 4.966624850441921e-05, + "loss": 3.1203, + "step": 659 + }, + { + "epoch": 0.44152694735961867, + "grad_norm": 3.0770020484924316, + "learning_rate": 4.966306952670798e-05, + "loss": 2.9101, + "step": 660 + }, + { + "epoch": 0.44219592758289084, + "grad_norm": 4.738711357116699, + "learning_rate": 4.9659875583525165e-05, + "loss": 2.8037, + "step": 661 + }, + { + "epoch": 0.442864907806163, + "grad_norm": 4.434926986694336, + "learning_rate": 4.9656666676808844e-05, + "loss": 2.6684, + "step": 662 + }, + { + "epoch": 0.4435338880294351, + "grad_norm": 2.662122964859009, + "learning_rate": 4.9653442808506165e-05, + "loss": 2.8781, + "step": 663 + }, + { + "epoch": 0.4442028682527073, + "grad_norm": 4.463600158691406, + "learning_rate": 4.965020398057336e-05, + "loss": 3.0432, + "step": 664 + }, + { + "epoch": 0.44487184847597944, + "grad_norm": 5.810666084289551, + "learning_rate": 4.964695019497575e-05, + "loss": 2.9434, + "step": 665 + }, + { + "epoch": 0.4455408286992516, + "grad_norm": 4.265758037567139, + "learning_rate": 4.964368145368772e-05, + "loss": 2.8055, + "step": 666 + }, + { + "epoch": 0.4462098089225237, + "grad_norm": 3.7653000354766846, + "learning_rate": 4.9640397758692715e-05, + "loss": 2.7077, + "step": 667 + }, + { + "epoch": 0.4468787891457959, + "grad_norm": 5.089723110198975, + "learning_rate": 4.963709911198329e-05, + "loss": 3.1039, + "step": 668 + }, + { + "epoch": 0.44754776936906804, + "grad_norm": 3.1420540809631348, + "learning_rate": 4.9633785515561035e-05, + "loss": 2.822, + "step": 669 + }, + { + "epoch": 0.4482167495923402, + "grad_norm": 7.582685470581055, + "learning_rate": 4.9630456971436654e-05, + "loss": 2.915, + "step": 670 + }, + { + "epoch": 0.4488857298156123, + "grad_norm": 4.884276866912842, + "learning_rate": 4.962711348162987e-05, + "loss": 2.8379, + "step": 671 + }, + { + "epoch": 0.4495547100388845, + "grad_norm": 4.963716983795166, + "learning_rate": 4.962375504816953e-05, + "loss": 2.8788, + "step": 672 + }, + { + "epoch": 0.45022369026215664, + "grad_norm": 6.727871417999268, + "learning_rate": 4.9620381673093496e-05, + "loss": 2.9308, + "step": 673 + }, + { + "epoch": 0.45089267048542875, + "grad_norm": 3.4183032512664795, + "learning_rate": 4.9616993358448734e-05, + "loss": 2.7673, + "step": 674 + }, + { + "epoch": 0.4515616507087009, + "grad_norm": 5.130782604217529, + "learning_rate": 4.9613590106291266e-05, + "loss": 2.746, + "step": 675 + }, + { + "epoch": 0.4522306309319731, + "grad_norm": 5.305598735809326, + "learning_rate": 4.9610171918686157e-05, + "loss": 3.1816, + "step": 676 + }, + { + "epoch": 0.45289961115524524, + "grad_norm": 5.105145454406738, + "learning_rate": 4.960673879770757e-05, + "loss": 2.9757, + "step": 677 + }, + { + "epoch": 0.45356859137851735, + "grad_norm": 4.758883953094482, + "learning_rate": 4.9603290745438703e-05, + "loss": 2.7821, + "step": 678 + }, + { + "epoch": 0.4542375716017895, + "grad_norm": 4.063161373138428, + "learning_rate": 4.9599827763971825e-05, + "loss": 3.003, + "step": 679 + }, + { + "epoch": 0.4549065518250617, + "grad_norm": 4.086067199707031, + "learning_rate": 4.9596349855408266e-05, + "loss": 3.0175, + "step": 680 + }, + { + "epoch": 0.45557553204833384, + "grad_norm": 3.486377477645874, + "learning_rate": 4.9592857021858397e-05, + "loss": 2.9731, + "step": 681 + }, + { + "epoch": 0.45624451227160595, + "grad_norm": 4.757167816162109, + "learning_rate": 4.958934926544168e-05, + "loss": 3.0288, + "step": 682 + }, + { + "epoch": 0.4569134924948781, + "grad_norm": 4.748201370239258, + "learning_rate": 4.958582658828659e-05, + "loss": 2.8637, + "step": 683 + }, + { + "epoch": 0.4575824727181503, + "grad_norm": 5.692895889282227, + "learning_rate": 4.958228899253068e-05, + "loss": 3.1108, + "step": 684 + }, + { + "epoch": 0.45825145294142244, + "grad_norm": 4.517188549041748, + "learning_rate": 4.957873648032056e-05, + "loss": 2.9381, + "step": 685 + }, + { + "epoch": 0.45892043316469455, + "grad_norm": 7.702240467071533, + "learning_rate": 4.9575169053811876e-05, + "loss": 3.1256, + "step": 686 + }, + { + "epoch": 0.4595894133879667, + "grad_norm": 6.084390640258789, + "learning_rate": 4.957158671516934e-05, + "loss": 2.9819, + "step": 687 + }, + { + "epoch": 0.4602583936112389, + "grad_norm": 5.609198570251465, + "learning_rate": 4.9567989466566686e-05, + "loss": 3.296, + "step": 688 + }, + { + "epoch": 0.46092737383451104, + "grad_norm": 4.34131383895874, + "learning_rate": 4.956437731018674e-05, + "loss": 2.8, + "step": 689 + }, + { + "epoch": 0.46159635405778315, + "grad_norm": 4.470595359802246, + "learning_rate": 4.956075024822132e-05, + "loss": 2.9731, + "step": 690 + }, + { + "epoch": 0.4622653342810553, + "grad_norm": 4.090426445007324, + "learning_rate": 4.955710828287133e-05, + "loss": 2.7561, + "step": 691 + }, + { + "epoch": 0.4629343145043275, + "grad_norm": 4.911933422088623, + "learning_rate": 4.95534514163467e-05, + "loss": 2.9921, + "step": 692 + }, + { + "epoch": 0.4636032947275996, + "grad_norm": 3.661410331726074, + "learning_rate": 4.95497796508664e-05, + "loss": 2.9448, + "step": 693 + }, + { + "epoch": 0.46427227495087176, + "grad_norm": 3.1984846591949463, + "learning_rate": 4.9546092988658457e-05, + "loss": 2.7455, + "step": 694 + }, + { + "epoch": 0.4649412551741439, + "grad_norm": 3.988065242767334, + "learning_rate": 4.954239143195991e-05, + "loss": 2.7055, + "step": 695 + }, + { + "epoch": 0.4656102353974161, + "grad_norm": 4.223339080810547, + "learning_rate": 4.9538674983016866e-05, + "loss": 2.7932, + "step": 696 + }, + { + "epoch": 0.4662792156206882, + "grad_norm": 2.925285816192627, + "learning_rate": 4.953494364408445e-05, + "loss": 3.1801, + "step": 697 + }, + { + "epoch": 0.46694819584396036, + "grad_norm": 3.728097915649414, + "learning_rate": 4.9531197417426825e-05, + "loss": 3.066, + "step": 698 + }, + { + "epoch": 0.4676171760672325, + "grad_norm": 3.7567453384399414, + "learning_rate": 4.9527436305317195e-05, + "loss": 2.7462, + "step": 699 + }, + { + "epoch": 0.4682861562905047, + "grad_norm": 4.21150541305542, + "learning_rate": 4.952366031003778e-05, + "loss": 2.9276, + "step": 700 + }, + { + "epoch": 0.4689551365137768, + "grad_norm": 4.783014297485352, + "learning_rate": 4.951986943387984e-05, + "loss": 3.1362, + "step": 701 + }, + { + "epoch": 0.46962411673704896, + "grad_norm": 3.026461601257324, + "learning_rate": 4.951606367914369e-05, + "loss": 3.0478, + "step": 702 + }, + { + "epoch": 0.4702930969603211, + "grad_norm": 5.113108158111572, + "learning_rate": 4.951224304813862e-05, + "loss": 3.2145, + "step": 703 + }, + { + "epoch": 0.4709620771835933, + "grad_norm": 3.4587903022766113, + "learning_rate": 4.950840754318299e-05, + "loss": 2.9438, + "step": 704 + }, + { + "epoch": 0.4716310574068654, + "grad_norm": 3.0687296390533447, + "learning_rate": 4.950455716660418e-05, + "loss": 2.8577, + "step": 705 + }, + { + "epoch": 0.47230003763013756, + "grad_norm": 3.2031595706939697, + "learning_rate": 4.950069192073857e-05, + "loss": 3.0442, + "step": 706 + }, + { + "epoch": 0.4729690178534097, + "grad_norm": 3.09106183052063, + "learning_rate": 4.9496811807931596e-05, + "loss": 3.0287, + "step": 707 + }, + { + "epoch": 0.47363799807668183, + "grad_norm": 3.3463358879089355, + "learning_rate": 4.949291683053769e-05, + "loss": 2.9155, + "step": 708 + }, + { + "epoch": 0.474306978299954, + "grad_norm": 3.321120023727417, + "learning_rate": 4.948900699092031e-05, + "loss": 2.8729, + "step": 709 + }, + { + "epoch": 0.47497595852322616, + "grad_norm": 4.3413286209106445, + "learning_rate": 4.948508229145194e-05, + "loss": 2.813, + "step": 710 + }, + { + "epoch": 0.4756449387464983, + "grad_norm": 2.997468948364258, + "learning_rate": 4.948114273451405e-05, + "loss": 2.9919, + "step": 711 + }, + { + "epoch": 0.47631391896977043, + "grad_norm": 4.761056900024414, + "learning_rate": 4.947718832249719e-05, + "loss": 3.0345, + "step": 712 + }, + { + "epoch": 0.4769828991930426, + "grad_norm": 6.336904048919678, + "learning_rate": 4.9473219057800855e-05, + "loss": 3.2108, + "step": 713 + }, + { + "epoch": 0.47765187941631476, + "grad_norm": 2.752504348754883, + "learning_rate": 4.94692349428336e-05, + "loss": 2.7977, + "step": 714 + }, + { + "epoch": 0.4783208596395869, + "grad_norm": 4.846428394317627, + "learning_rate": 4.9465235980012964e-05, + "loss": 2.8033, + "step": 715 + }, + { + "epoch": 0.47898983986285903, + "grad_norm": 5.727593421936035, + "learning_rate": 4.946122217176551e-05, + "loss": 2.9128, + "step": 716 + }, + { + "epoch": 0.4796588200861312, + "grad_norm": 4.22546911239624, + "learning_rate": 4.945719352052679e-05, + "loss": 2.9922, + "step": 717 + }, + { + "epoch": 0.48032780030940336, + "grad_norm": 3.6872398853302, + "learning_rate": 4.94531500287414e-05, + "loss": 2.7136, + "step": 718 + }, + { + "epoch": 0.4809967805326755, + "grad_norm": 5.122034072875977, + "learning_rate": 4.94490916988629e-05, + "loss": 3.047, + "step": 719 + }, + { + "epoch": 0.48166576075594764, + "grad_norm": 3.345089912414551, + "learning_rate": 4.944501853335387e-05, + "loss": 2.8755, + "step": 720 + }, + { + "epoch": 0.4823347409792198, + "grad_norm": 3.690261125564575, + "learning_rate": 4.9440930534685914e-05, + "loss": 2.9244, + "step": 721 + }, + { + "epoch": 0.48300372120249196, + "grad_norm": 3.8499228954315186, + "learning_rate": 4.9436827705339597e-05, + "loss": 2.9331, + "step": 722 + }, + { + "epoch": 0.4836727014257641, + "grad_norm": 5.345366477966309, + "learning_rate": 4.94327100478045e-05, + "loss": 2.984, + "step": 723 + }, + { + "epoch": 0.48434168164903624, + "grad_norm": 3.7239580154418945, + "learning_rate": 4.9428577564579227e-05, + "loss": 3.0778, + "step": 724 + }, + { + "epoch": 0.4850106618723084, + "grad_norm": 4.34011173248291, + "learning_rate": 4.942443025817133e-05, + "loss": 3.0008, + "step": 725 + }, + { + "epoch": 0.48567964209558057, + "grad_norm": 5.08994197845459, + "learning_rate": 4.94202681310974e-05, + "loss": 3.0557, + "step": 726 + }, + { + "epoch": 0.4863486223188527, + "grad_norm": 5.373446464538574, + "learning_rate": 4.9416091185883e-05, + "loss": 3.2981, + "step": 727 + }, + { + "epoch": 0.48701760254212484, + "grad_norm": 3.0065951347351074, + "learning_rate": 4.9411899425062665e-05, + "loss": 2.9907, + "step": 728 + }, + { + "epoch": 0.487686582765397, + "grad_norm": 3.025444269180298, + "learning_rate": 4.9407692851179976e-05, + "loss": 3.0356, + "step": 729 + }, + { + "epoch": 0.48835556298866917, + "grad_norm": 2.8892788887023926, + "learning_rate": 4.9403471466787446e-05, + "loss": 2.8412, + "step": 730 + }, + { + "epoch": 0.4890245432119413, + "grad_norm": 3.6930127143859863, + "learning_rate": 4.93992352744466e-05, + "loss": 2.8324, + "step": 731 + }, + { + "epoch": 0.48969352343521344, + "grad_norm": 5.3692193031311035, + "learning_rate": 4.9394984276727954e-05, + "loss": 3.2483, + "step": 732 + }, + { + "epoch": 0.4903625036584856, + "grad_norm": 3.029644727706909, + "learning_rate": 4.9390718476210994e-05, + "loss": 2.7399, + "step": 733 + }, + { + "epoch": 0.49103148388175777, + "grad_norm": 4.040469646453857, + "learning_rate": 4.9386437875484194e-05, + "loss": 2.9284, + "step": 734 + }, + { + "epoch": 0.4917004641050299, + "grad_norm": 5.720209121704102, + "learning_rate": 4.938214247714501e-05, + "loss": 2.7779, + "step": 735 + }, + { + "epoch": 0.49236944432830204, + "grad_norm": 3.9115052223205566, + "learning_rate": 4.937783228379988e-05, + "loss": 2.7891, + "step": 736 + }, + { + "epoch": 0.4930384245515742, + "grad_norm": 5.581562042236328, + "learning_rate": 4.937350729806421e-05, + "loss": 3.0304, + "step": 737 + }, + { + "epoch": 0.49370740477484637, + "grad_norm": 3.564924955368042, + "learning_rate": 4.9369167522562385e-05, + "loss": 2.9908, + "step": 738 + }, + { + "epoch": 0.4943763849981185, + "grad_norm": 6.215388774871826, + "learning_rate": 4.9364812959927773e-05, + "loss": 3.0139, + "step": 739 + }, + { + "epoch": 0.49504536522139064, + "grad_norm": 3.3125667572021484, + "learning_rate": 4.936044361280271e-05, + "loss": 2.7297, + "step": 740 + }, + { + "epoch": 0.4957143454446628, + "grad_norm": 3.6967480182647705, + "learning_rate": 4.9356059483838495e-05, + "loss": 3.0106, + "step": 741 + }, + { + "epoch": 0.4963833256679349, + "grad_norm": 3.230792760848999, + "learning_rate": 4.935166057569541e-05, + "loss": 2.9077, + "step": 742 + }, + { + "epoch": 0.4970523058912071, + "grad_norm": 4.57341194152832, + "learning_rate": 4.9347246891042685e-05, + "loss": 3.0735, + "step": 743 + }, + { + "epoch": 0.49772128611447924, + "grad_norm": 2.9976770877838135, + "learning_rate": 4.934281843255855e-05, + "loss": 2.9161, + "step": 744 + }, + { + "epoch": 0.4983902663377514, + "grad_norm": 6.950145244598389, + "learning_rate": 4.933837520293017e-05, + "loss": 2.9473, + "step": 745 + }, + { + "epoch": 0.4990592465610235, + "grad_norm": 3.205528497695923, + "learning_rate": 4.933391720485368e-05, + "loss": 3.0744, + "step": 746 + }, + { + "epoch": 0.4997282267842957, + "grad_norm": 5.989559173583984, + "learning_rate": 4.932944444103418e-05, + "loss": 3.0021, + "step": 747 + }, + { + "epoch": 0.5003972070075678, + "grad_norm": 9.719673156738281, + "learning_rate": 4.9324956914185725e-05, + "loss": 3.17, + "step": 748 + }, + { + "epoch": 0.50106618723084, + "grad_norm": 3.216552972793579, + "learning_rate": 4.932045462703134e-05, + "loss": 2.7118, + "step": 749 + }, + { + "epoch": 0.5017351674541122, + "grad_norm": 3.494694471359253, + "learning_rate": 4.9315937582303e-05, + "loss": 2.9572, + "step": 750 + }, + { + "epoch": 0.5024041476773843, + "grad_norm": 3.0318126678466797, + "learning_rate": 4.931140578274162e-05, + "loss": 3.0735, + "step": 751 + }, + { + "epoch": 0.5030731279006564, + "grad_norm": 5.392656326293945, + "learning_rate": 4.930685923109709e-05, + "loss": 2.7952, + "step": 752 + }, + { + "epoch": 0.5037421081239286, + "grad_norm": 2.4600045680999756, + "learning_rate": 4.930229793012825e-05, + "loss": 2.7525, + "step": 753 + }, + { + "epoch": 0.5044110883472007, + "grad_norm": 3.9268031120300293, + "learning_rate": 4.929772188260287e-05, + "loss": 2.872, + "step": 754 + }, + { + "epoch": 0.5050800685704729, + "grad_norm": 4.539796829223633, + "learning_rate": 4.9293131091297686e-05, + "loss": 2.8895, + "step": 755 + }, + { + "epoch": 0.505749048793745, + "grad_norm": 2.849412202835083, + "learning_rate": 4.928852555899838e-05, + "loss": 2.8359, + "step": 756 + }, + { + "epoch": 0.5064180290170172, + "grad_norm": 5.098194122314453, + "learning_rate": 4.928390528849957e-05, + "loss": 2.9828, + "step": 757 + }, + { + "epoch": 0.5070870092402894, + "grad_norm": 6.503052711486816, + "learning_rate": 4.927927028260482e-05, + "loss": 2.9446, + "step": 758 + }, + { + "epoch": 0.5077559894635615, + "grad_norm": 4.589889049530029, + "learning_rate": 4.9274620544126625e-05, + "loss": 2.7252, + "step": 759 + }, + { + "epoch": 0.5084249696868336, + "grad_norm": 6.710662841796875, + "learning_rate": 4.926995607588646e-05, + "loss": 3.0542, + "step": 760 + }, + { + "epoch": 0.5090939499101058, + "grad_norm": 3.628431558609009, + "learning_rate": 4.9265276880714696e-05, + "loss": 2.9673, + "step": 761 + }, + { + "epoch": 0.5097629301333779, + "grad_norm": 4.540585517883301, + "learning_rate": 4.9260582961450644e-05, + "loss": 2.9251, + "step": 762 + }, + { + "epoch": 0.51043191035665, + "grad_norm": 5.215747833251953, + "learning_rate": 4.9255874320942565e-05, + "loss": 2.8115, + "step": 763 + }, + { + "epoch": 0.5111008905799223, + "grad_norm": 4.2205424308776855, + "learning_rate": 4.925115096204765e-05, + "loss": 3.0809, + "step": 764 + }, + { + "epoch": 0.5117698708031944, + "grad_norm": 4.905090808868408, + "learning_rate": 4.924641288763202e-05, + "loss": 2.8316, + "step": 765 + }, + { + "epoch": 0.5124388510264666, + "grad_norm": 4.075945854187012, + "learning_rate": 4.924166010057072e-05, + "loss": 3.1006, + "step": 766 + }, + { + "epoch": 0.5131078312497387, + "grad_norm": 6.7424116134643555, + "learning_rate": 4.9236892603747725e-05, + "loss": 3.0465, + "step": 767 + }, + { + "epoch": 0.5137768114730108, + "grad_norm": 3.8477931022644043, + "learning_rate": 4.9232110400055944e-05, + "loss": 2.8483, + "step": 768 + }, + { + "epoch": 0.514445791696283, + "grad_norm": 4.381292819976807, + "learning_rate": 4.9227313492397184e-05, + "loss": 3.0262, + "step": 769 + }, + { + "epoch": 0.5151147719195551, + "grad_norm": 3.9413158893585205, + "learning_rate": 4.9222501883682214e-05, + "loss": 2.927, + "step": 770 + }, + { + "epoch": 0.5157837521428272, + "grad_norm": 3.5212087631225586, + "learning_rate": 4.921767557683069e-05, + "loss": 2.7958, + "step": 771 + }, + { + "epoch": 0.5164527323660995, + "grad_norm": 4.910702228546143, + "learning_rate": 4.921283457477121e-05, + "loss": 3.0121, + "step": 772 + }, + { + "epoch": 0.5171217125893716, + "grad_norm": 3.43432354927063, + "learning_rate": 4.9207978880441275e-05, + "loss": 2.9094, + "step": 773 + }, + { + "epoch": 0.5177906928126437, + "grad_norm": 4.514383792877197, + "learning_rate": 4.9203108496787295e-05, + "loss": 2.9786, + "step": 774 + }, + { + "epoch": 0.5184596730359159, + "grad_norm": 3.666508913040161, + "learning_rate": 4.919822342676461e-05, + "loss": 2.9302, + "step": 775 + }, + { + "epoch": 0.519128653259188, + "grad_norm": 3.301198720932007, + "learning_rate": 4.9193323673337476e-05, + "loss": 3.1581, + "step": 776 + }, + { + "epoch": 0.5197976334824602, + "grad_norm": 5.317511558532715, + "learning_rate": 4.9188409239479026e-05, + "loss": 3.1275, + "step": 777 + }, + { + "epoch": 0.5204666137057323, + "grad_norm": 4.472630023956299, + "learning_rate": 4.9183480128171345e-05, + "loss": 3.1681, + "step": 778 + }, + { + "epoch": 0.5211355939290044, + "grad_norm": 3.865774393081665, + "learning_rate": 4.917853634240538e-05, + "loss": 2.9445, + "step": 779 + }, + { + "epoch": 0.5218045741522767, + "grad_norm": 2.5969860553741455, + "learning_rate": 4.9173577885181024e-05, + "loss": 2.7801, + "step": 780 + }, + { + "epoch": 0.5224735543755488, + "grad_norm": 3.0315206050872803, + "learning_rate": 4.916860475950704e-05, + "loss": 2.7403, + "step": 781 + }, + { + "epoch": 0.5231425345988209, + "grad_norm": 4.092448711395264, + "learning_rate": 4.91636169684011e-05, + "loss": 2.8117, + "step": 782 + }, + { + "epoch": 0.5238115148220931, + "grad_norm": 4.825198173522949, + "learning_rate": 4.9158614514889806e-05, + "loss": 2.9907, + "step": 783 + }, + { + "epoch": 0.5244804950453652, + "grad_norm": 4.413206577301025, + "learning_rate": 4.915359740200861e-05, + "loss": 3.1183, + "step": 784 + }, + { + "epoch": 0.5251494752686374, + "grad_norm": 5.487285614013672, + "learning_rate": 4.914856563280187e-05, + "loss": 3.0364, + "step": 785 + }, + { + "epoch": 0.5258184554919095, + "grad_norm": 3.3987667560577393, + "learning_rate": 4.9143519210322875e-05, + "loss": 2.7744, + "step": 786 + }, + { + "epoch": 0.5264874357151816, + "grad_norm": 3.7231040000915527, + "learning_rate": 4.9138458137633756e-05, + "loss": 3.0891, + "step": 787 + }, + { + "epoch": 0.5271564159384539, + "grad_norm": 10.888092041015625, + "learning_rate": 4.913338241780557e-05, + "loss": 2.7896, + "step": 788 + }, + { + "epoch": 0.527825396161726, + "grad_norm": 4.869259357452393, + "learning_rate": 4.9128292053918235e-05, + "loss": 2.8969, + "step": 789 + }, + { + "epoch": 0.5284943763849981, + "grad_norm": 5.367812156677246, + "learning_rate": 4.9123187049060584e-05, + "loss": 3.0896, + "step": 790 + }, + { + "epoch": 0.5291633566082703, + "grad_norm": 2.8876943588256836, + "learning_rate": 4.911806740633029e-05, + "loss": 2.9287, + "step": 791 + }, + { + "epoch": 0.5298323368315424, + "grad_norm": 4.382396697998047, + "learning_rate": 4.9112933128833974e-05, + "loss": 2.9172, + "step": 792 + }, + { + "epoch": 0.5305013170548145, + "grad_norm": 5.258941173553467, + "learning_rate": 4.9107784219687055e-05, + "loss": 2.9639, + "step": 793 + }, + { + "epoch": 0.5311702972780867, + "grad_norm": 4.112799167633057, + "learning_rate": 4.9102620682013915e-05, + "loss": 3.0852, + "step": 794 + }, + { + "epoch": 0.5318392775013588, + "grad_norm": 3.10772442817688, + "learning_rate": 4.909744251894775e-05, + "loss": 2.8169, + "step": 795 + }, + { + "epoch": 0.5325082577246311, + "grad_norm": 5.637611389160156, + "learning_rate": 4.9092249733630656e-05, + "loss": 3.0063, + "step": 796 + }, + { + "epoch": 0.5331772379479032, + "grad_norm": 4.501651287078857, + "learning_rate": 4.9087042329213606e-05, + "loss": 2.6273, + "step": 797 + }, + { + "epoch": 0.5338462181711753, + "grad_norm": 5.757697582244873, + "learning_rate": 4.9081820308856425e-05, + "loss": 3.2429, + "step": 798 + }, + { + "epoch": 0.5345151983944475, + "grad_norm": 3.935551404953003, + "learning_rate": 4.907658367572783e-05, + "loss": 2.7728, + "step": 799 + }, + { + "epoch": 0.5351841786177196, + "grad_norm": 3.3114700317382812, + "learning_rate": 4.907133243300538e-05, + "loss": 2.8407, + "step": 800 + }, + { + "epoch": 0.5358531588409917, + "grad_norm": 9.505463600158691, + "learning_rate": 4.906606658387551e-05, + "loss": 3.1491, + "step": 801 + }, + { + "epoch": 0.5365221390642639, + "grad_norm": 8.680765151977539, + "learning_rate": 4.906078613153354e-05, + "loss": 3.1871, + "step": 802 + }, + { + "epoch": 0.537191119287536, + "grad_norm": 5.7888031005859375, + "learning_rate": 4.905549107918362e-05, + "loss": 3.0384, + "step": 803 + }, + { + "epoch": 0.5378600995108083, + "grad_norm": 6.471538066864014, + "learning_rate": 4.905018143003878e-05, + "loss": 2.8996, + "step": 804 + }, + { + "epoch": 0.5385290797340804, + "grad_norm": 3.8376762866973877, + "learning_rate": 4.904485718732088e-05, + "loss": 3.2006, + "step": 805 + }, + { + "epoch": 0.5391980599573525, + "grad_norm": 7.126554012298584, + "learning_rate": 4.9039518354260674e-05, + "loss": 3.0624, + "step": 806 + }, + { + "epoch": 0.5398670401806247, + "grad_norm": 3.2664778232574463, + "learning_rate": 4.903416493409772e-05, + "loss": 2.6461, + "step": 807 + }, + { + "epoch": 0.5405360204038968, + "grad_norm": 4.890481948852539, + "learning_rate": 4.902879693008049e-05, + "loss": 2.9073, + "step": 808 + }, + { + "epoch": 0.5412050006271689, + "grad_norm": 4.069634437561035, + "learning_rate": 4.902341434546626e-05, + "loss": 2.806, + "step": 809 + }, + { + "epoch": 0.5418739808504411, + "grad_norm": 3.5017058849334717, + "learning_rate": 4.901801718352115e-05, + "loss": 2.8788, + "step": 810 + }, + { + "epoch": 0.5425429610737132, + "grad_norm": 7.153815269470215, + "learning_rate": 4.901260544752015e-05, + "loss": 3.0846, + "step": 811 + }, + { + "epoch": 0.5432119412969854, + "grad_norm": 5.80309534072876, + "learning_rate": 4.90071791407471e-05, + "loss": 3.0031, + "step": 812 + }, + { + "epoch": 0.5438809215202576, + "grad_norm": 5.59669828414917, + "learning_rate": 4.900173826649464e-05, + "loss": 3.1324, + "step": 813 + }, + { + "epoch": 0.5445499017435297, + "grad_norm": 5.2630486488342285, + "learning_rate": 4.899628282806428e-05, + "loss": 3.0937, + "step": 814 + }, + { + "epoch": 0.5452188819668019, + "grad_norm": 7.03258752822876, + "learning_rate": 4.8990812828766375e-05, + "loss": 2.9336, + "step": 815 + }, + { + "epoch": 0.545887862190074, + "grad_norm": 5.068579196929932, + "learning_rate": 4.8985328271920104e-05, + "loss": 3.3006, + "step": 816 + }, + { + "epoch": 0.5465568424133461, + "grad_norm": 4.967462062835693, + "learning_rate": 4.897982916085346e-05, + "loss": 2.7347, + "step": 817 + }, + { + "epoch": 0.5472258226366183, + "grad_norm": 4.353154182434082, + "learning_rate": 4.897431549890331e-05, + "loss": 2.9929, + "step": 818 + }, + { + "epoch": 0.5478948028598905, + "grad_norm": 4.147519588470459, + "learning_rate": 4.896878728941531e-05, + "loss": 2.8716, + "step": 819 + }, + { + "epoch": 0.5485637830831626, + "grad_norm": 3.507474422454834, + "learning_rate": 4.8963244535743954e-05, + "loss": 2.7667, + "step": 820 + }, + { + "epoch": 0.5492327633064348, + "grad_norm": 3.999964952468872, + "learning_rate": 4.895768724125259e-05, + "loss": 2.8255, + "step": 821 + }, + { + "epoch": 0.5499017435297069, + "grad_norm": 4.473382949829102, + "learning_rate": 4.895211540931335e-05, + "loss": 3.0781, + "step": 822 + }, + { + "epoch": 0.5505707237529791, + "grad_norm": 4.881600379943848, + "learning_rate": 4.894652904330721e-05, + "loss": 2.8138, + "step": 823 + }, + { + "epoch": 0.5512397039762512, + "grad_norm": 4.237582206726074, + "learning_rate": 4.894092814662395e-05, + "loss": 2.9103, + "step": 824 + }, + { + "epoch": 0.5519086841995233, + "grad_norm": 18.73539924621582, + "learning_rate": 4.893531272266218e-05, + "loss": 2.9088, + "step": 825 + }, + { + "epoch": 0.5525776644227955, + "grad_norm": 5.194465637207031, + "learning_rate": 4.8929682774829336e-05, + "loss": 3.065, + "step": 826 + }, + { + "epoch": 0.5532466446460677, + "grad_norm": 4.28487491607666, + "learning_rate": 4.892403830654163e-05, + "loss": 3.0793, + "step": 827 + }, + { + "epoch": 0.5539156248693398, + "grad_norm": 3.6678833961486816, + "learning_rate": 4.891837932122412e-05, + "loss": 2.8483, + "step": 828 + }, + { + "epoch": 0.554584605092612, + "grad_norm": 4.224690914154053, + "learning_rate": 4.8912705822310655e-05, + "loss": 2.8658, + "step": 829 + }, + { + "epoch": 0.5552535853158841, + "grad_norm": 4.132622241973877, + "learning_rate": 4.89070178132439e-05, + "loss": 2.7489, + "step": 830 + }, + { + "epoch": 0.5559225655391562, + "grad_norm": 4.749293327331543, + "learning_rate": 4.8901315297475315e-05, + "loss": 2.8354, + "step": 831 + }, + { + "epoch": 0.5565915457624284, + "grad_norm": 3.7333221435546875, + "learning_rate": 4.889559827846518e-05, + "loss": 2.8272, + "step": 832 + }, + { + "epoch": 0.5572605259857005, + "grad_norm": 3.7024402618408203, + "learning_rate": 4.8889866759682554e-05, + "loss": 2.7571, + "step": 833 + }, + { + "epoch": 0.5579295062089727, + "grad_norm": 3.952960968017578, + "learning_rate": 4.88841207446053e-05, + "loss": 2.8217, + "step": 834 + }, + { + "epoch": 0.5585984864322449, + "grad_norm": 3.772315502166748, + "learning_rate": 4.88783602367201e-05, + "loss": 3.0521, + "step": 835 + }, + { + "epoch": 0.559267466655517, + "grad_norm": 6.524354457855225, + "learning_rate": 4.887258523952239e-05, + "loss": 3.124, + "step": 836 + }, + { + "epoch": 0.5599364468787892, + "grad_norm": 4.3125457763671875, + "learning_rate": 4.886679575651643e-05, + "loss": 3.1228, + "step": 837 + }, + { + "epoch": 0.5606054271020613, + "grad_norm": 4.666600227355957, + "learning_rate": 4.886099179121526e-05, + "loss": 2.9512, + "step": 838 + }, + { + "epoch": 0.5612744073253334, + "grad_norm": 4.160144329071045, + "learning_rate": 4.885517334714072e-05, + "loss": 3.0934, + "step": 839 + }, + { + "epoch": 0.5619433875486056, + "grad_norm": 3.479520320892334, + "learning_rate": 4.884934042782339e-05, + "loss": 2.7539, + "step": 840 + }, + { + "epoch": 0.5626123677718777, + "grad_norm": 3.340153694152832, + "learning_rate": 4.8843493036802696e-05, + "loss": 2.7626, + "step": 841 + }, + { + "epoch": 0.5632813479951498, + "grad_norm": 3.464170217514038, + "learning_rate": 4.8837631177626807e-05, + "loss": 2.9802, + "step": 842 + }, + { + "epoch": 0.5639503282184221, + "grad_norm": 4.128434181213379, + "learning_rate": 4.883175485385268e-05, + "loss": 3.3324, + "step": 843 + }, + { + "epoch": 0.5646193084416942, + "grad_norm": 3.9891414642333984, + "learning_rate": 4.8825864069046044e-05, + "loss": 2.9105, + "step": 844 + }, + { + "epoch": 0.5652882886649664, + "grad_norm": 5.570315837860107, + "learning_rate": 4.881995882678142e-05, + "loss": 2.8328, + "step": 845 + }, + { + "epoch": 0.5659572688882385, + "grad_norm": 5.166423797607422, + "learning_rate": 4.881403913064208e-05, + "loss": 2.973, + "step": 846 + }, + { + "epoch": 0.5666262491115106, + "grad_norm": 3.9031732082366943, + "learning_rate": 4.880810498422009e-05, + "loss": 3.0236, + "step": 847 + }, + { + "epoch": 0.5672952293347828, + "grad_norm": 4.1895012855529785, + "learning_rate": 4.880215639111626e-05, + "loss": 2.9534, + "step": 848 + }, + { + "epoch": 0.5679642095580549, + "grad_norm": 8.483187675476074, + "learning_rate": 4.879619335494017e-05, + "loss": 2.8898, + "step": 849 + }, + { + "epoch": 0.568633189781327, + "grad_norm": 4.321814060211182, + "learning_rate": 4.879021587931019e-05, + "loss": 3.1692, + "step": 850 + }, + { + "epoch": 0.5693021700045993, + "grad_norm": 3.7368454933166504, + "learning_rate": 4.878422396785342e-05, + "loss": 2.8991, + "step": 851 + }, + { + "epoch": 0.5699711502278714, + "grad_norm": 7.553808212280273, + "learning_rate": 4.877821762420574e-05, + "loss": 3.1544, + "step": 852 + }, + { + "epoch": 0.5706401304511436, + "grad_norm": 4.322549819946289, + "learning_rate": 4.877219685201176e-05, + "loss": 2.9816, + "step": 853 + }, + { + "epoch": 0.5713091106744157, + "grad_norm": 3.573345184326172, + "learning_rate": 4.87661616549249e-05, + "loss": 2.8022, + "step": 854 + }, + { + "epoch": 0.5719780908976878, + "grad_norm": 6.072822093963623, + "learning_rate": 4.876011203660727e-05, + "loss": 3.0581, + "step": 855 + }, + { + "epoch": 0.57264707112096, + "grad_norm": 3.929838180541992, + "learning_rate": 4.875404800072977e-05, + "loss": 2.9111, + "step": 856 + }, + { + "epoch": 0.5733160513442321, + "grad_norm": 4.304371356964111, + "learning_rate": 4.874796955097204e-05, + "loss": 3.0608, + "step": 857 + }, + { + "epoch": 0.5739850315675042, + "grad_norm": 4.513864994049072, + "learning_rate": 4.874187669102246e-05, + "loss": 3.066, + "step": 858 + }, + { + "epoch": 0.5746540117907765, + "grad_norm": 3.6599647998809814, + "learning_rate": 4.873576942457815e-05, + "loss": 3.075, + "step": 859 + }, + { + "epoch": 0.5753229920140486, + "grad_norm": 3.387683391571045, + "learning_rate": 4.8729647755344995e-05, + "loss": 2.7858, + "step": 860 + }, + { + "epoch": 0.5759919722373207, + "grad_norm": 4.5267181396484375, + "learning_rate": 4.872351168703759e-05, + "loss": 2.924, + "step": 861 + }, + { + "epoch": 0.5766609524605929, + "grad_norm": 4.898892879486084, + "learning_rate": 4.87173612233793e-05, + "loss": 3.2621, + "step": 862 + }, + { + "epoch": 0.577329932683865, + "grad_norm": 5.0480732917785645, + "learning_rate": 4.871119636810219e-05, + "loss": 3.143, + "step": 863 + }, + { + "epoch": 0.5779989129071372, + "grad_norm": 4.6042304039001465, + "learning_rate": 4.870501712494708e-05, + "loss": 2.876, + "step": 864 + }, + { + "epoch": 0.5786678931304093, + "grad_norm": 3.421410083770752, + "learning_rate": 4.8698823497663513e-05, + "loss": 2.947, + "step": 865 + }, + { + "epoch": 0.5793368733536814, + "grad_norm": 4.7167534828186035, + "learning_rate": 4.869261549000976e-05, + "loss": 2.9044, + "step": 866 + }, + { + "epoch": 0.5800058535769537, + "grad_norm": 2.3363020420074463, + "learning_rate": 4.868639310575283e-05, + "loss": 2.8712, + "step": 867 + }, + { + "epoch": 0.5806748338002258, + "grad_norm": 5.71278715133667, + "learning_rate": 4.8680156348668436e-05, + "loss": 2.9753, + "step": 868 + }, + { + "epoch": 0.5813438140234979, + "grad_norm": 4.479608058929443, + "learning_rate": 4.867390522254103e-05, + "loss": 2.9986, + "step": 869 + }, + { + "epoch": 0.5820127942467701, + "grad_norm": 4.775486469268799, + "learning_rate": 4.8667639731163775e-05, + "loss": 2.8507, + "step": 870 + }, + { + "epoch": 0.5826817744700422, + "grad_norm": 5.254409313201904, + "learning_rate": 4.866135987833854e-05, + "loss": 2.9328, + "step": 871 + }, + { + "epoch": 0.5833507546933144, + "grad_norm": 5.818423748016357, + "learning_rate": 4.865506566787593e-05, + "loss": 2.7693, + "step": 872 + }, + { + "epoch": 0.5840197349165865, + "grad_norm": 3.793675422668457, + "learning_rate": 4.864875710359524e-05, + "loss": 2.8004, + "step": 873 + }, + { + "epoch": 0.5846887151398587, + "grad_norm": 3.916386365890503, + "learning_rate": 4.864243418932451e-05, + "loss": 2.9722, + "step": 874 + }, + { + "epoch": 0.5853576953631309, + "grad_norm": 8.37555980682373, + "learning_rate": 4.8636096928900446e-05, + "loss": 2.7889, + "step": 875 + }, + { + "epoch": 0.586026675586403, + "grad_norm": 4.431842803955078, + "learning_rate": 4.862974532616848e-05, + "loss": 2.683, + "step": 876 + }, + { + "epoch": 0.5866956558096751, + "grad_norm": 4.644500732421875, + "learning_rate": 4.862337938498274e-05, + "loss": 2.8752, + "step": 877 + }, + { + "epoch": 0.5873646360329473, + "grad_norm": 4.099252700805664, + "learning_rate": 4.8616999109206063e-05, + "loss": 2.9416, + "step": 878 + }, + { + "epoch": 0.5880336162562194, + "grad_norm": 5.259937286376953, + "learning_rate": 4.8610604502709984e-05, + "loss": 3.0663, + "step": 879 + }, + { + "epoch": 0.5887025964794915, + "grad_norm": 3.8882627487182617, + "learning_rate": 4.8604195569374725e-05, + "loss": 2.8836, + "step": 880 + }, + { + "epoch": 0.5893715767027637, + "grad_norm": 5.224497318267822, + "learning_rate": 4.859777231308921e-05, + "loss": 3.2656, + "step": 881 + }, + { + "epoch": 0.5900405569260359, + "grad_norm": 4.912106513977051, + "learning_rate": 4.859133473775105e-05, + "loss": 3.0812, + "step": 882 + }, + { + "epoch": 0.5907095371493081, + "grad_norm": 3.9396631717681885, + "learning_rate": 4.858488284726654e-05, + "loss": 2.7061, + "step": 883 + }, + { + "epoch": 0.5913785173725802, + "grad_norm": 3.4575467109680176, + "learning_rate": 4.857841664555067e-05, + "loss": 3.0659, + "step": 884 + }, + { + "epoch": 0.5920474975958523, + "grad_norm": 4.488811016082764, + "learning_rate": 4.857193613652711e-05, + "loss": 2.8396, + "step": 885 + }, + { + "epoch": 0.5927164778191245, + "grad_norm": 6.178808212280273, + "learning_rate": 4.856544132412821e-05, + "loss": 2.9394, + "step": 886 + }, + { + "epoch": 0.5933854580423966, + "grad_norm": 4.329935073852539, + "learning_rate": 4.8558932212295006e-05, + "loss": 3.1392, + "step": 887 + }, + { + "epoch": 0.5940544382656687, + "grad_norm": 4.575646877288818, + "learning_rate": 4.85524088049772e-05, + "loss": 2.9275, + "step": 888 + }, + { + "epoch": 0.594723418488941, + "grad_norm": 4.456279277801514, + "learning_rate": 4.854587110613318e-05, + "loss": 3.0891, + "step": 889 + }, + { + "epoch": 0.5953923987122131, + "grad_norm": 4.95554780960083, + "learning_rate": 4.853931911973e-05, + "loss": 2.9593, + "step": 890 + }, + { + "epoch": 0.5960613789354852, + "grad_norm": 4.824105262756348, + "learning_rate": 4.8532752849743384e-05, + "loss": 3.1116, + "step": 891 + }, + { + "epoch": 0.5967303591587574, + "grad_norm": 3.0384416580200195, + "learning_rate": 4.8526172300157726e-05, + "loss": 2.9218, + "step": 892 + }, + { + "epoch": 0.5973993393820295, + "grad_norm": 5.79701566696167, + "learning_rate": 4.8519577474966074e-05, + "loss": 3.0881, + "step": 893 + }, + { + "epoch": 0.5980683196053017, + "grad_norm": 3.513871431350708, + "learning_rate": 4.851296837817015e-05, + "loss": 2.7299, + "step": 894 + }, + { + "epoch": 0.5987372998285738, + "grad_norm": 3.8924996852874756, + "learning_rate": 4.850634501378034e-05, + "loss": 2.9338, + "step": 895 + }, + { + "epoch": 0.5994062800518459, + "grad_norm": 5.226293087005615, + "learning_rate": 4.849970738581568e-05, + "loss": 3.0033, + "step": 896 + }, + { + "epoch": 0.6000752602751181, + "grad_norm": 4.557132244110107, + "learning_rate": 4.8493055498303854e-05, + "loss": 2.8244, + "step": 897 + }, + { + "epoch": 0.6007442404983903, + "grad_norm": 5.492852687835693, + "learning_rate": 4.84863893552812e-05, + "loss": 2.8643, + "step": 898 + }, + { + "epoch": 0.6014132207216624, + "grad_norm": 6.306492805480957, + "learning_rate": 4.847970896079272e-05, + "loss": 2.9746, + "step": 899 + }, + { + "epoch": 0.6020822009449346, + "grad_norm": 3.952476739883423, + "learning_rate": 4.8473014318892075e-05, + "loss": 3.0116, + "step": 900 + }, + { + "epoch": 0.6027511811682067, + "grad_norm": 4.202814102172852, + "learning_rate": 4.846630543364152e-05, + "loss": 3.0347, + "step": 901 + }, + { + "epoch": 0.6034201613914789, + "grad_norm": 5.378247261047363, + "learning_rate": 4.8459582309112e-05, + "loss": 3.2602, + "step": 902 + }, + { + "epoch": 0.604089141614751, + "grad_norm": 4.968826770782471, + "learning_rate": 4.8452844949383094e-05, + "loss": 2.8037, + "step": 903 + }, + { + "epoch": 0.6047581218380231, + "grad_norm": 5.133361339569092, + "learning_rate": 4.8446093358542986e-05, + "loss": 2.8434, + "step": 904 + }, + { + "epoch": 0.6054271020612954, + "grad_norm": 4.359605312347412, + "learning_rate": 4.843932754068854e-05, + "loss": 2.7, + "step": 905 + }, + { + "epoch": 0.6060960822845675, + "grad_norm": 4.303665637969971, + "learning_rate": 4.843254749992523e-05, + "loss": 2.6906, + "step": 906 + }, + { + "epoch": 0.6067650625078396, + "grad_norm": 4.343417644500732, + "learning_rate": 4.8425753240367165e-05, + "loss": 2.8728, + "step": 907 + }, + { + "epoch": 0.6074340427311118, + "grad_norm": 4.789746284484863, + "learning_rate": 4.841894476613707e-05, + "loss": 2.9626, + "step": 908 + }, + { + "epoch": 0.6081030229543839, + "grad_norm": 5.736357688903809, + "learning_rate": 4.841212208136631e-05, + "loss": 2.8692, + "step": 909 + }, + { + "epoch": 0.608772003177656, + "grad_norm": 3.2540862560272217, + "learning_rate": 4.840528519019487e-05, + "loss": 2.7127, + "step": 910 + }, + { + "epoch": 0.6094409834009282, + "grad_norm": 5.809683322906494, + "learning_rate": 4.839843409677135e-05, + "loss": 3.1236, + "step": 911 + }, + { + "epoch": 0.6101099636242003, + "grad_norm": 6.905038356781006, + "learning_rate": 4.839156880525297e-05, + "loss": 2.9768, + "step": 912 + }, + { + "epoch": 0.6107789438474726, + "grad_norm": 5.895445346832275, + "learning_rate": 4.8384689319805584e-05, + "loss": 2.9639, + "step": 913 + }, + { + "epoch": 0.6114479240707447, + "grad_norm": 5.081824779510498, + "learning_rate": 4.8377795644603615e-05, + "loss": 3.1095, + "step": 914 + }, + { + "epoch": 0.6121169042940168, + "grad_norm": 2.9040255546569824, + "learning_rate": 4.837088778383015e-05, + "loss": 2.6119, + "step": 915 + }, + { + "epoch": 0.612785884517289, + "grad_norm": 4.30228328704834, + "learning_rate": 4.836396574167684e-05, + "loss": 2.7013, + "step": 916 + }, + { + "epoch": 0.6134548647405611, + "grad_norm": 3.297487735748291, + "learning_rate": 4.835702952234395e-05, + "loss": 2.6564, + "step": 917 + }, + { + "epoch": 0.6141238449638332, + "grad_norm": 5.532101154327393, + "learning_rate": 4.835007913004038e-05, + "loss": 2.9093, + "step": 918 + }, + { + "epoch": 0.6147928251871054, + "grad_norm": 8.50019359588623, + "learning_rate": 4.8343114568983594e-05, + "loss": 2.9708, + "step": 919 + }, + { + "epoch": 0.6154618054103775, + "grad_norm": 6.267725944519043, + "learning_rate": 4.833613584339965e-05, + "loss": 2.9291, + "step": 920 + }, + { + "epoch": 0.6161307856336498, + "grad_norm": 6.294321537017822, + "learning_rate": 4.8329142957523245e-05, + "loss": 2.545, + "step": 921 + }, + { + "epoch": 0.6167997658569219, + "grad_norm": 3.558936834335327, + "learning_rate": 4.832213591559762e-05, + "loss": 2.8664, + "step": 922 + }, + { + "epoch": 0.617468746080194, + "grad_norm": 5.082416534423828, + "learning_rate": 4.831511472187463e-05, + "loss": 2.9146, + "step": 923 + }, + { + "epoch": 0.6181377263034662, + "grad_norm": 5.50706672668457, + "learning_rate": 4.830807938061471e-05, + "loss": 3.0, + "step": 924 + }, + { + "epoch": 0.6188067065267383, + "grad_norm": 3.523247003555298, + "learning_rate": 4.83010298960869e-05, + "loss": 2.9033, + "step": 925 + }, + { + "epoch": 0.6194756867500104, + "grad_norm": 5.927065849304199, + "learning_rate": 4.829396627256878e-05, + "loss": 2.9559, + "step": 926 + }, + { + "epoch": 0.6201446669732826, + "grad_norm": 6.5479865074157715, + "learning_rate": 4.828688851434655e-05, + "loss": 2.9306, + "step": 927 + }, + { + "epoch": 0.6208136471965547, + "grad_norm": 5.179655075073242, + "learning_rate": 4.8279796625714955e-05, + "loss": 2.7334, + "step": 928 + }, + { + "epoch": 0.6214826274198268, + "grad_norm": 5.061421871185303, + "learning_rate": 4.8272690610977356e-05, + "loss": 2.8804, + "step": 929 + }, + { + "epoch": 0.6221516076430991, + "grad_norm": 3.9310529232025146, + "learning_rate": 4.8265570474445636e-05, + "loss": 3.0256, + "step": 930 + }, + { + "epoch": 0.6228205878663712, + "grad_norm": 5.7879228591918945, + "learning_rate": 4.825843622044028e-05, + "loss": 3.0422, + "step": 931 + }, + { + "epoch": 0.6234895680896434, + "grad_norm": 4.052291393280029, + "learning_rate": 4.825128785329034e-05, + "loss": 2.8619, + "step": 932 + }, + { + "epoch": 0.6241585483129155, + "grad_norm": 3.4358327388763428, + "learning_rate": 4.824412537733341e-05, + "loss": 2.8506, + "step": 933 + }, + { + "epoch": 0.6248275285361876, + "grad_norm": 3.948198080062866, + "learning_rate": 4.823694879691565e-05, + "loss": 2.8697, + "step": 934 + }, + { + "epoch": 0.6254965087594598, + "grad_norm": 4.091851234436035, + "learning_rate": 4.822975811639181e-05, + "loss": 3.051, + "step": 935 + }, + { + "epoch": 0.6261654889827319, + "grad_norm": 4.4150896072387695, + "learning_rate": 4.822255334012515e-05, + "loss": 3.0754, + "step": 936 + }, + { + "epoch": 0.626834469206004, + "grad_norm": 4.241657257080078, + "learning_rate": 4.821533447248752e-05, + "loss": 2.9059, + "step": 937 + }, + { + "epoch": 0.6275034494292763, + "grad_norm": 7.57624626159668, + "learning_rate": 4.8208101517859294e-05, + "loss": 2.7324, + "step": 938 + }, + { + "epoch": 0.6281724296525484, + "grad_norm": 5.501160621643066, + "learning_rate": 4.820085448062942e-05, + "loss": 2.9821, + "step": 939 + }, + { + "epoch": 0.6288414098758206, + "grad_norm": 6.796340465545654, + "learning_rate": 4.819359336519536e-05, + "loss": 2.9937, + "step": 940 + }, + { + "epoch": 0.6295103900990927, + "grad_norm": 5.409112453460693, + "learning_rate": 4.8186318175963145e-05, + "loss": 2.7989, + "step": 941 + }, + { + "epoch": 0.6301793703223648, + "grad_norm": 6.310184955596924, + "learning_rate": 4.817902891734734e-05, + "loss": 3.2403, + "step": 942 + }, + { + "epoch": 0.630848350545637, + "grad_norm": 4.645697593688965, + "learning_rate": 4.817172559377103e-05, + "loss": 2.8919, + "step": 943 + }, + { + "epoch": 0.6315173307689091, + "grad_norm": 8.987037658691406, + "learning_rate": 4.816440820966587e-05, + "loss": 2.862, + "step": 944 + }, + { + "epoch": 0.6321863109921813, + "grad_norm": 4.836058616638184, + "learning_rate": 4.8157076769472e-05, + "loss": 2.7711, + "step": 945 + }, + { + "epoch": 0.6328552912154535, + "grad_norm": 6.288712024688721, + "learning_rate": 4.814973127763813e-05, + "loss": 2.9659, + "step": 946 + }, + { + "epoch": 0.6335242714387256, + "grad_norm": 3.072624683380127, + "learning_rate": 4.814237173862148e-05, + "loss": 2.8166, + "step": 947 + }, + { + "epoch": 0.6341932516619977, + "grad_norm": 5.474343299865723, + "learning_rate": 4.81349981568878e-05, + "loss": 2.9033, + "step": 948 + }, + { + "epoch": 0.6348622318852699, + "grad_norm": 3.749812602996826, + "learning_rate": 4.812761053691134e-05, + "loss": 2.9709, + "step": 949 + }, + { + "epoch": 0.635531212108542, + "grad_norm": 3.7517666816711426, + "learning_rate": 4.81202088831749e-05, + "loss": 2.9877, + "step": 950 + }, + { + "epoch": 0.6362001923318142, + "grad_norm": 4.512857913970947, + "learning_rate": 4.811279320016976e-05, + "loss": 3.0786, + "step": 951 + }, + { + "epoch": 0.6368691725550863, + "grad_norm": 4.326724529266357, + "learning_rate": 4.810536349239576e-05, + "loss": 2.8043, + "step": 952 + }, + { + "epoch": 0.6375381527783585, + "grad_norm": 3.670686960220337, + "learning_rate": 4.8097919764361194e-05, + "loss": 3.0172, + "step": 953 + }, + { + "epoch": 0.6382071330016307, + "grad_norm": 3.1971354484558105, + "learning_rate": 4.809046202058291e-05, + "loss": 2.9217, + "step": 954 + }, + { + "epoch": 0.6388761132249028, + "grad_norm": 4.29288387298584, + "learning_rate": 4.8082990265586245e-05, + "loss": 2.9419, + "step": 955 + }, + { + "epoch": 0.6395450934481749, + "grad_norm": 6.09660005569458, + "learning_rate": 4.8075504503905025e-05, + "loss": 2.7751, + "step": 956 + }, + { + "epoch": 0.6402140736714471, + "grad_norm": 3.5380003452301025, + "learning_rate": 4.80680047400816e-05, + "loss": 2.9738, + "step": 957 + }, + { + "epoch": 0.6408830538947192, + "grad_norm": 5.08881139755249, + "learning_rate": 4.8060490978666784e-05, + "loss": 2.9587, + "step": 958 + }, + { + "epoch": 0.6415520341179913, + "grad_norm": 4.936773300170898, + "learning_rate": 4.8052963224219915e-05, + "loss": 3.017, + "step": 959 + }, + { + "epoch": 0.6422210143412636, + "grad_norm": 5.949796199798584, + "learning_rate": 4.804542148130881e-05, + "loss": 3.1539, + "step": 960 + }, + { + "epoch": 0.6428899945645357, + "grad_norm": 5.088866233825684, + "learning_rate": 4.803786575450978e-05, + "loss": 2.9623, + "step": 961 + }, + { + "epoch": 0.6435589747878079, + "grad_norm": 4.024708271026611, + "learning_rate": 4.8030296048407596e-05, + "loss": 3.0045, + "step": 962 + }, + { + "epoch": 0.64422795501108, + "grad_norm": 6.3203630447387695, + "learning_rate": 4.802271236759556e-05, + "loss": 3.103, + "step": 963 + }, + { + "epoch": 0.6448969352343521, + "grad_norm": 4.5583367347717285, + "learning_rate": 4.8015114716675395e-05, + "loss": 2.9665, + "step": 964 + }, + { + "epoch": 0.6455659154576243, + "grad_norm": 4.08353853225708, + "learning_rate": 4.800750310025735e-05, + "loss": 2.9864, + "step": 965 + }, + { + "epoch": 0.6462348956808964, + "grad_norm": 2.692695140838623, + "learning_rate": 4.799987752296013e-05, + "loss": 2.8868, + "step": 966 + }, + { + "epoch": 0.6469038759041685, + "grad_norm": 6.414584159851074, + "learning_rate": 4.7992237989410904e-05, + "loss": 3.0365, + "step": 967 + }, + { + "epoch": 0.6475728561274408, + "grad_norm": 4.764004707336426, + "learning_rate": 4.7984584504245325e-05, + "loss": 3.1698, + "step": 968 + }, + { + "epoch": 0.6482418363507129, + "grad_norm": 3.422572612762451, + "learning_rate": 4.7976917072107486e-05, + "loss": 2.7567, + "step": 969 + }, + { + "epoch": 0.6489108165739851, + "grad_norm": 4.826651096343994, + "learning_rate": 4.796923569764998e-05, + "loss": 2.7321, + "step": 970 + }, + { + "epoch": 0.6495797967972572, + "grad_norm": 5.226298809051514, + "learning_rate": 4.796154038553382e-05, + "loss": 3.1136, + "step": 971 + }, + { + "epoch": 0.6502487770205293, + "grad_norm": 3.5223143100738525, + "learning_rate": 4.795383114042852e-05, + "loss": 2.74, + "step": 972 + }, + { + "epoch": 0.6509177572438015, + "grad_norm": 5.651745319366455, + "learning_rate": 4.794610796701201e-05, + "loss": 3.0947, + "step": 973 + }, + { + "epoch": 0.6515867374670736, + "grad_norm": 4.480429172515869, + "learning_rate": 4.7938370869970694e-05, + "loss": 2.9169, + "step": 974 + }, + { + "epoch": 0.6522557176903457, + "grad_norm": 5.427626132965088, + "learning_rate": 4.793061985399942e-05, + "loss": 2.9253, + "step": 975 + }, + { + "epoch": 0.652924697913618, + "grad_norm": 3.702354669570923, + "learning_rate": 4.7922854923801457e-05, + "loss": 2.8818, + "step": 976 + }, + { + "epoch": 0.6535936781368901, + "grad_norm": 9.913829803466797, + "learning_rate": 4.7915076084088565e-05, + "loss": 3.3618, + "step": 977 + }, + { + "epoch": 0.6542626583601622, + "grad_norm": 3.803285837173462, + "learning_rate": 4.790728333958091e-05, + "loss": 2.9537, + "step": 978 + }, + { + "epoch": 0.6549316385834344, + "grad_norm": 5.172455787658691, + "learning_rate": 4.789947669500711e-05, + "loss": 2.9666, + "step": 979 + }, + { + "epoch": 0.6556006188067065, + "grad_norm": 4.230083465576172, + "learning_rate": 4.78916561551042e-05, + "loss": 2.9084, + "step": 980 + }, + { + "epoch": 0.6562695990299787, + "grad_norm": 5.195817470550537, + "learning_rate": 4.7883821724617674e-05, + "loss": 3.0398, + "step": 981 + }, + { + "epoch": 0.6569385792532508, + "grad_norm": 4.758103370666504, + "learning_rate": 4.7875973408301424e-05, + "loss": 2.8131, + "step": 982 + }, + { + "epoch": 0.6576075594765229, + "grad_norm": 7.711447715759277, + "learning_rate": 4.786811121091779e-05, + "loss": 2.9391, + "step": 983 + }, + { + "epoch": 0.6582765396997952, + "grad_norm": 4.11110258102417, + "learning_rate": 4.786023513723753e-05, + "loss": 2.6406, + "step": 984 + }, + { + "epoch": 0.6589455199230673, + "grad_norm": 8.119991302490234, + "learning_rate": 4.785234519203982e-05, + "loss": 3.0358, + "step": 985 + }, + { + "epoch": 0.6596145001463394, + "grad_norm": 4.686762809753418, + "learning_rate": 4.7844441380112247e-05, + "loss": 3.0016, + "step": 986 + }, + { + "epoch": 0.6602834803696116, + "grad_norm": 5.753101348876953, + "learning_rate": 4.7836523706250825e-05, + "loss": 2.9268, + "step": 987 + }, + { + "epoch": 0.6609524605928837, + "grad_norm": 7.028621196746826, + "learning_rate": 4.7828592175259976e-05, + "loss": 3.234, + "step": 988 + }, + { + "epoch": 0.6616214408161559, + "grad_norm": 5.211911678314209, + "learning_rate": 4.782064679195253e-05, + "loss": 3.0338, + "step": 989 + }, + { + "epoch": 0.662290421039428, + "grad_norm": 5.439329147338867, + "learning_rate": 4.78126875611497e-05, + "loss": 2.9931, + "step": 990 + }, + { + "epoch": 0.6629594012627001, + "grad_norm": 5.949470043182373, + "learning_rate": 4.780471448768115e-05, + "loss": 2.9316, + "step": 991 + }, + { + "epoch": 0.6636283814859724, + "grad_norm": 4.6272783279418945, + "learning_rate": 4.7796727576384884e-05, + "loss": 3.0215, + "step": 992 + }, + { + "epoch": 0.6642973617092445, + "grad_norm": 4.166497230529785, + "learning_rate": 4.778872683210736e-05, + "loss": 2.7412, + "step": 993 + }, + { + "epoch": 0.6649663419325166, + "grad_norm": 5.167282581329346, + "learning_rate": 4.77807122597034e-05, + "loss": 2.8083, + "step": 994 + }, + { + "epoch": 0.6656353221557888, + "grad_norm": 4.663416385650635, + "learning_rate": 4.77726838640362e-05, + "loss": 2.8814, + "step": 995 + }, + { + "epoch": 0.6663043023790609, + "grad_norm": 4.006105422973633, + "learning_rate": 4.776464164997739e-05, + "loss": 3.0979, + "step": 996 + }, + { + "epoch": 0.666973282602333, + "grad_norm": 7.690087795257568, + "learning_rate": 4.775658562240696e-05, + "loss": 2.8149, + "step": 997 + }, + { + "epoch": 0.6676422628256052, + "grad_norm": 4.8195295333862305, + "learning_rate": 4.7748515786213264e-05, + "loss": 2.9307, + "step": 998 + }, + { + "epoch": 0.6683112430488773, + "grad_norm": 3.997514009475708, + "learning_rate": 4.7740432146293055e-05, + "loss": 2.9505, + "step": 999 + }, + { + "epoch": 0.6689802232721496, + "grad_norm": 6.172044277191162, + "learning_rate": 4.773233470755147e-05, + "loss": 2.9291, + "step": 1000 + }, + { + "epoch": 0.6696492034954217, + "grad_norm": 6.186550140380859, + "learning_rate": 4.7724223474902014e-05, + "loss": 3.0619, + "step": 1001 + }, + { + "epoch": 0.6703181837186938, + "grad_norm": 5.554472923278809, + "learning_rate": 4.771609845326654e-05, + "loss": 2.8572, + "step": 1002 + }, + { + "epoch": 0.670987163941966, + "grad_norm": 3.5486297607421875, + "learning_rate": 4.7707959647575295e-05, + "loss": 2.793, + "step": 1003 + }, + { + "epoch": 0.6716561441652381, + "grad_norm": 5.987931728363037, + "learning_rate": 4.7699807062766876e-05, + "loss": 2.8616, + "step": 1004 + }, + { + "epoch": 0.6723251243885102, + "grad_norm": 5.39992618560791, + "learning_rate": 4.769164070378824e-05, + "loss": 2.9884, + "step": 1005 + }, + { + "epoch": 0.6729941046117824, + "grad_norm": 4.178929805755615, + "learning_rate": 4.768346057559473e-05, + "loss": 2.9624, + "step": 1006 + }, + { + "epoch": 0.6736630848350545, + "grad_norm": 6.299113750457764, + "learning_rate": 4.7675266683149996e-05, + "loss": 3.1258, + "step": 1007 + }, + { + "epoch": 0.6743320650583268, + "grad_norm": 2.810047149658203, + "learning_rate": 4.766705903142608e-05, + "loss": 2.8287, + "step": 1008 + }, + { + "epoch": 0.6750010452815989, + "grad_norm": 4.146429061889648, + "learning_rate": 4.7658837625403354e-05, + "loss": 3.1088, + "step": 1009 + }, + { + "epoch": 0.675670025504871, + "grad_norm": 4.9735798835754395, + "learning_rate": 4.7650602470070536e-05, + "loss": 2.8834, + "step": 1010 + }, + { + "epoch": 0.6763390057281432, + "grad_norm": 5.806169033050537, + "learning_rate": 4.7642353570424704e-05, + "loss": 3.0712, + "step": 1011 + }, + { + "epoch": 0.6770079859514153, + "grad_norm": 5.46205472946167, + "learning_rate": 4.7634090931471254e-05, + "loss": 2.8523, + "step": 1012 + }, + { + "epoch": 0.6776769661746874, + "grad_norm": 4.201307773590088, + "learning_rate": 4.762581455822394e-05, + "loss": 2.6202, + "step": 1013 + }, + { + "epoch": 0.6783459463979596, + "grad_norm": 4.1945271492004395, + "learning_rate": 4.761752445570482e-05, + "loss": 2.7655, + "step": 1014 + }, + { + "epoch": 0.6790149266212318, + "grad_norm": 4.354991436004639, + "learning_rate": 4.760922062894432e-05, + "loss": 3.0846, + "step": 1015 + }, + { + "epoch": 0.6796839068445039, + "grad_norm": 3.5169050693511963, + "learning_rate": 4.760090308298116e-05, + "loss": 2.9702, + "step": 1016 + }, + { + "epoch": 0.6803528870677761, + "grad_norm": 3.669956684112549, + "learning_rate": 4.759257182286242e-05, + "loss": 2.7531, + "step": 1017 + }, + { + "epoch": 0.6810218672910482, + "grad_norm": 4.439055919647217, + "learning_rate": 4.7584226853643465e-05, + "loss": 3.0305, + "step": 1018 + }, + { + "epoch": 0.6816908475143204, + "grad_norm": 4.583697319030762, + "learning_rate": 4.7575868180388e-05, + "loss": 2.988, + "step": 1019 + }, + { + "epoch": 0.6823598277375925, + "grad_norm": 4.089028835296631, + "learning_rate": 4.756749580816804e-05, + "loss": 2.8122, + "step": 1020 + }, + { + "epoch": 0.6830288079608646, + "grad_norm": 4.180925369262695, + "learning_rate": 4.755910974206392e-05, + "loss": 2.9264, + "step": 1021 + }, + { + "epoch": 0.6836977881841368, + "grad_norm": 4.284313201904297, + "learning_rate": 4.755070998716428e-05, + "loss": 2.8798, + "step": 1022 + }, + { + "epoch": 0.684366768407409, + "grad_norm": 6.518707752227783, + "learning_rate": 4.7542296548566044e-05, + "loss": 2.909, + "step": 1023 + }, + { + "epoch": 0.6850357486306811, + "grad_norm": 5.645822048187256, + "learning_rate": 4.753386943137448e-05, + "loss": 3.0253, + "step": 1024 + }, + { + "epoch": 0.6857047288539533, + "grad_norm": 3.4720051288604736, + "learning_rate": 4.752542864070313e-05, + "loss": 3.0259, + "step": 1025 + }, + { + "epoch": 0.6863737090772254, + "grad_norm": 3.090329170227051, + "learning_rate": 4.751697418167384e-05, + "loss": 2.8428, + "step": 1026 + }, + { + "epoch": 0.6870426893004975, + "grad_norm": 3.7356035709381104, + "learning_rate": 4.750850605941675e-05, + "loss": 2.9791, + "step": 1027 + }, + { + "epoch": 0.6877116695237697, + "grad_norm": 3.315736770629883, + "learning_rate": 4.750002427907028e-05, + "loss": 2.7206, + "step": 1028 + }, + { + "epoch": 0.6883806497470418, + "grad_norm": 5.103281497955322, + "learning_rate": 4.7491528845781155e-05, + "loss": 3.16, + "step": 1029 + }, + { + "epoch": 0.689049629970314, + "grad_norm": 4.23598575592041, + "learning_rate": 4.7483019764704365e-05, + "loss": 2.9537, + "step": 1030 + }, + { + "epoch": 0.6897186101935862, + "grad_norm": 3.8574106693267822, + "learning_rate": 4.747449704100322e-05, + "loss": 3.0713, + "step": 1031 + }, + { + "epoch": 0.6903875904168583, + "grad_norm": 4.238114356994629, + "learning_rate": 4.746596067984925e-05, + "loss": 2.9019, + "step": 1032 + }, + { + "epoch": 0.6910565706401305, + "grad_norm": 4.508023262023926, + "learning_rate": 4.745741068642232e-05, + "loss": 3.2339, + "step": 1033 + }, + { + "epoch": 0.6917255508634026, + "grad_norm": 5.460549831390381, + "learning_rate": 4.744884706591052e-05, + "loss": 3.0797, + "step": 1034 + }, + { + "epoch": 0.6923945310866747, + "grad_norm": 6.020919322967529, + "learning_rate": 4.744026982351023e-05, + "loss": 3.1077, + "step": 1035 + }, + { + "epoch": 0.6930635113099469, + "grad_norm": 4.980844020843506, + "learning_rate": 4.74316789644261e-05, + "loss": 2.893, + "step": 1036 + }, + { + "epoch": 0.693732491533219, + "grad_norm": 5.355983734130859, + "learning_rate": 4.742307449387103e-05, + "loss": 2.8476, + "step": 1037 + }, + { + "epoch": 0.6944014717564913, + "grad_norm": 6.6928839683532715, + "learning_rate": 4.741445641706618e-05, + "loss": 3.0333, + "step": 1038 + }, + { + "epoch": 0.6950704519797634, + "grad_norm": 5.792276859283447, + "learning_rate": 4.740582473924099e-05, + "loss": 3.2354, + "step": 1039 + }, + { + "epoch": 0.6957394322030355, + "grad_norm": 4.4441609382629395, + "learning_rate": 4.739717946563311e-05, + "loss": 2.7452, + "step": 1040 + }, + { + "epoch": 0.6964084124263077, + "grad_norm": 5.933568477630615, + "learning_rate": 4.738852060148849e-05, + "loss": 3.0236, + "step": 1041 + }, + { + "epoch": 0.6970773926495798, + "grad_norm": 4.327657222747803, + "learning_rate": 4.737984815206128e-05, + "loss": 2.8741, + "step": 1042 + }, + { + "epoch": 0.6977463728728519, + "grad_norm": 5.816018104553223, + "learning_rate": 4.73711621226139e-05, + "loss": 2.8933, + "step": 1043 + }, + { + "epoch": 0.6984153530961241, + "grad_norm": 4.199925422668457, + "learning_rate": 4.736246251841701e-05, + "loss": 2.8275, + "step": 1044 + }, + { + "epoch": 0.6990843333193962, + "grad_norm": 5.037257194519043, + "learning_rate": 4.73537493447495e-05, + "loss": 2.9567, + "step": 1045 + }, + { + "epoch": 0.6997533135426683, + "grad_norm": 4.057535171508789, + "learning_rate": 4.734502260689849e-05, + "loss": 2.8808, + "step": 1046 + }, + { + "epoch": 0.7004222937659406, + "grad_norm": 5.710482120513916, + "learning_rate": 4.7336282310159356e-05, + "loss": 2.9466, + "step": 1047 + }, + { + "epoch": 0.7010912739892127, + "grad_norm": 3.3100647926330566, + "learning_rate": 4.7327528459835654e-05, + "loss": 2.7895, + "step": 1048 + }, + { + "epoch": 0.7017602542124849, + "grad_norm": 3.2015061378479004, + "learning_rate": 4.7318761061239206e-05, + "loss": 2.603, + "step": 1049 + }, + { + "epoch": 0.702429234435757, + "grad_norm": 4.796367168426514, + "learning_rate": 4.730998011969004e-05, + "loss": 3.0706, + "step": 1050 + }, + { + "epoch": 0.7030982146590291, + "grad_norm": 3.3222239017486572, + "learning_rate": 4.730118564051642e-05, + "loss": 2.9381, + "step": 1051 + }, + { + "epoch": 0.7037671948823013, + "grad_norm": 5.005463123321533, + "learning_rate": 4.7292377629054777e-05, + "loss": 2.8974, + "step": 1052 + }, + { + "epoch": 0.7044361751055734, + "grad_norm": 5.9778218269348145, + "learning_rate": 4.728355609064981e-05, + "loss": 3.2306, + "step": 1053 + }, + { + "epoch": 0.7051051553288455, + "grad_norm": 5.388123035430908, + "learning_rate": 4.727472103065439e-05, + "loss": 3.1614, + "step": 1054 + }, + { + "epoch": 0.7057741355521178, + "grad_norm": 5.160175800323486, + "learning_rate": 4.726587245442959e-05, + "loss": 2.8656, + "step": 1055 + }, + { + "epoch": 0.7064431157753899, + "grad_norm": 3.826104164123535, + "learning_rate": 4.725701036734472e-05, + "loss": 2.8349, + "step": 1056 + }, + { + "epoch": 0.7071120959986621, + "grad_norm": 6.139577388763428, + "learning_rate": 4.7248134774777255e-05, + "loss": 3.0752, + "step": 1057 + }, + { + "epoch": 0.7077810762219342, + "grad_norm": 5.152160167694092, + "learning_rate": 4.723924568211288e-05, + "loss": 3.0287, + "step": 1058 + }, + { + "epoch": 0.7084500564452063, + "grad_norm": 3.847810745239258, + "learning_rate": 4.723034309474546e-05, + "loss": 3.0277, + "step": 1059 + }, + { + "epoch": 0.7091190366684785, + "grad_norm": 4.554903984069824, + "learning_rate": 4.722142701807706e-05, + "loss": 2.7659, + "step": 1060 + }, + { + "epoch": 0.7097880168917506, + "grad_norm": 3.8796348571777344, + "learning_rate": 4.721249745751794e-05, + "loss": 2.8684, + "step": 1061 + }, + { + "epoch": 0.7104569971150227, + "grad_norm": 3.869396686553955, + "learning_rate": 4.720355441848651e-05, + "loss": 2.8877, + "step": 1062 + }, + { + "epoch": 0.711125977338295, + "grad_norm": 3.0460147857666016, + "learning_rate": 4.719459790640939e-05, + "loss": 2.8196, + "step": 1063 + }, + { + "epoch": 0.7117949575615671, + "grad_norm": 4.743002414703369, + "learning_rate": 4.718562792672135e-05, + "loss": 3.0261, + "step": 1064 + }, + { + "epoch": 0.7124639377848392, + "grad_norm": 3.112426280975342, + "learning_rate": 4.717664448486536e-05, + "loss": 3.0122, + "step": 1065 + }, + { + "epoch": 0.7131329180081114, + "grad_norm": 5.043789386749268, + "learning_rate": 4.716764758629254e-05, + "loss": 2.8663, + "step": 1066 + }, + { + "epoch": 0.7138018982313835, + "grad_norm": 4.557572364807129, + "learning_rate": 4.7158637236462163e-05, + "loss": 2.7515, + "step": 1067 + }, + { + "epoch": 0.7144708784546557, + "grad_norm": 5.140640735626221, + "learning_rate": 4.714961344084171e-05, + "loss": 3.0498, + "step": 1068 + }, + { + "epoch": 0.7151398586779278, + "grad_norm": 7.291658401489258, + "learning_rate": 4.714057620490676e-05, + "loss": 3.0888, + "step": 1069 + }, + { + "epoch": 0.7158088389012, + "grad_norm": 5.374858379364014, + "learning_rate": 4.71315255341411e-05, + "loss": 2.9642, + "step": 1070 + }, + { + "epoch": 0.7164778191244722, + "grad_norm": 3.6649510860443115, + "learning_rate": 4.7122461434036645e-05, + "loss": 2.8619, + "step": 1071 + }, + { + "epoch": 0.7171467993477443, + "grad_norm": 4.465708255767822, + "learning_rate": 4.7113383910093455e-05, + "loss": 2.972, + "step": 1072 + }, + { + "epoch": 0.7178157795710164, + "grad_norm": 5.3930253982543945, + "learning_rate": 4.710429296781974e-05, + "loss": 2.9884, + "step": 1073 + }, + { + "epoch": 0.7184847597942886, + "grad_norm": 4.323741912841797, + "learning_rate": 4.709518861273187e-05, + "loss": 2.7952, + "step": 1074 + }, + { + "epoch": 0.7191537400175607, + "grad_norm": 3.911958694458008, + "learning_rate": 4.708607085035433e-05, + "loss": 2.8223, + "step": 1075 + }, + { + "epoch": 0.7198227202408328, + "grad_norm": 4.966227054595947, + "learning_rate": 4.7076939686219734e-05, + "loss": 3.0945, + "step": 1076 + }, + { + "epoch": 0.720491700464105, + "grad_norm": 6.530167579650879, + "learning_rate": 4.706779512586887e-05, + "loss": 3.1844, + "step": 1077 + }, + { + "epoch": 0.7211606806873772, + "grad_norm": 5.185251235961914, + "learning_rate": 4.7058637174850604e-05, + "loss": 2.9711, + "step": 1078 + }, + { + "epoch": 0.7218296609106494, + "grad_norm": 4.045567512512207, + "learning_rate": 4.704946583872197e-05, + "loss": 3.0596, + "step": 1079 + }, + { + "epoch": 0.7224986411339215, + "grad_norm": 6.590458393096924, + "learning_rate": 4.70402811230481e-05, + "loss": 2.9715, + "step": 1080 + }, + { + "epoch": 0.7231676213571936, + "grad_norm": 5.262936592102051, + "learning_rate": 4.703108303340225e-05, + "loss": 2.9436, + "step": 1081 + }, + { + "epoch": 0.7238366015804658, + "grad_norm": 6.591318130493164, + "learning_rate": 4.702187157536578e-05, + "loss": 3.0539, + "step": 1082 + }, + { + "epoch": 0.7245055818037379, + "grad_norm": 4.073249816894531, + "learning_rate": 4.701264675452819e-05, + "loss": 3.0097, + "step": 1083 + }, + { + "epoch": 0.72517456202701, + "grad_norm": 4.671321392059326, + "learning_rate": 4.700340857648706e-05, + "loss": 2.8997, + "step": 1084 + }, + { + "epoch": 0.7258435422502822, + "grad_norm": 5.47347354888916, + "learning_rate": 4.6994157046848085e-05, + "loss": 2.7696, + "step": 1085 + }, + { + "epoch": 0.7265125224735544, + "grad_norm": 5.776899814605713, + "learning_rate": 4.6984892171225084e-05, + "loss": 2.8977, + "step": 1086 + }, + { + "epoch": 0.7271815026968266, + "grad_norm": 2.396639823913574, + "learning_rate": 4.697561395523993e-05, + "loss": 2.7917, + "step": 1087 + }, + { + "epoch": 0.7278504829200987, + "grad_norm": 3.468226671218872, + "learning_rate": 4.6966322404522625e-05, + "loss": 3.0844, + "step": 1088 + }, + { + "epoch": 0.7285194631433708, + "grad_norm": 4.072845458984375, + "learning_rate": 4.695701752471125e-05, + "loss": 2.9978, + "step": 1089 + }, + { + "epoch": 0.729188443366643, + "grad_norm": 3.296337127685547, + "learning_rate": 4.694769932145198e-05, + "loss": 3.1235, + "step": 1090 + }, + { + "epoch": 0.7298574235899151, + "grad_norm": 6.070874214172363, + "learning_rate": 4.693836780039906e-05, + "loss": 3.0078, + "step": 1091 + }, + { + "epoch": 0.7305264038131872, + "grad_norm": 3.937580108642578, + "learning_rate": 4.6929022967214845e-05, + "loss": 2.6871, + "step": 1092 + }, + { + "epoch": 0.7311953840364595, + "grad_norm": 4.932339191436768, + "learning_rate": 4.691966482756974e-05, + "loss": 3.0435, + "step": 1093 + }, + { + "epoch": 0.7318643642597316, + "grad_norm": 5.55933141708374, + "learning_rate": 4.6910293387142234e-05, + "loss": 2.995, + "step": 1094 + }, + { + "epoch": 0.7325333444830037, + "grad_norm": 3.3646323680877686, + "learning_rate": 4.690090865161889e-05, + "loss": 2.6285, + "step": 1095 + }, + { + "epoch": 0.7332023247062759, + "grad_norm": 4.399787902832031, + "learning_rate": 4.6891510626694325e-05, + "loss": 2.7016, + "step": 1096 + }, + { + "epoch": 0.733871304929548, + "grad_norm": 3.4604458808898926, + "learning_rate": 4.6882099318071246e-05, + "loss": 2.7843, + "step": 1097 + }, + { + "epoch": 0.7345402851528202, + "grad_norm": 2.961883544921875, + "learning_rate": 4.687267473146039e-05, + "loss": 2.8187, + "step": 1098 + }, + { + "epoch": 0.7352092653760923, + "grad_norm": 5.411362171173096, + "learning_rate": 4.686323687258058e-05, + "loss": 3.1326, + "step": 1099 + }, + { + "epoch": 0.7358782455993644, + "grad_norm": 5.700189113616943, + "learning_rate": 4.685378574715867e-05, + "loss": 3.1582, + "step": 1100 + }, + { + "epoch": 0.7365472258226367, + "grad_norm": 7.942058086395264, + "learning_rate": 4.6844321360929574e-05, + "loss": 3.0623, + "step": 1101 + }, + { + "epoch": 0.7372162060459088, + "grad_norm": 5.992877006530762, + "learning_rate": 4.6834843719636256e-05, + "loss": 2.9932, + "step": 1102 + }, + { + "epoch": 0.7378851862691809, + "grad_norm": 4.572200775146484, + "learning_rate": 4.6825352829029705e-05, + "loss": 2.6074, + "step": 1103 + }, + { + "epoch": 0.7385541664924531, + "grad_norm": 3.488412857055664, + "learning_rate": 4.681584869486898e-05, + "loss": 2.8706, + "step": 1104 + }, + { + "epoch": 0.7392231467157252, + "grad_norm": 4.054234504699707, + "learning_rate": 4.680633132292115e-05, + "loss": 2.9623, + "step": 1105 + }, + { + "epoch": 0.7398921269389974, + "grad_norm": 5.8013176918029785, + "learning_rate": 4.679680071896132e-05, + "loss": 2.7356, + "step": 1106 + }, + { + "epoch": 0.7405611071622695, + "grad_norm": 3.128580331802368, + "learning_rate": 4.678725688877265e-05, + "loss": 2.7304, + "step": 1107 + }, + { + "epoch": 0.7412300873855416, + "grad_norm": 6.505280494689941, + "learning_rate": 4.6777699838146286e-05, + "loss": 3.1473, + "step": 1108 + }, + { + "epoch": 0.7418990676088139, + "grad_norm": 4.020694255828857, + "learning_rate": 4.676812957288141e-05, + "loss": 2.8131, + "step": 1109 + }, + { + "epoch": 0.742568047832086, + "grad_norm": 5.444528579711914, + "learning_rate": 4.675854609878526e-05, + "loss": 3.0127, + "step": 1110 + }, + { + "epoch": 0.7432370280553581, + "grad_norm": 3.6009886264801025, + "learning_rate": 4.674894942167303e-05, + "loss": 2.6999, + "step": 1111 + }, + { + "epoch": 0.7439060082786303, + "grad_norm": 3.48321533203125, + "learning_rate": 4.673933954736796e-05, + "loss": 2.9221, + "step": 1112 + }, + { + "epoch": 0.7445749885019024, + "grad_norm": 5.604138374328613, + "learning_rate": 4.672971648170129e-05, + "loss": 3.0887, + "step": 1113 + }, + { + "epoch": 0.7452439687251745, + "grad_norm": 3.9699227809906006, + "learning_rate": 4.672008023051228e-05, + "loss": 2.876, + "step": 1114 + }, + { + "epoch": 0.7459129489484467, + "grad_norm": 5.587133884429932, + "learning_rate": 4.671043079964815e-05, + "loss": 3.1502, + "step": 1115 + }, + { + "epoch": 0.7465819291717188, + "grad_norm": 5.9802680015563965, + "learning_rate": 4.670076819496416e-05, + "loss": 3.1501, + "step": 1116 + }, + { + "epoch": 0.7472509093949911, + "grad_norm": 5.209526538848877, + "learning_rate": 4.669109242232355e-05, + "loss": 3.0757, + "step": 1117 + }, + { + "epoch": 0.7479198896182632, + "grad_norm": 6.363523006439209, + "learning_rate": 4.6681403487597536e-05, + "loss": 3.0408, + "step": 1118 + }, + { + "epoch": 0.7485888698415353, + "grad_norm": 4.495762825012207, + "learning_rate": 4.6671701396665345e-05, + "loss": 2.8424, + "step": 1119 + }, + { + "epoch": 0.7492578500648075, + "grad_norm": 4.736873626708984, + "learning_rate": 4.6661986155414164e-05, + "loss": 3.1298, + "step": 1120 + }, + { + "epoch": 0.7499268302880796, + "grad_norm": 4.165266513824463, + "learning_rate": 4.665225776973918e-05, + "loss": 2.8714, + "step": 1121 + }, + { + "epoch": 0.7505958105113517, + "grad_norm": 8.435874938964844, + "learning_rate": 4.664251624554354e-05, + "loss": 3.3392, + "step": 1122 + }, + { + "epoch": 0.7512647907346239, + "grad_norm": 5.860448360443115, + "learning_rate": 4.663276158873837e-05, + "loss": 2.8908, + "step": 1123 + }, + { + "epoch": 0.751933770957896, + "grad_norm": 3.957852363586426, + "learning_rate": 4.6622993805242766e-05, + "loss": 3.0232, + "step": 1124 + }, + { + "epoch": 0.7526027511811683, + "grad_norm": 5.143473148345947, + "learning_rate": 4.661321290098379e-05, + "loss": 3.0588, + "step": 1125 + }, + { + "epoch": 0.7532717314044404, + "grad_norm": 3.922308921813965, + "learning_rate": 4.660341888189646e-05, + "loss": 2.9771, + "step": 1126 + }, + { + "epoch": 0.7539407116277125, + "grad_norm": 3.8919553756713867, + "learning_rate": 4.6593611753923756e-05, + "loss": 2.8287, + "step": 1127 + }, + { + "epoch": 0.7546096918509847, + "grad_norm": 3.5253610610961914, + "learning_rate": 4.6583791523016616e-05, + "loss": 2.7894, + "step": 1128 + }, + { + "epoch": 0.7552786720742568, + "grad_norm": 5.46091890335083, + "learning_rate": 4.657395819513392e-05, + "loss": 3.1581, + "step": 1129 + }, + { + "epoch": 0.7559476522975289, + "grad_norm": 5.317687034606934, + "learning_rate": 4.6564111776242494e-05, + "loss": 3.0963, + "step": 1130 + }, + { + "epoch": 0.7566166325208011, + "grad_norm": 5.06818151473999, + "learning_rate": 4.655425227231712e-05, + "loss": 2.9743, + "step": 1131 + }, + { + "epoch": 0.7572856127440732, + "grad_norm": 3.99940824508667, + "learning_rate": 4.6544379689340515e-05, + "loss": 2.8127, + "step": 1132 + }, + { + "epoch": 0.7579545929673454, + "grad_norm": 4.829176425933838, + "learning_rate": 4.653449403330333e-05, + "loss": 2.8884, + "step": 1133 + }, + { + "epoch": 0.7586235731906176, + "grad_norm": 4.37836217880249, + "learning_rate": 4.652459531020416e-05, + "loss": 2.8942, + "step": 1134 + }, + { + "epoch": 0.7592925534138897, + "grad_norm": 5.445929050445557, + "learning_rate": 4.651468352604949e-05, + "loss": 2.8611, + "step": 1135 + }, + { + "epoch": 0.7599615336371619, + "grad_norm": 3.0896148681640625, + "learning_rate": 4.6504758686853786e-05, + "loss": 2.7729, + "step": 1136 + }, + { + "epoch": 0.760630513860434, + "grad_norm": 7.576822757720947, + "learning_rate": 4.6494820798639396e-05, + "loss": 2.9835, + "step": 1137 + }, + { + "epoch": 0.7612994940837061, + "grad_norm": 11.344874382019043, + "learning_rate": 4.64848698674366e-05, + "loss": 2.8235, + "step": 1138 + }, + { + "epoch": 0.7619684743069783, + "grad_norm": 6.878731727600098, + "learning_rate": 4.6474905899283596e-05, + "loss": 2.8795, + "step": 1139 + }, + { + "epoch": 0.7626374545302504, + "grad_norm": 6.209124565124512, + "learning_rate": 4.646492890022648e-05, + "loss": 2.6676, + "step": 1140 + }, + { + "epoch": 0.7633064347535226, + "grad_norm": 5.197012424468994, + "learning_rate": 4.6454938876319266e-05, + "loss": 2.7382, + "step": 1141 + }, + { + "epoch": 0.7639754149767948, + "grad_norm": 8.035080909729004, + "learning_rate": 4.644493583362387e-05, + "loss": 3.1107, + "step": 1142 + }, + { + "epoch": 0.7646443952000669, + "grad_norm": 6.702689170837402, + "learning_rate": 4.6434919778210114e-05, + "loss": 2.7169, + "step": 1143 + }, + { + "epoch": 0.765313375423339, + "grad_norm": 6.232007026672363, + "learning_rate": 4.64248907161557e-05, + "loss": 2.9951, + "step": 1144 + }, + { + "epoch": 0.7659823556466112, + "grad_norm": 7.285654067993164, + "learning_rate": 4.641484865354623e-05, + "loss": 2.8934, + "step": 1145 + }, + { + "epoch": 0.7666513358698833, + "grad_norm": 4.895506858825684, + "learning_rate": 4.6404793596475195e-05, + "loss": 2.7874, + "step": 1146 + }, + { + "epoch": 0.7673203160931555, + "grad_norm": 7.719137191772461, + "learning_rate": 4.639472555104397e-05, + "loss": 3.0158, + "step": 1147 + }, + { + "epoch": 0.7679892963164276, + "grad_norm": 6.7118730545043945, + "learning_rate": 4.638464452336182e-05, + "loss": 3.0185, + "step": 1148 + }, + { + "epoch": 0.7686582765396998, + "grad_norm": 6.159465312957764, + "learning_rate": 4.637455051954587e-05, + "loss": 2.9336, + "step": 1149 + }, + { + "epoch": 0.769327256762972, + "grad_norm": 4.991342544555664, + "learning_rate": 4.6364443545721146e-05, + "loss": 2.9299, + "step": 1150 + }, + { + "epoch": 0.7699962369862441, + "grad_norm": 5.403529167175293, + "learning_rate": 4.635432360802051e-05, + "loss": 3.1296, + "step": 1151 + }, + { + "epoch": 0.7706652172095162, + "grad_norm": 5.111245155334473, + "learning_rate": 4.634419071258472e-05, + "loss": 2.6837, + "step": 1152 + }, + { + "epoch": 0.7713341974327884, + "grad_norm": 5.243202209472656, + "learning_rate": 4.633404486556238e-05, + "loss": 2.8771, + "step": 1153 + }, + { + "epoch": 0.7720031776560605, + "grad_norm": 6.665611743927002, + "learning_rate": 4.632388607310995e-05, + "loss": 2.7898, + "step": 1154 + }, + { + "epoch": 0.7726721578793327, + "grad_norm": 5.265156269073486, + "learning_rate": 4.631371434139176e-05, + "loss": 2.896, + "step": 1155 + }, + { + "epoch": 0.7733411381026049, + "grad_norm": 5.411212921142578, + "learning_rate": 4.630352967657998e-05, + "loss": 3.0554, + "step": 1156 + }, + { + "epoch": 0.774010118325877, + "grad_norm": 5.297793388366699, + "learning_rate": 4.629333208485464e-05, + "loss": 2.7445, + "step": 1157 + }, + { + "epoch": 0.7746790985491492, + "grad_norm": 6.694152355194092, + "learning_rate": 4.62831215724036e-05, + "loss": 3.0652, + "step": 1158 + }, + { + "epoch": 0.7753480787724213, + "grad_norm": 4.164678573608398, + "learning_rate": 4.627289814542257e-05, + "loss": 2.8941, + "step": 1159 + }, + { + "epoch": 0.7760170589956934, + "grad_norm": 5.421492099761963, + "learning_rate": 4.626266181011509e-05, + "loss": 3.0211, + "step": 1160 + }, + { + "epoch": 0.7766860392189656, + "grad_norm": 5.934850215911865, + "learning_rate": 4.625241257269254e-05, + "loss": 2.9949, + "step": 1161 + }, + { + "epoch": 0.7773550194422377, + "grad_norm": 3.3746259212493896, + "learning_rate": 4.624215043937411e-05, + "loss": 2.9759, + "step": 1162 + }, + { + "epoch": 0.7780239996655098, + "grad_norm": 4.722992420196533, + "learning_rate": 4.623187541638685e-05, + "loss": 3.0717, + "step": 1163 + }, + { + "epoch": 0.778692979888782, + "grad_norm": 4.32106876373291, + "learning_rate": 4.6221587509965594e-05, + "loss": 2.6513, + "step": 1164 + }, + { + "epoch": 0.7793619601120542, + "grad_norm": 4.257534980773926, + "learning_rate": 4.621128672635302e-05, + "loss": 2.798, + "step": 1165 + }, + { + "epoch": 0.7800309403353264, + "grad_norm": 4.742833137512207, + "learning_rate": 4.620097307179961e-05, + "loss": 3.0547, + "step": 1166 + }, + { + "epoch": 0.7806999205585985, + "grad_norm": 5.508314609527588, + "learning_rate": 4.6190646552563655e-05, + "loss": 3.1237, + "step": 1167 + }, + { + "epoch": 0.7813689007818706, + "grad_norm": 4.580541133880615, + "learning_rate": 4.6180307174911255e-05, + "loss": 2.8756, + "step": 1168 + }, + { + "epoch": 0.7820378810051428, + "grad_norm": 3.2291085720062256, + "learning_rate": 4.61699549451163e-05, + "loss": 2.7087, + "step": 1169 + }, + { + "epoch": 0.7827068612284149, + "grad_norm": 5.868531227111816, + "learning_rate": 4.6159589869460504e-05, + "loss": 2.9949, + "step": 1170 + }, + { + "epoch": 0.783375841451687, + "grad_norm": 4.447974681854248, + "learning_rate": 4.614921195423336e-05, + "loss": 2.8073, + "step": 1171 + }, + { + "epoch": 0.7840448216749593, + "grad_norm": 5.745816230773926, + "learning_rate": 4.613882120573215e-05, + "loss": 2.9943, + "step": 1172 + }, + { + "epoch": 0.7847138018982314, + "grad_norm": 3.089176654815674, + "learning_rate": 4.612841763026195e-05, + "loss": 2.8825, + "step": 1173 + }, + { + "epoch": 0.7853827821215036, + "grad_norm": 4.548274040222168, + "learning_rate": 4.611800123413561e-05, + "loss": 3.0515, + "step": 1174 + }, + { + "epoch": 0.7860517623447757, + "grad_norm": 5.2816853523254395, + "learning_rate": 4.6107572023673774e-05, + "loss": 3.1152, + "step": 1175 + }, + { + "epoch": 0.7867207425680478, + "grad_norm": 4.492981433868408, + "learning_rate": 4.6097130005204846e-05, + "loss": 3.0368, + "step": 1176 + }, + { + "epoch": 0.78738972279132, + "grad_norm": 4.274287223815918, + "learning_rate": 4.608667518506502e-05, + "loss": 2.9959, + "step": 1177 + }, + { + "epoch": 0.7880587030145921, + "grad_norm": 5.986141204833984, + "learning_rate": 4.607620756959823e-05, + "loss": 3.1899, + "step": 1178 + }, + { + "epoch": 0.7887276832378642, + "grad_norm": 6.11605167388916, + "learning_rate": 4.6065727165156214e-05, + "loss": 3.2268, + "step": 1179 + }, + { + "epoch": 0.7893966634611365, + "grad_norm": 3.461554527282715, + "learning_rate": 4.6055233978098424e-05, + "loss": 2.9106, + "step": 1180 + }, + { + "epoch": 0.7900656436844086, + "grad_norm": 3.9393229484558105, + "learning_rate": 4.604472801479211e-05, + "loss": 2.7877, + "step": 1181 + }, + { + "epoch": 0.7907346239076807, + "grad_norm": 3.958235263824463, + "learning_rate": 4.603420928161225e-05, + "loss": 2.7716, + "step": 1182 + }, + { + "epoch": 0.7914036041309529, + "grad_norm": 5.014087200164795, + "learning_rate": 4.602367778494158e-05, + "loss": 3.1674, + "step": 1183 + }, + { + "epoch": 0.792072584354225, + "grad_norm": 5.102466106414795, + "learning_rate": 4.601313353117057e-05, + "loss": 2.8841, + "step": 1184 + }, + { + "epoch": 0.7927415645774972, + "grad_norm": 3.9684786796569824, + "learning_rate": 4.6002576526697446e-05, + "loss": 2.9394, + "step": 1185 + }, + { + "epoch": 0.7934105448007693, + "grad_norm": 5.413763523101807, + "learning_rate": 4.599200677792818e-05, + "loss": 3.0567, + "step": 1186 + }, + { + "epoch": 0.7940795250240414, + "grad_norm": 3.9001362323760986, + "learning_rate": 4.598142429127643e-05, + "loss": 2.7906, + "step": 1187 + }, + { + "epoch": 0.7947485052473137, + "grad_norm": 5.886847019195557, + "learning_rate": 4.597082907316363e-05, + "loss": 3.0456, + "step": 1188 + }, + { + "epoch": 0.7954174854705858, + "grad_norm": 5.5517096519470215, + "learning_rate": 4.5960221130018946e-05, + "loss": 3.0884, + "step": 1189 + }, + { + "epoch": 0.7960864656938579, + "grad_norm": 4.110422134399414, + "learning_rate": 4.594960046827921e-05, + "loss": 2.9232, + "step": 1190 + }, + { + "epoch": 0.7967554459171301, + "grad_norm": 6.881380558013916, + "learning_rate": 4.593896709438902e-05, + "loss": 3.0062, + "step": 1191 + }, + { + "epoch": 0.7974244261404022, + "grad_norm": 4.250757217407227, + "learning_rate": 4.592832101480067e-05, + "loss": 2.9539, + "step": 1192 + }, + { + "epoch": 0.7980934063636744, + "grad_norm": 4.646194934844971, + "learning_rate": 4.591766223597417e-05, + "loss": 2.858, + "step": 1193 + }, + { + "epoch": 0.7987623865869465, + "grad_norm": 3.1693055629730225, + "learning_rate": 4.5906990764377235e-05, + "loss": 2.7729, + "step": 1194 + }, + { + "epoch": 0.7994313668102186, + "grad_norm": 4.924377918243408, + "learning_rate": 4.589630660648527e-05, + "loss": 3.1445, + "step": 1195 + }, + { + "epoch": 0.8001003470334909, + "grad_norm": 5.191469669342041, + "learning_rate": 4.5885609768781405e-05, + "loss": 2.6655, + "step": 1196 + }, + { + "epoch": 0.800769327256763, + "grad_norm": 5.284387111663818, + "learning_rate": 4.587490025775644e-05, + "loss": 3.0168, + "step": 1197 + }, + { + "epoch": 0.8014383074800351, + "grad_norm": 4.831686019897461, + "learning_rate": 4.586417807990886e-05, + "loss": 2.8958, + "step": 1198 + }, + { + "epoch": 0.8021072877033073, + "grad_norm": 4.951563835144043, + "learning_rate": 4.585344324174485e-05, + "loss": 2.9277, + "step": 1199 + }, + { + "epoch": 0.8027762679265794, + "grad_norm": 5.770039081573486, + "learning_rate": 4.58426957497783e-05, + "loss": 3.1331, + "step": 1200 + }, + { + "epoch": 0.8034452481498515, + "grad_norm": 4.684397220611572, + "learning_rate": 4.583193561053072e-05, + "loss": 2.8653, + "step": 1201 + }, + { + "epoch": 0.8041142283731237, + "grad_norm": 4.134792804718018, + "learning_rate": 4.582116283053135e-05, + "loss": 2.9133, + "step": 1202 + }, + { + "epoch": 0.8047832085963958, + "grad_norm": 6.689801216125488, + "learning_rate": 4.581037741631708e-05, + "loss": 3.1337, + "step": 1203 + }, + { + "epoch": 0.8054521888196681, + "grad_norm": 6.469520568847656, + "learning_rate": 4.579957937443245e-05, + "loss": 2.9906, + "step": 1204 + }, + { + "epoch": 0.8061211690429402, + "grad_norm": 6.04118537902832, + "learning_rate": 4.5788768711429685e-05, + "loss": 2.8179, + "step": 1205 + }, + { + "epoch": 0.8067901492662123, + "grad_norm": 4.4167799949646, + "learning_rate": 4.5777945433868664e-05, + "loss": 2.9181, + "step": 1206 + }, + { + "epoch": 0.8074591294894845, + "grad_norm": 4.885730743408203, + "learning_rate": 4.576710954831691e-05, + "loss": 3.3447, + "step": 1207 + }, + { + "epoch": 0.8081281097127566, + "grad_norm": 3.864809989929199, + "learning_rate": 4.57562610613496e-05, + "loss": 2.8715, + "step": 1208 + }, + { + "epoch": 0.8087970899360287, + "grad_norm": 3.0726685523986816, + "learning_rate": 4.574539997954957e-05, + "loss": 2.953, + "step": 1209 + }, + { + "epoch": 0.8094660701593009, + "grad_norm": 4.57878303527832, + "learning_rate": 4.5734526309507294e-05, + "loss": 2.9744, + "step": 1210 + }, + { + "epoch": 0.810135050382573, + "grad_norm": 5.683286190032959, + "learning_rate": 4.5723640057820874e-05, + "loss": 3.0864, + "step": 1211 + }, + { + "epoch": 0.8108040306058452, + "grad_norm": 4.932238578796387, + "learning_rate": 4.571274123109606e-05, + "loss": 3.0714, + "step": 1212 + }, + { + "epoch": 0.8114730108291174, + "grad_norm": 4.840191841125488, + "learning_rate": 4.5701829835946204e-05, + "loss": 3.0249, + "step": 1213 + }, + { + "epoch": 0.8121419910523895, + "grad_norm": 5.308492183685303, + "learning_rate": 4.569090587899232e-05, + "loss": 3.0613, + "step": 1214 + }, + { + "epoch": 0.8128109712756617, + "grad_norm": 5.62808895111084, + "learning_rate": 4.567996936686303e-05, + "loss": 2.5797, + "step": 1215 + }, + { + "epoch": 0.8134799514989338, + "grad_norm": 5.087706089019775, + "learning_rate": 4.5669020306194585e-05, + "loss": 2.8123, + "step": 1216 + }, + { + "epoch": 0.8141489317222059, + "grad_norm": 4.26098108291626, + "learning_rate": 4.565805870363082e-05, + "loss": 2.9352, + "step": 1217 + }, + { + "epoch": 0.8148179119454781, + "grad_norm": 3.571173906326294, + "learning_rate": 4.564708456582321e-05, + "loss": 3.0319, + "step": 1218 + }, + { + "epoch": 0.8154868921687503, + "grad_norm": 6.964523792266846, + "learning_rate": 4.5636097899430826e-05, + "loss": 2.9455, + "step": 1219 + }, + { + "epoch": 0.8161558723920224, + "grad_norm": 5.344782829284668, + "learning_rate": 4.562509871112034e-05, + "loss": 3.2849, + "step": 1220 + }, + { + "epoch": 0.8168248526152946, + "grad_norm": 4.287156105041504, + "learning_rate": 4.561408700756603e-05, + "loss": 3.0911, + "step": 1221 + }, + { + "epoch": 0.8174938328385667, + "grad_norm": 6.898590564727783, + "learning_rate": 4.560306279544975e-05, + "loss": 3.1278, + "step": 1222 + }, + { + "epoch": 0.8181628130618389, + "grad_norm": 4.712205410003662, + "learning_rate": 4.559202608146098e-05, + "loss": 2.7449, + "step": 1223 + }, + { + "epoch": 0.818831793285111, + "grad_norm": 2.392604351043701, + "learning_rate": 4.558097687229673e-05, + "loss": 2.5767, + "step": 1224 + }, + { + "epoch": 0.8195007735083831, + "grad_norm": 6.0168561935424805, + "learning_rate": 4.5569915174661656e-05, + "loss": 3.0924, + "step": 1225 + }, + { + "epoch": 0.8201697537316553, + "grad_norm": 5.974442958831787, + "learning_rate": 4.555884099526794e-05, + "loss": 3.2305, + "step": 1226 + }, + { + "epoch": 0.8208387339549275, + "grad_norm": 4.008500099182129, + "learning_rate": 4.554775434083537e-05, + "loss": 3.1273, + "step": 1227 + }, + { + "epoch": 0.8215077141781996, + "grad_norm": 6.343125343322754, + "learning_rate": 4.553665521809128e-05, + "loss": 3.2656, + "step": 1228 + }, + { + "epoch": 0.8221766944014718, + "grad_norm": 4.358270645141602, + "learning_rate": 4.5525543633770604e-05, + "loss": 2.8296, + "step": 1229 + }, + { + "epoch": 0.8228456746247439, + "grad_norm": 3.9398019313812256, + "learning_rate": 4.551441959461579e-05, + "loss": 2.9505, + "step": 1230 + }, + { + "epoch": 0.823514654848016, + "grad_norm": 7.38106107711792, + "learning_rate": 4.5503283107376885e-05, + "loss": 3.3745, + "step": 1231 + }, + { + "epoch": 0.8241836350712882, + "grad_norm": 4.664844989776611, + "learning_rate": 4.549213417881147e-05, + "loss": 2.8598, + "step": 1232 + }, + { + "epoch": 0.8248526152945603, + "grad_norm": 8.018467903137207, + "learning_rate": 4.548097281568469e-05, + "loss": 2.961, + "step": 1233 + }, + { + "epoch": 0.8255215955178326, + "grad_norm": 4.438595294952393, + "learning_rate": 4.54697990247692e-05, + "loss": 2.9577, + "step": 1234 + }, + { + "epoch": 0.8261905757411047, + "grad_norm": 4.215197563171387, + "learning_rate": 4.545861281284524e-05, + "loss": 2.9438, + "step": 1235 + }, + { + "epoch": 0.8268595559643768, + "grad_norm": 4.889639377593994, + "learning_rate": 4.5447414186700556e-05, + "loss": 3.1007, + "step": 1236 + }, + { + "epoch": 0.827528536187649, + "grad_norm": 5.220578670501709, + "learning_rate": 4.543620315313045e-05, + "loss": 2.8278, + "step": 1237 + }, + { + "epoch": 0.8281975164109211, + "grad_norm": 5.146899700164795, + "learning_rate": 4.5424979718937736e-05, + "loss": 2.9282, + "step": 1238 + }, + { + "epoch": 0.8288664966341932, + "grad_norm": 3.74849796295166, + "learning_rate": 4.5413743890932754e-05, + "loss": 2.9679, + "step": 1239 + }, + { + "epoch": 0.8295354768574654, + "grad_norm": 2.725539207458496, + "learning_rate": 4.5402495675933373e-05, + "loss": 2.6577, + "step": 1240 + }, + { + "epoch": 0.8302044570807375, + "grad_norm": 4.239365100860596, + "learning_rate": 4.5391235080764973e-05, + "loss": 2.6505, + "step": 1241 + }, + { + "epoch": 0.8308734373040098, + "grad_norm": 3.76081919670105, + "learning_rate": 4.5379962112260446e-05, + "loss": 2.775, + "step": 1242 + }, + { + "epoch": 0.8315424175272819, + "grad_norm": 4.860802173614502, + "learning_rate": 4.536867677726019e-05, + "loss": 2.9374, + "step": 1243 + }, + { + "epoch": 0.832211397750554, + "grad_norm": 5.248920917510986, + "learning_rate": 4.535737908261212e-05, + "loss": 2.9561, + "step": 1244 + }, + { + "epoch": 0.8328803779738262, + "grad_norm": 3.981989860534668, + "learning_rate": 4.534606903517163e-05, + "loss": 2.7477, + "step": 1245 + }, + { + "epoch": 0.8335493581970983, + "grad_norm": 5.067933559417725, + "learning_rate": 4.533474664180163e-05, + "loss": 2.7223, + "step": 1246 + }, + { + "epoch": 0.8342183384203704, + "grad_norm": 4.483163833618164, + "learning_rate": 4.5323411909372516e-05, + "loss": 2.9859, + "step": 1247 + }, + { + "epoch": 0.8348873186436426, + "grad_norm": 3.9220480918884277, + "learning_rate": 4.5312064844762144e-05, + "loss": 2.9838, + "step": 1248 + }, + { + "epoch": 0.8355562988669147, + "grad_norm": 6.302621841430664, + "learning_rate": 4.53007054548559e-05, + "loss": 2.9708, + "step": 1249 + }, + { + "epoch": 0.8362252790901868, + "grad_norm": 6.725874423980713, + "learning_rate": 4.5289333746546614e-05, + "loss": 2.7491, + "step": 1250 + }, + { + "epoch": 0.8368942593134591, + "grad_norm": 6.032719612121582, + "learning_rate": 4.52779497267346e-05, + "loss": 2.9548, + "step": 1251 + }, + { + "epoch": 0.8375632395367312, + "grad_norm": 5.064156532287598, + "learning_rate": 4.526655340232766e-05, + "loss": 3.1138, + "step": 1252 + }, + { + "epoch": 0.8382322197600034, + "grad_norm": 5.924524307250977, + "learning_rate": 4.5255144780241025e-05, + "loss": 3.0439, + "step": 1253 + }, + { + "epoch": 0.8389011999832755, + "grad_norm": 3.7269787788391113, + "learning_rate": 4.5243723867397433e-05, + "loss": 2.9438, + "step": 1254 + }, + { + "epoch": 0.8395701802065476, + "grad_norm": 4.633069038391113, + "learning_rate": 4.523229067072704e-05, + "loss": 3.035, + "step": 1255 + }, + { + "epoch": 0.8402391604298198, + "grad_norm": 6.016887664794922, + "learning_rate": 4.522084519716748e-05, + "loss": 3.1314, + "step": 1256 + }, + { + "epoch": 0.8409081406530919, + "grad_norm": 5.080687999725342, + "learning_rate": 4.520938745366383e-05, + "loss": 3.1458, + "step": 1257 + }, + { + "epoch": 0.841577120876364, + "grad_norm": 4.570522785186768, + "learning_rate": 4.519791744716861e-05, + "loss": 2.6878, + "step": 1258 + }, + { + "epoch": 0.8422461010996363, + "grad_norm": 5.6180009841918945, + "learning_rate": 4.518643518464179e-05, + "loss": 2.8925, + "step": 1259 + }, + { + "epoch": 0.8429150813229084, + "grad_norm": 5.0585408210754395, + "learning_rate": 4.517494067305076e-05, + "loss": 2.8532, + "step": 1260 + }, + { + "epoch": 0.8435840615461805, + "grad_norm": 4.849548816680908, + "learning_rate": 4.516343391937036e-05, + "loss": 2.9977, + "step": 1261 + }, + { + "epoch": 0.8442530417694527, + "grad_norm": 6.075852394104004, + "learning_rate": 4.515191493058285e-05, + "loss": 2.9442, + "step": 1262 + }, + { + "epoch": 0.8449220219927248, + "grad_norm": 3.9582536220550537, + "learning_rate": 4.5140383713677916e-05, + "loss": 2.9436, + "step": 1263 + }, + { + "epoch": 0.845591002215997, + "grad_norm": 4.025606632232666, + "learning_rate": 4.512884027565265e-05, + "loss": 2.5885, + "step": 1264 + }, + { + "epoch": 0.8462599824392691, + "grad_norm": 9.147647857666016, + "learning_rate": 4.51172846235116e-05, + "loss": 2.9629, + "step": 1265 + }, + { + "epoch": 0.8469289626625413, + "grad_norm": 5.3007330894470215, + "learning_rate": 4.510571676426667e-05, + "loss": 2.9883, + "step": 1266 + }, + { + "epoch": 0.8475979428858135, + "grad_norm": 5.245005130767822, + "learning_rate": 4.5094136704937225e-05, + "loss": 3.0921, + "step": 1267 + }, + { + "epoch": 0.8482669231090856, + "grad_norm": 7.631860256195068, + "learning_rate": 4.508254445254999e-05, + "loss": 2.9209, + "step": 1268 + }, + { + "epoch": 0.8489359033323577, + "grad_norm": 3.889382839202881, + "learning_rate": 4.507094001413911e-05, + "loss": 2.8368, + "step": 1269 + }, + { + "epoch": 0.8496048835556299, + "grad_norm": 5.040628910064697, + "learning_rate": 4.505932339674613e-05, + "loss": 2.6859, + "step": 1270 + }, + { + "epoch": 0.850273863778902, + "grad_norm": 4.311012268066406, + "learning_rate": 4.5047694607419974e-05, + "loss": 3.067, + "step": 1271 + }, + { + "epoch": 0.8509428440021742, + "grad_norm": 4.256524562835693, + "learning_rate": 4.503605365321695e-05, + "loss": 2.7441, + "step": 1272 + }, + { + "epoch": 0.8516118242254463, + "grad_norm": 4.711860179901123, + "learning_rate": 4.502440054120074e-05, + "loss": 3.0067, + "step": 1273 + }, + { + "epoch": 0.8522808044487185, + "grad_norm": 4.862099647521973, + "learning_rate": 4.5012735278442436e-05, + "loss": 2.816, + "step": 1274 + }, + { + "epoch": 0.8529497846719907, + "grad_norm": 4.249661922454834, + "learning_rate": 4.500105787202047e-05, + "loss": 2.9722, + "step": 1275 + }, + { + "epoch": 0.8536187648952628, + "grad_norm": 5.019298076629639, + "learning_rate": 4.4989368329020664e-05, + "loss": 2.9151, + "step": 1276 + }, + { + "epoch": 0.8542877451185349, + "grad_norm": 4.351741313934326, + "learning_rate": 4.497766665653619e-05, + "loss": 2.8578, + "step": 1277 + }, + { + "epoch": 0.8549567253418071, + "grad_norm": 4.4666972160339355, + "learning_rate": 4.4965952861667574e-05, + "loss": 2.9205, + "step": 1278 + }, + { + "epoch": 0.8556257055650792, + "grad_norm": 6.395944118499756, + "learning_rate": 4.495422695152272e-05, + "loss": 2.8507, + "step": 1279 + }, + { + "epoch": 0.8562946857883513, + "grad_norm": 3.730733871459961, + "learning_rate": 4.494248893321689e-05, + "loss": 2.6328, + "step": 1280 + }, + { + "epoch": 0.8569636660116235, + "grad_norm": 5.058286190032959, + "learning_rate": 4.493073881387265e-05, + "loss": 3.0965, + "step": 1281 + }, + { + "epoch": 0.8576326462348957, + "grad_norm": 3.647876501083374, + "learning_rate": 4.491897660061994e-05, + "loss": 2.8117, + "step": 1282 + }, + { + "epoch": 0.8583016264581679, + "grad_norm": 5.1443867683410645, + "learning_rate": 4.4907202300596036e-05, + "loss": 2.8762, + "step": 1283 + }, + { + "epoch": 0.85897060668144, + "grad_norm": 4.4886250495910645, + "learning_rate": 4.489541592094555e-05, + "loss": 3.094, + "step": 1284 + }, + { + "epoch": 0.8596395869047121, + "grad_norm": 5.375618934631348, + "learning_rate": 4.4883617468820415e-05, + "loss": 3.1313, + "step": 1285 + }, + { + "epoch": 0.8603085671279843, + "grad_norm": 5.438999176025391, + "learning_rate": 4.4871806951379894e-05, + "loss": 3.0824, + "step": 1286 + }, + { + "epoch": 0.8609775473512564, + "grad_norm": 5.070971488952637, + "learning_rate": 4.485998437579056e-05, + "loss": 2.7387, + "step": 1287 + }, + { + "epoch": 0.8616465275745285, + "grad_norm": 5.442570209503174, + "learning_rate": 4.484814974922634e-05, + "loss": 3.1292, + "step": 1288 + }, + { + "epoch": 0.8623155077978008, + "grad_norm": 6.013929843902588, + "learning_rate": 4.4836303078868435e-05, + "loss": 2.7225, + "step": 1289 + }, + { + "epoch": 0.8629844880210729, + "grad_norm": 5.940018653869629, + "learning_rate": 4.482444437190536e-05, + "loss": 3.0959, + "step": 1290 + }, + { + "epoch": 0.8636534682443451, + "grad_norm": 4.380285739898682, + "learning_rate": 4.4812573635532945e-05, + "loss": 2.9311, + "step": 1291 + }, + { + "epoch": 0.8643224484676172, + "grad_norm": 3.536611795425415, + "learning_rate": 4.4800690876954324e-05, + "loss": 2.6594, + "step": 1292 + }, + { + "epoch": 0.8649914286908893, + "grad_norm": 4.585886001586914, + "learning_rate": 4.478879610337989e-05, + "loss": 2.6898, + "step": 1293 + }, + { + "epoch": 0.8656604089141615, + "grad_norm": 4.433380126953125, + "learning_rate": 4.477688932202738e-05, + "loss": 2.9575, + "step": 1294 + }, + { + "epoch": 0.8663293891374336, + "grad_norm": 4.734560012817383, + "learning_rate": 4.476497054012179e-05, + "loss": 2.8395, + "step": 1295 + }, + { + "epoch": 0.8669983693607057, + "grad_norm": 7.193070888519287, + "learning_rate": 4.475303976489538e-05, + "loss": 3.2112, + "step": 1296 + }, + { + "epoch": 0.867667349583978, + "grad_norm": 5.735711097717285, + "learning_rate": 4.4741097003587716e-05, + "loss": 3.2148, + "step": 1297 + }, + { + "epoch": 0.8683363298072501, + "grad_norm": 4.532172679901123, + "learning_rate": 4.472914226344564e-05, + "loss": 3.09, + "step": 1298 + }, + { + "epoch": 0.8690053100305222, + "grad_norm": 6.511048793792725, + "learning_rate": 4.471717555172323e-05, + "loss": 2.9723, + "step": 1299 + }, + { + "epoch": 0.8696742902537944, + "grad_norm": 6.3584113121032715, + "learning_rate": 4.4705196875681854e-05, + "loss": 3.1419, + "step": 1300 + }, + { + "epoch": 0.8703432704770665, + "grad_norm": 9.031208992004395, + "learning_rate": 4.4693206242590145e-05, + "loss": 3.0329, + "step": 1301 + }, + { + "epoch": 0.8710122507003387, + "grad_norm": 4.642002105712891, + "learning_rate": 4.468120365972397e-05, + "loss": 2.9389, + "step": 1302 + }, + { + "epoch": 0.8716812309236108, + "grad_norm": 5.527377605438232, + "learning_rate": 4.466918913436646e-05, + "loss": 2.9945, + "step": 1303 + }, + { + "epoch": 0.8723502111468829, + "grad_norm": 4.439020156860352, + "learning_rate": 4.465716267380799e-05, + "loss": 2.6869, + "step": 1304 + }, + { + "epoch": 0.8730191913701552, + "grad_norm": 6.921942234039307, + "learning_rate": 4.464512428534618e-05, + "loss": 2.8721, + "step": 1305 + }, + { + "epoch": 0.8736881715934273, + "grad_norm": 7.672665596008301, + "learning_rate": 4.463307397628588e-05, + "loss": 3.0736, + "step": 1306 + }, + { + "epoch": 0.8743571518166994, + "grad_norm": 5.868035316467285, + "learning_rate": 4.462101175393919e-05, + "loss": 2.9702, + "step": 1307 + }, + { + "epoch": 0.8750261320399716, + "grad_norm": 4.5191168785095215, + "learning_rate": 4.460893762562542e-05, + "loss": 2.9085, + "step": 1308 + }, + { + "epoch": 0.8756951122632437, + "grad_norm": 5.143170356750488, + "learning_rate": 4.459685159867111e-05, + "loss": 2.8955, + "step": 1309 + }, + { + "epoch": 0.8763640924865159, + "grad_norm": 4.201148986816406, + "learning_rate": 4.458475368041003e-05, + "loss": 2.5116, + "step": 1310 + }, + { + "epoch": 0.877033072709788, + "grad_norm": 5.7478179931640625, + "learning_rate": 4.457264387818315e-05, + "loss": 3.0584, + "step": 1311 + }, + { + "epoch": 0.8777020529330601, + "grad_norm": 5.526137828826904, + "learning_rate": 4.456052219933867e-05, + "loss": 2.8589, + "step": 1312 + }, + { + "epoch": 0.8783710331563324, + "grad_norm": 4.569222927093506, + "learning_rate": 4.454838865123197e-05, + "loss": 3.0637, + "step": 1313 + }, + { + "epoch": 0.8790400133796045, + "grad_norm": 5.451717853546143, + "learning_rate": 4.453624324122566e-05, + "loss": 2.8953, + "step": 1314 + }, + { + "epoch": 0.8797089936028766, + "grad_norm": 7.568721294403076, + "learning_rate": 4.4524085976689536e-05, + "loss": 2.9357, + "step": 1315 + }, + { + "epoch": 0.8803779738261488, + "grad_norm": 7.537128925323486, + "learning_rate": 4.451191686500058e-05, + "loss": 2.9977, + "step": 1316 + }, + { + "epoch": 0.8810469540494209, + "grad_norm": 4.4014739990234375, + "learning_rate": 4.449973591354298e-05, + "loss": 3.0327, + "step": 1317 + }, + { + "epoch": 0.881715934272693, + "grad_norm": 4.172126293182373, + "learning_rate": 4.448754312970809e-05, + "loss": 2.8322, + "step": 1318 + }, + { + "epoch": 0.8823849144959652, + "grad_norm": 6.07208251953125, + "learning_rate": 4.447533852089445e-05, + "loss": 2.837, + "step": 1319 + }, + { + "epoch": 0.8830538947192373, + "grad_norm": 3.788719892501831, + "learning_rate": 4.4463122094507794e-05, + "loss": 2.8765, + "step": 1320 + }, + { + "epoch": 0.8837228749425096, + "grad_norm": 3.7321970462799072, + "learning_rate": 4.445089385796099e-05, + "loss": 2.7925, + "step": 1321 + }, + { + "epoch": 0.8843918551657817, + "grad_norm": 5.387599945068359, + "learning_rate": 4.4438653818674105e-05, + "loss": 2.9877, + "step": 1322 + }, + { + "epoch": 0.8850608353890538, + "grad_norm": 5.4844255447387695, + "learning_rate": 4.442640198407435e-05, + "loss": 2.8944, + "step": 1323 + }, + { + "epoch": 0.885729815612326, + "grad_norm": 4.812926292419434, + "learning_rate": 4.4414138361596105e-05, + "loss": 2.8036, + "step": 1324 + }, + { + "epoch": 0.8863987958355981, + "grad_norm": 4.339323997497559, + "learning_rate": 4.4401862958680884e-05, + "loss": 2.8654, + "step": 1325 + }, + { + "epoch": 0.8870677760588702, + "grad_norm": 4.4866509437561035, + "learning_rate": 4.438957578277738e-05, + "loss": 3.0448, + "step": 1326 + }, + { + "epoch": 0.8877367562821424, + "grad_norm": 4.485202789306641, + "learning_rate": 4.4377276841341395e-05, + "loss": 2.9447, + "step": 1327 + }, + { + "epoch": 0.8884057365054145, + "grad_norm": 4.087451457977295, + "learning_rate": 4.4364966141835904e-05, + "loss": 3.0245, + "step": 1328 + }, + { + "epoch": 0.8890747167286867, + "grad_norm": 4.466521739959717, + "learning_rate": 4.435264369173099e-05, + "loss": 3.1356, + "step": 1329 + }, + { + "epoch": 0.8897436969519589, + "grad_norm": 4.541243076324463, + "learning_rate": 4.434030949850387e-05, + "loss": 3.0463, + "step": 1330 + }, + { + "epoch": 0.890412677175231, + "grad_norm": 4.771731376647949, + "learning_rate": 4.4327963569638905e-05, + "loss": 2.7375, + "step": 1331 + }, + { + "epoch": 0.8910816573985032, + "grad_norm": 5.35650110244751, + "learning_rate": 4.4315605912627565e-05, + "loss": 3.0898, + "step": 1332 + }, + { + "epoch": 0.8917506376217753, + "grad_norm": 4.51669454574585, + "learning_rate": 4.430323653496843e-05, + "loss": 3.1061, + "step": 1333 + }, + { + "epoch": 0.8924196178450474, + "grad_norm": 6.2187652587890625, + "learning_rate": 4.4290855444167194e-05, + "loss": 2.9603, + "step": 1334 + }, + { + "epoch": 0.8930885980683196, + "grad_norm": 5.403723239898682, + "learning_rate": 4.4278462647736675e-05, + "loss": 3.0003, + "step": 1335 + }, + { + "epoch": 0.8937575782915917, + "grad_norm": 3.582946538925171, + "learning_rate": 4.426605815319678e-05, + "loss": 2.9088, + "step": 1336 + }, + { + "epoch": 0.8944265585148639, + "grad_norm": 5.798472881317139, + "learning_rate": 4.425364196807451e-05, + "loss": 2.7459, + "step": 1337 + }, + { + "epoch": 0.8950955387381361, + "grad_norm": 5.068627834320068, + "learning_rate": 4.4241214099903976e-05, + "loss": 2.9603, + "step": 1338 + }, + { + "epoch": 0.8957645189614082, + "grad_norm": 6.003021240234375, + "learning_rate": 4.422877455622636e-05, + "loss": 3.0229, + "step": 1339 + }, + { + "epoch": 0.8964334991846804, + "grad_norm": 5.207427024841309, + "learning_rate": 4.421632334458994e-05, + "loss": 3.0078, + "step": 1340 + }, + { + "epoch": 0.8971024794079525, + "grad_norm": 9.058572769165039, + "learning_rate": 4.4203860472550075e-05, + "loss": 3.1514, + "step": 1341 + }, + { + "epoch": 0.8977714596312246, + "grad_norm": 5.014550685882568, + "learning_rate": 4.4191385947669187e-05, + "loss": 3.0033, + "step": 1342 + }, + { + "epoch": 0.8984404398544968, + "grad_norm": 9.838627815246582, + "learning_rate": 4.4178899777516786e-05, + "loss": 2.8175, + "step": 1343 + }, + { + "epoch": 0.899109420077769, + "grad_norm": 3.6878244876861572, + "learning_rate": 4.4166401969669434e-05, + "loss": 2.8379, + "step": 1344 + }, + { + "epoch": 0.8997784003010411, + "grad_norm": 5.310073375701904, + "learning_rate": 4.415389253171077e-05, + "loss": 3.0186, + "step": 1345 + }, + { + "epoch": 0.9004473805243133, + "grad_norm": 6.830930709838867, + "learning_rate": 4.414137147123148e-05, + "loss": 2.6658, + "step": 1346 + }, + { + "epoch": 0.9011163607475854, + "grad_norm": 7.637371063232422, + "learning_rate": 4.412883879582928e-05, + "loss": 3.1366, + "step": 1347 + }, + { + "epoch": 0.9017853409708575, + "grad_norm": 8.96474552154541, + "learning_rate": 4.4116294513108985e-05, + "loss": 2.9803, + "step": 1348 + }, + { + "epoch": 0.9024543211941297, + "grad_norm": 4.182546138763428, + "learning_rate": 4.4103738630682416e-05, + "loss": 2.9514, + "step": 1349 + }, + { + "epoch": 0.9031233014174018, + "grad_norm": 5.354439735412598, + "learning_rate": 4.409117115616844e-05, + "loss": 2.8105, + "step": 1350 + }, + { + "epoch": 0.903792281640674, + "grad_norm": 5.362891674041748, + "learning_rate": 4.407859209719297e-05, + "loss": 2.8012, + "step": 1351 + }, + { + "epoch": 0.9044612618639462, + "grad_norm": 3.78045654296875, + "learning_rate": 4.406600146138893e-05, + "loss": 2.5848, + "step": 1352 + }, + { + "epoch": 0.9051302420872183, + "grad_norm": 2.21057391166687, + "learning_rate": 4.4053399256396275e-05, + "loss": 2.5734, + "step": 1353 + }, + { + "epoch": 0.9057992223104905, + "grad_norm": 5.139932632446289, + "learning_rate": 4.404078548986199e-05, + "loss": 3.0914, + "step": 1354 + }, + { + "epoch": 0.9064682025337626, + "grad_norm": 3.5655221939086914, + "learning_rate": 4.402816016944006e-05, + "loss": 2.6834, + "step": 1355 + }, + { + "epoch": 0.9071371827570347, + "grad_norm": 5.4275994300842285, + "learning_rate": 4.401552330279149e-05, + "loss": 3.3536, + "step": 1356 + }, + { + "epoch": 0.9078061629803069, + "grad_norm": 8.335515022277832, + "learning_rate": 4.40028748975843e-05, + "loss": 3.2658, + "step": 1357 + }, + { + "epoch": 0.908475143203579, + "grad_norm": 5.343057632446289, + "learning_rate": 4.3990214961493495e-05, + "loss": 2.8544, + "step": 1358 + }, + { + "epoch": 0.9091441234268512, + "grad_norm": 5.262948036193848, + "learning_rate": 4.397754350220108e-05, + "loss": 3.0868, + "step": 1359 + }, + { + "epoch": 0.9098131036501234, + "grad_norm": 5.593395233154297, + "learning_rate": 4.3964860527396066e-05, + "loss": 2.9476, + "step": 1360 + }, + { + "epoch": 0.9104820838733955, + "grad_norm": 9.119357109069824, + "learning_rate": 4.3952166044774435e-05, + "loss": 2.8594, + "step": 1361 + }, + { + "epoch": 0.9111510640966677, + "grad_norm": 4.6348676681518555, + "learning_rate": 4.393946006203915e-05, + "loss": 2.9661, + "step": 1362 + }, + { + "epoch": 0.9118200443199398, + "grad_norm": 4.553847789764404, + "learning_rate": 4.392674258690018e-05, + "loss": 2.7935, + "step": 1363 + }, + { + "epoch": 0.9124890245432119, + "grad_norm": 3.9665307998657227, + "learning_rate": 4.391401362707444e-05, + "loss": 2.8593, + "step": 1364 + }, + { + "epoch": 0.9131580047664841, + "grad_norm": 4.720427513122559, + "learning_rate": 4.390127319028581e-05, + "loss": 2.8045, + "step": 1365 + }, + { + "epoch": 0.9138269849897562, + "grad_norm": 3.928102970123291, + "learning_rate": 4.388852128426516e-05, + "loss": 2.8448, + "step": 1366 + }, + { + "epoch": 0.9144959652130283, + "grad_norm": 4.242812156677246, + "learning_rate": 4.38757579167503e-05, + "loss": 2.9404, + "step": 1367 + }, + { + "epoch": 0.9151649454363006, + "grad_norm": 3.773695945739746, + "learning_rate": 4.3862983095486e-05, + "loss": 2.7635, + "step": 1368 + }, + { + "epoch": 0.9158339256595727, + "grad_norm": 3.5291969776153564, + "learning_rate": 4.385019682822399e-05, + "loss": 2.5655, + "step": 1369 + }, + { + "epoch": 0.9165029058828449, + "grad_norm": 4.14195442199707, + "learning_rate": 4.383739912272292e-05, + "loss": 2.739, + "step": 1370 + }, + { + "epoch": 0.917171886106117, + "grad_norm": 4.202181816101074, + "learning_rate": 4.382458998674841e-05, + "loss": 2.8227, + "step": 1371 + }, + { + "epoch": 0.9178408663293891, + "grad_norm": 5.822393417358398, + "learning_rate": 4.3811769428073004e-05, + "loss": 2.7845, + "step": 1372 + }, + { + "epoch": 0.9185098465526613, + "grad_norm": 4.378946304321289, + "learning_rate": 4.3798937454476164e-05, + "loss": 2.7979, + "step": 1373 + }, + { + "epoch": 0.9191788267759334, + "grad_norm": 5.244045734405518, + "learning_rate": 4.37860940737443e-05, + "loss": 2.944, + "step": 1374 + }, + { + "epoch": 0.9198478069992055, + "grad_norm": 7.615596771240234, + "learning_rate": 4.377323929367073e-05, + "loss": 3.1287, + "step": 1375 + }, + { + "epoch": 0.9205167872224778, + "grad_norm": 4.038976192474365, + "learning_rate": 4.37603731220557e-05, + "loss": 3.0772, + "step": 1376 + }, + { + "epoch": 0.9211857674457499, + "grad_norm": 4.48494291305542, + "learning_rate": 4.3747495566706344e-05, + "loss": 2.8203, + "step": 1377 + }, + { + "epoch": 0.9218547476690221, + "grad_norm": 6.21243953704834, + "learning_rate": 4.3734606635436734e-05, + "loss": 3.0804, + "step": 1378 + }, + { + "epoch": 0.9225237278922942, + "grad_norm": 5.390618324279785, + "learning_rate": 4.372170633606784e-05, + "loss": 3.0191, + "step": 1379 + }, + { + "epoch": 0.9231927081155663, + "grad_norm": 3.5611183643341064, + "learning_rate": 4.370879467642751e-05, + "loss": 2.7865, + "step": 1380 + }, + { + "epoch": 0.9238616883388385, + "grad_norm": 6.147199630737305, + "learning_rate": 4.369587166435051e-05, + "loss": 2.9694, + "step": 1381 + }, + { + "epoch": 0.9245306685621106, + "grad_norm": 6.140897274017334, + "learning_rate": 4.368293730767846e-05, + "loss": 2.8317, + "step": 1382 + }, + { + "epoch": 0.9251996487853827, + "grad_norm": 4.62473726272583, + "learning_rate": 4.366999161425991e-05, + "loss": 3.0022, + "step": 1383 + }, + { + "epoch": 0.925868629008655, + "grad_norm": 4.647932052612305, + "learning_rate": 4.3657034591950254e-05, + "loss": 3.1382, + "step": 1384 + }, + { + "epoch": 0.9265376092319271, + "grad_norm": 6.712825775146484, + "learning_rate": 4.364406624861177e-05, + "loss": 3.1722, + "step": 1385 + }, + { + "epoch": 0.9272065894551992, + "grad_norm": 4.585726261138916, + "learning_rate": 4.363108659211361e-05, + "loss": 3.1261, + "step": 1386 + }, + { + "epoch": 0.9278755696784714, + "grad_norm": 4.828583240509033, + "learning_rate": 4.361809563033179e-05, + "loss": 2.9589, + "step": 1387 + }, + { + "epoch": 0.9285445499017435, + "grad_norm": 4.226254940032959, + "learning_rate": 4.360509337114918e-05, + "loss": 3.0593, + "step": 1388 + }, + { + "epoch": 0.9292135301250157, + "grad_norm": 4.8540940284729, + "learning_rate": 4.359207982245551e-05, + "loss": 2.6497, + "step": 1389 + }, + { + "epoch": 0.9298825103482878, + "grad_norm": 6.509584903717041, + "learning_rate": 4.357905499214736e-05, + "loss": 2.9424, + "step": 1390 + }, + { + "epoch": 0.93055149057156, + "grad_norm": 5.68663215637207, + "learning_rate": 4.3566018888128165e-05, + "loss": 3.1698, + "step": 1391 + }, + { + "epoch": 0.9312204707948322, + "grad_norm": 3.5178868770599365, + "learning_rate": 4.355297151830818e-05, + "loss": 2.908, + "step": 1392 + }, + { + "epoch": 0.9318894510181043, + "grad_norm": 5.952272891998291, + "learning_rate": 4.3539912890604504e-05, + "loss": 3.0404, + "step": 1393 + }, + { + "epoch": 0.9325584312413764, + "grad_norm": 6.591036796569824, + "learning_rate": 4.352684301294108e-05, + "loss": 2.8514, + "step": 1394 + }, + { + "epoch": 0.9332274114646486, + "grad_norm": 5.238092422485352, + "learning_rate": 4.351376189324867e-05, + "loss": 2.7471, + "step": 1395 + }, + { + "epoch": 0.9338963916879207, + "grad_norm": 4.533151149749756, + "learning_rate": 4.3500669539464846e-05, + "loss": 3.0083, + "step": 1396 + }, + { + "epoch": 0.9345653719111928, + "grad_norm": 8.677496910095215, + "learning_rate": 4.3487565959534004e-05, + "loss": 3.0341, + "step": 1397 + }, + { + "epoch": 0.935234352134465, + "grad_norm": 5.255722522735596, + "learning_rate": 4.3474451161407364e-05, + "loss": 2.8045, + "step": 1398 + }, + { + "epoch": 0.9359033323577371, + "grad_norm": 5.049302577972412, + "learning_rate": 4.346132515304294e-05, + "loss": 2.7667, + "step": 1399 + }, + { + "epoch": 0.9365723125810094, + "grad_norm": 5.458214282989502, + "learning_rate": 4.344818794240556e-05, + "loss": 2.8785, + "step": 1400 + }, + { + "epoch": 0.9372412928042815, + "grad_norm": 5.965051174163818, + "learning_rate": 4.343503953746681e-05, + "loss": 2.9263, + "step": 1401 + }, + { + "epoch": 0.9379102730275536, + "grad_norm": 7.19428825378418, + "learning_rate": 4.3421879946205145e-05, + "loss": 2.8425, + "step": 1402 + }, + { + "epoch": 0.9385792532508258, + "grad_norm": 4.539826393127441, + "learning_rate": 4.3408709176605734e-05, + "loss": 2.8562, + "step": 1403 + }, + { + "epoch": 0.9392482334740979, + "grad_norm": 4.270603656768799, + "learning_rate": 4.339552723666057e-05, + "loss": 2.7289, + "step": 1404 + }, + { + "epoch": 0.93991721369737, + "grad_norm": 4.474053859710693, + "learning_rate": 4.338233413436839e-05, + "loss": 2.7669, + "step": 1405 + }, + { + "epoch": 0.9405861939206422, + "grad_norm": 10.911941528320312, + "learning_rate": 4.336912987773476e-05, + "loss": 2.9069, + "step": 1406 + }, + { + "epoch": 0.9412551741439144, + "grad_norm": 7.811734676361084, + "learning_rate": 4.335591447477196e-05, + "loss": 3.0396, + "step": 1407 + }, + { + "epoch": 0.9419241543671866, + "grad_norm": 6.42579460144043, + "learning_rate": 4.334268793349905e-05, + "loss": 2.8405, + "step": 1408 + }, + { + "epoch": 0.9425931345904587, + "grad_norm": 6.013075351715088, + "learning_rate": 4.332945026194187e-05, + "loss": 2.9972, + "step": 1409 + }, + { + "epoch": 0.9432621148137308, + "grad_norm": 5.426165580749512, + "learning_rate": 4.3316201468132985e-05, + "loss": 3.0449, + "step": 1410 + }, + { + "epoch": 0.943931095037003, + "grad_norm": 7.533892631530762, + "learning_rate": 4.330294156011172e-05, + "loss": 2.7979, + "step": 1411 + }, + { + "epoch": 0.9446000752602751, + "grad_norm": 7.255002021789551, + "learning_rate": 4.3289670545924144e-05, + "loss": 2.8471, + "step": 1412 + }, + { + "epoch": 0.9452690554835472, + "grad_norm": 4.2335686683654785, + "learning_rate": 4.327638843362307e-05, + "loss": 2.8513, + "step": 1413 + }, + { + "epoch": 0.9459380357068194, + "grad_norm": 3.1816556453704834, + "learning_rate": 4.3263095231268044e-05, + "loss": 2.7395, + "step": 1414 + }, + { + "epoch": 0.9466070159300916, + "grad_norm": 5.051225185394287, + "learning_rate": 4.324979094692534e-05, + "loss": 2.9584, + "step": 1415 + }, + { + "epoch": 0.9472759961533637, + "grad_norm": 5.697210788726807, + "learning_rate": 4.3236475588667946e-05, + "loss": 2.8683, + "step": 1416 + }, + { + "epoch": 0.9479449763766359, + "grad_norm": 5.087251663208008, + "learning_rate": 4.3223149164575585e-05, + "loss": 3.1022, + "step": 1417 + }, + { + "epoch": 0.948613956599908, + "grad_norm": 5.369622707366943, + "learning_rate": 4.320981168273468e-05, + "loss": 2.8525, + "step": 1418 + }, + { + "epoch": 0.9492829368231802, + "grad_norm": 4.636052131652832, + "learning_rate": 4.319646315123839e-05, + "loss": 2.954, + "step": 1419 + }, + { + "epoch": 0.9499519170464523, + "grad_norm": 4.25987434387207, + "learning_rate": 4.318310357818654e-05, + "loss": 2.8005, + "step": 1420 + }, + { + "epoch": 0.9506208972697244, + "grad_norm": 4.870820045471191, + "learning_rate": 4.3169732971685686e-05, + "loss": 3.0632, + "step": 1421 + }, + { + "epoch": 0.9512898774929966, + "grad_norm": 4.916179656982422, + "learning_rate": 4.315635133984908e-05, + "loss": 3.0651, + "step": 1422 + }, + { + "epoch": 0.9519588577162688, + "grad_norm": 2.8693549633026123, + "learning_rate": 4.3142958690796624e-05, + "loss": 2.6375, + "step": 1423 + }, + { + "epoch": 0.9526278379395409, + "grad_norm": 3.8217246532440186, + "learning_rate": 4.312955503265497e-05, + "loss": 2.5429, + "step": 1424 + }, + { + "epoch": 0.9532968181628131, + "grad_norm": 4.727277755737305, + "learning_rate": 4.311614037355739e-05, + "loss": 3.1547, + "step": 1425 + }, + { + "epoch": 0.9539657983860852, + "grad_norm": 3.9927327632904053, + "learning_rate": 4.310271472164387e-05, + "loss": 2.638, + "step": 1426 + }, + { + "epoch": 0.9546347786093574, + "grad_norm": 4.312863349914551, + "learning_rate": 4.3089278085061035e-05, + "loss": 2.7786, + "step": 1427 + }, + { + "epoch": 0.9553037588326295, + "grad_norm": 4.242437839508057, + "learning_rate": 4.307583047196221e-05, + "loss": 2.8335, + "step": 1428 + }, + { + "epoch": 0.9559727390559016, + "grad_norm": 6.428162097930908, + "learning_rate": 4.306237189050737e-05, + "loss": 2.7921, + "step": 1429 + }, + { + "epoch": 0.9566417192791739, + "grad_norm": 5.5870537757873535, + "learning_rate": 4.3048902348863116e-05, + "loss": 2.9045, + "step": 1430 + }, + { + "epoch": 0.957310699502446, + "grad_norm": 4.459833145141602, + "learning_rate": 4.303542185520273e-05, + "loss": 2.9307, + "step": 1431 + }, + { + "epoch": 0.9579796797257181, + "grad_norm": 7.269958972930908, + "learning_rate": 4.3021930417706144e-05, + "loss": 2.9311, + "step": 1432 + }, + { + "epoch": 0.9586486599489903, + "grad_norm": 4.830624580383301, + "learning_rate": 4.300842804455991e-05, + "loss": 2.7988, + "step": 1433 + }, + { + "epoch": 0.9593176401722624, + "grad_norm": 5.2541422843933105, + "learning_rate": 4.2994914743957226e-05, + "loss": 2.9206, + "step": 1434 + }, + { + "epoch": 0.9599866203955345, + "grad_norm": 5.538652420043945, + "learning_rate": 4.298139052409792e-05, + "loss": 2.9494, + "step": 1435 + }, + { + "epoch": 0.9606556006188067, + "grad_norm": 4.298689842224121, + "learning_rate": 4.296785539318845e-05, + "loss": 2.6522, + "step": 1436 + }, + { + "epoch": 0.9613245808420788, + "grad_norm": 5.333482265472412, + "learning_rate": 4.295430935944188e-05, + "loss": 2.8006, + "step": 1437 + }, + { + "epoch": 0.961993561065351, + "grad_norm": 6.548314094543457, + "learning_rate": 4.29407524310779e-05, + "loss": 3.0809, + "step": 1438 + }, + { + "epoch": 0.9626625412886232, + "grad_norm": 5.444504737854004, + "learning_rate": 4.2927184616322823e-05, + "loss": 2.9464, + "step": 1439 + }, + { + "epoch": 0.9633315215118953, + "grad_norm": 8.92197036743164, + "learning_rate": 4.291360592340955e-05, + "loss": 3.3383, + "step": 1440 + }, + { + "epoch": 0.9640005017351675, + "grad_norm": 4.618531703948975, + "learning_rate": 4.2900016360577585e-05, + "loss": 2.8019, + "step": 1441 + }, + { + "epoch": 0.9646694819584396, + "grad_norm": 6.276029109954834, + "learning_rate": 4.2886415936073035e-05, + "loss": 3.4108, + "step": 1442 + }, + { + "epoch": 0.9653384621817117, + "grad_norm": 3.802837371826172, + "learning_rate": 4.287280465814858e-05, + "loss": 2.8491, + "step": 1443 + }, + { + "epoch": 0.9660074424049839, + "grad_norm": 5.446820259094238, + "learning_rate": 4.2859182535063525e-05, + "loss": 3.0644, + "step": 1444 + }, + { + "epoch": 0.966676422628256, + "grad_norm": 5.9092888832092285, + "learning_rate": 4.284554957508371e-05, + "loss": 2.9815, + "step": 1445 + }, + { + "epoch": 0.9673454028515281, + "grad_norm": 5.796885013580322, + "learning_rate": 4.283190578648158e-05, + "loss": 3.1203, + "step": 1446 + }, + { + "epoch": 0.9680143830748004, + "grad_norm": 6.438416957855225, + "learning_rate": 4.2818251177536136e-05, + "loss": 2.8655, + "step": 1447 + }, + { + "epoch": 0.9686833632980725, + "grad_norm": 6.921572685241699, + "learning_rate": 4.2804585756532965e-05, + "loss": 3.0448, + "step": 1448 + }, + { + "epoch": 0.9693523435213447, + "grad_norm": 7.228209495544434, + "learning_rate": 4.2790909531764196e-05, + "loss": 3.05, + "step": 1449 + }, + { + "epoch": 0.9700213237446168, + "grad_norm": 7.1065449714660645, + "learning_rate": 4.2777222511528504e-05, + "loss": 2.5712, + "step": 1450 + }, + { + "epoch": 0.9706903039678889, + "grad_norm": 6.614255905151367, + "learning_rate": 4.276352470413114e-05, + "loss": 3.1933, + "step": 1451 + }, + { + "epoch": 0.9713592841911611, + "grad_norm": 4.059726715087891, + "learning_rate": 4.274981611788389e-05, + "loss": 2.9931, + "step": 1452 + }, + { + "epoch": 0.9720282644144332, + "grad_norm": 4.65960693359375, + "learning_rate": 4.273609676110508e-05, + "loss": 2.9133, + "step": 1453 + }, + { + "epoch": 0.9726972446377053, + "grad_norm": 5.1689324378967285, + "learning_rate": 4.272236664211957e-05, + "loss": 2.9943, + "step": 1454 + }, + { + "epoch": 0.9733662248609776, + "grad_norm": 8.48508358001709, + "learning_rate": 4.2708625769258756e-05, + "loss": 2.9463, + "step": 1455 + }, + { + "epoch": 0.9740352050842497, + "grad_norm": 4.781745433807373, + "learning_rate": 4.269487415086055e-05, + "loss": 2.61, + "step": 1456 + }, + { + "epoch": 0.9747041853075219, + "grad_norm": 6.530716896057129, + "learning_rate": 4.268111179526939e-05, + "loss": 3.0087, + "step": 1457 + }, + { + "epoch": 0.975373165530794, + "grad_norm": 4.822265625, + "learning_rate": 4.266733871083624e-05, + "loss": 2.8659, + "step": 1458 + }, + { + "epoch": 0.9760421457540661, + "grad_norm": 3.321605682373047, + "learning_rate": 4.2653554905918544e-05, + "loss": 2.8481, + "step": 1459 + }, + { + "epoch": 0.9767111259773383, + "grad_norm": 4.193107604980469, + "learning_rate": 4.263976038888029e-05, + "loss": 3.0597, + "step": 1460 + }, + { + "epoch": 0.9773801062006104, + "grad_norm": 6.2677483558654785, + "learning_rate": 4.262595516809194e-05, + "loss": 2.99, + "step": 1461 + }, + { + "epoch": 0.9780490864238826, + "grad_norm": 6.198704719543457, + "learning_rate": 4.261213925193045e-05, + "loss": 3.0541, + "step": 1462 + }, + { + "epoch": 0.9787180666471548, + "grad_norm": 4.100915431976318, + "learning_rate": 4.259831264877928e-05, + "loss": 3.0413, + "step": 1463 + }, + { + "epoch": 0.9793870468704269, + "grad_norm": 5.984002590179443, + "learning_rate": 4.258447536702838e-05, + "loss": 2.7577, + "step": 1464 + }, + { + "epoch": 0.980056027093699, + "grad_norm": 4.392581462860107, + "learning_rate": 4.2570627415074146e-05, + "loss": 2.8131, + "step": 1465 + }, + { + "epoch": 0.9807250073169712, + "grad_norm": 7.353703498840332, + "learning_rate": 4.2556768801319485e-05, + "loss": 3.0096, + "step": 1466 + }, + { + "epoch": 0.9813939875402433, + "grad_norm": 6.616916179656982, + "learning_rate": 4.254289953417376e-05, + "loss": 2.958, + "step": 1467 + }, + { + "epoch": 0.9820629677635155, + "grad_norm": 6.607530117034912, + "learning_rate": 4.252901962205279e-05, + "loss": 2.9169, + "step": 1468 + }, + { + "epoch": 0.9827319479867876, + "grad_norm": 3.915989398956299, + "learning_rate": 4.2515129073378866e-05, + "loss": 2.8512, + "step": 1469 + }, + { + "epoch": 0.9834009282100598, + "grad_norm": 5.1544189453125, + "learning_rate": 4.250122789658073e-05, + "loss": 3.093, + "step": 1470 + }, + { + "epoch": 0.984069908433332, + "grad_norm": 3.6646721363067627, + "learning_rate": 4.2487316100093564e-05, + "loss": 3.1035, + "step": 1471 + }, + { + "epoch": 0.9847388886566041, + "grad_norm": 5.358120441436768, + "learning_rate": 4.247339369235901e-05, + "loss": 2.9236, + "step": 1472 + }, + { + "epoch": 0.9854078688798762, + "grad_norm": 4.24708890914917, + "learning_rate": 4.2459460681825134e-05, + "loss": 3.06, + "step": 1473 + }, + { + "epoch": 0.9860768491031484, + "grad_norm": 4.978499889373779, + "learning_rate": 4.244551707694645e-05, + "loss": 2.8849, + "step": 1474 + }, + { + "epoch": 0.9867458293264205, + "grad_norm": 5.576192855834961, + "learning_rate": 4.2431562886183886e-05, + "loss": 2.8256, + "step": 1475 + }, + { + "epoch": 0.9874148095496927, + "grad_norm": 8.536967277526855, + "learning_rate": 4.24175981180048e-05, + "loss": 3.2155, + "step": 1476 + }, + { + "epoch": 0.9880837897729648, + "grad_norm": 6.740468978881836, + "learning_rate": 4.2403622780882976e-05, + "loss": 2.942, + "step": 1477 + }, + { + "epoch": 0.988752769996237, + "grad_norm": 7.4838032722473145, + "learning_rate": 4.23896368832986e-05, + "loss": 2.9484, + "step": 1478 + }, + { + "epoch": 0.9894217502195092, + "grad_norm": 4.943514823913574, + "learning_rate": 4.237564043373827e-05, + "loss": 3.074, + "step": 1479 + }, + { + "epoch": 0.9900907304427813, + "grad_norm": 3.450420618057251, + "learning_rate": 4.236163344069498e-05, + "loss": 2.8443, + "step": 1480 + }, + { + "epoch": 0.9907597106660534, + "grad_norm": 5.6595659255981445, + "learning_rate": 4.2347615912668136e-05, + "loss": 2.9945, + "step": 1481 + }, + { + "epoch": 0.9914286908893256, + "grad_norm": 4.968037128448486, + "learning_rate": 4.2333587858163524e-05, + "loss": 3.077, + "step": 1482 + }, + { + "epoch": 0.9920976711125977, + "grad_norm": 6.695304870605469, + "learning_rate": 4.2319549285693325e-05, + "loss": 3.1005, + "step": 1483 + }, + { + "epoch": 0.9927666513358698, + "grad_norm": 5.530367374420166, + "learning_rate": 4.230550020377611e-05, + "loss": 3.1696, + "step": 1484 + }, + { + "epoch": 0.993435631559142, + "grad_norm": 6.297682762145996, + "learning_rate": 4.2291440620936796e-05, + "loss": 2.9688, + "step": 1485 + }, + { + "epoch": 0.9941046117824142, + "grad_norm": 3.887913465499878, + "learning_rate": 4.227737054570671e-05, + "loss": 2.768, + "step": 1486 + }, + { + "epoch": 0.9947735920056864, + "grad_norm": 5.558178901672363, + "learning_rate": 4.2263289986623525e-05, + "loss": 2.8784, + "step": 1487 + }, + { + "epoch": 0.9954425722289585, + "grad_norm": 4.500943183898926, + "learning_rate": 4.224919895223127e-05, + "loss": 3.1376, + "step": 1488 + }, + { + "epoch": 0.9961115524522306, + "grad_norm": 4.17768669128418, + "learning_rate": 4.223509745108035e-05, + "loss": 2.5942, + "step": 1489 + }, + { + "epoch": 0.9967805326755028, + "grad_norm": 4.17385196685791, + "learning_rate": 4.222098549172751e-05, + "loss": 3.0907, + "step": 1490 + }, + { + "epoch": 0.9974495128987749, + "grad_norm": 4.986076354980469, + "learning_rate": 4.2206863082735837e-05, + "loss": 2.9474, + "step": 1491 + }, + { + "epoch": 0.998118493122047, + "grad_norm": 5.448312759399414, + "learning_rate": 4.219273023267476e-05, + "loss": 3.0986, + "step": 1492 + }, + { + "epoch": 0.9987874733453193, + "grad_norm": 5.713490962982178, + "learning_rate": 4.217858695012007e-05, + "loss": 2.8431, + "step": 1493 + }, + { + "epoch": 0.9994564535685914, + "grad_norm": 4.883641719818115, + "learning_rate": 4.216443324365383e-05, + "loss": 2.9106, + "step": 1494 + }, + { + "epoch": 1.0006689802232722, + "grad_norm": 11.87452220916748, + "learning_rate": 4.215026912186449e-05, + "loss": 5.5449, + "step": 1495 + }, + { + "epoch": 1.0013379604465442, + "grad_norm": 6.693982124328613, + "learning_rate": 4.213609459334678e-05, + "loss": 3.0085, + "step": 1496 + }, + { + "epoch": 1.0020069406698164, + "grad_norm": 4.143570899963379, + "learning_rate": 4.2121909666701766e-05, + "loss": 2.6887, + "step": 1497 + }, + { + "epoch": 1.0026759208930887, + "grad_norm": 3.687885046005249, + "learning_rate": 4.210771435053682e-05, + "loss": 2.6233, + "step": 1498 + }, + { + "epoch": 1.0033449011163607, + "grad_norm": 3.9757401943206787, + "learning_rate": 4.2093508653465605e-05, + "loss": 2.6142, + "step": 1499 + }, + { + "epoch": 1.0040138813396329, + "grad_norm": 4.822112560272217, + "learning_rate": 4.207929258410809e-05, + "loss": 2.794, + "step": 1500 + }, + { + "epoch": 1.004682861562905, + "grad_norm": 3.2196388244628906, + "learning_rate": 4.206506615109055e-05, + "loss": 2.6118, + "step": 1501 + }, + { + "epoch": 1.005351841786177, + "grad_norm": 6.094923973083496, + "learning_rate": 4.205082936304554e-05, + "loss": 2.6836, + "step": 1502 + }, + { + "epoch": 1.0060208220094493, + "grad_norm": 3.8810741901397705, + "learning_rate": 4.203658222861189e-05, + "loss": 2.7444, + "step": 1503 + }, + { + "epoch": 1.0066898022327215, + "grad_norm": 4.88921594619751, + "learning_rate": 4.2022324756434715e-05, + "loss": 2.6127, + "step": 1504 + }, + { + "epoch": 1.0073587824559938, + "grad_norm": 5.336818695068359, + "learning_rate": 4.200805695516541e-05, + "loss": 2.7797, + "step": 1505 + }, + { + "epoch": 1.0080277626792657, + "grad_norm": 4.798128604888916, + "learning_rate": 4.199377883346163e-05, + "loss": 3.0443, + "step": 1506 + }, + { + "epoch": 1.008696742902538, + "grad_norm": 4.512149810791016, + "learning_rate": 4.197949039998729e-05, + "loss": 2.9289, + "step": 1507 + }, + { + "epoch": 1.0093657231258102, + "grad_norm": 4.345590114593506, + "learning_rate": 4.196519166341256e-05, + "loss": 2.6111, + "step": 1508 + }, + { + "epoch": 1.0100347033490822, + "grad_norm": 4.6364030838012695, + "learning_rate": 4.1950882632413876e-05, + "loss": 3.0167, + "step": 1509 + }, + { + "epoch": 1.0107036835723544, + "grad_norm": 3.8757243156433105, + "learning_rate": 4.193656331567392e-05, + "loss": 2.73, + "step": 1510 + }, + { + "epoch": 1.0113726637956266, + "grad_norm": 4.849210739135742, + "learning_rate": 4.192223372188159e-05, + "loss": 2.7352, + "step": 1511 + }, + { + "epoch": 1.0120416440188986, + "grad_norm": 6.5901312828063965, + "learning_rate": 4.190789385973205e-05, + "loss": 2.7431, + "step": 1512 + }, + { + "epoch": 1.0127106242421708, + "grad_norm": 4.913456439971924, + "learning_rate": 4.189354373792668e-05, + "loss": 2.7734, + "step": 1513 + }, + { + "epoch": 1.013379604465443, + "grad_norm": 6.975290775299072, + "learning_rate": 4.187918336517308e-05, + "loss": 2.8391, + "step": 1514 + }, + { + "epoch": 1.014048584688715, + "grad_norm": 6.176608085632324, + "learning_rate": 4.186481275018509e-05, + "loss": 2.7831, + "step": 1515 + }, + { + "epoch": 1.0147175649119873, + "grad_norm": 6.237487316131592, + "learning_rate": 4.185043190168274e-05, + "loss": 2.8756, + "step": 1516 + }, + { + "epoch": 1.0153865451352595, + "grad_norm": 4.657079696655273, + "learning_rate": 4.18360408283923e-05, + "loss": 2.8786, + "step": 1517 + }, + { + "epoch": 1.0160555253585315, + "grad_norm": 5.093790054321289, + "learning_rate": 4.182163953904621e-05, + "loss": 2.7005, + "step": 1518 + }, + { + "epoch": 1.0167245055818037, + "grad_norm": 5.690388202667236, + "learning_rate": 4.180722804238314e-05, + "loss": 2.9104, + "step": 1519 + }, + { + "epoch": 1.017393485805076, + "grad_norm": 5.185051441192627, + "learning_rate": 4.179280634714793e-05, + "loss": 2.6394, + "step": 1520 + }, + { + "epoch": 1.018062466028348, + "grad_norm": 6.092372417449951, + "learning_rate": 4.1778374462091616e-05, + "loss": 2.6966, + "step": 1521 + }, + { + "epoch": 1.0187314462516202, + "grad_norm": 3.6631357669830322, + "learning_rate": 4.176393239597144e-05, + "loss": 2.7309, + "step": 1522 + }, + { + "epoch": 1.0194004264748924, + "grad_norm": 4.976971626281738, + "learning_rate": 4.1749480157550774e-05, + "loss": 2.9311, + "step": 1523 + }, + { + "epoch": 1.0200694066981646, + "grad_norm": 5.371352195739746, + "learning_rate": 4.173501775559921e-05, + "loss": 2.9003, + "step": 1524 + }, + { + "epoch": 1.0207383869214366, + "grad_norm": 4.864296913146973, + "learning_rate": 4.172054519889248e-05, + "loss": 2.8299, + "step": 1525 + }, + { + "epoch": 1.0214073671447088, + "grad_norm": 4.238711357116699, + "learning_rate": 4.1706062496212487e-05, + "loss": 2.9332, + "step": 1526 + }, + { + "epoch": 1.022076347367981, + "grad_norm": 5.82922887802124, + "learning_rate": 4.169156965634728e-05, + "loss": 2.7335, + "step": 1527 + }, + { + "epoch": 1.022745327591253, + "grad_norm": 5.272682189941406, + "learning_rate": 4.1677066688091085e-05, + "loss": 2.7228, + "step": 1528 + }, + { + "epoch": 1.0234143078145252, + "grad_norm": 4.921049118041992, + "learning_rate": 4.1662553600244234e-05, + "loss": 2.9095, + "step": 1529 + }, + { + "epoch": 1.0240832880377975, + "grad_norm": 4.771050453186035, + "learning_rate": 4.1648030401613246e-05, + "loss": 2.7689, + "step": 1530 + }, + { + "epoch": 1.0247522682610695, + "grad_norm": 3.7184505462646484, + "learning_rate": 4.163349710101073e-05, + "loss": 2.8432, + "step": 1531 + }, + { + "epoch": 1.0254212484843417, + "grad_norm": 5.250668048858643, + "learning_rate": 4.161895370725547e-05, + "loss": 2.8125, + "step": 1532 + }, + { + "epoch": 1.026090228707614, + "grad_norm": 5.156813144683838, + "learning_rate": 4.1604400229172324e-05, + "loss": 2.5678, + "step": 1533 + }, + { + "epoch": 1.026759208930886, + "grad_norm": 4.721811294555664, + "learning_rate": 4.158983667559232e-05, + "loss": 3.0089, + "step": 1534 + }, + { + "epoch": 1.0274281891541581, + "grad_norm": 3.9132628440856934, + "learning_rate": 4.157526305535256e-05, + "loss": 2.654, + "step": 1535 + }, + { + "epoch": 1.0280971693774303, + "grad_norm": 4.286519527435303, + "learning_rate": 4.156067937729628e-05, + "loss": 2.9266, + "step": 1536 + }, + { + "epoch": 1.0287661496007023, + "grad_norm": 6.115054607391357, + "learning_rate": 4.1546085650272795e-05, + "loss": 2.7046, + "step": 1537 + }, + { + "epoch": 1.0294351298239746, + "grad_norm": 3.2762906551361084, + "learning_rate": 4.153148188313753e-05, + "loss": 2.4544, + "step": 1538 + }, + { + "epoch": 1.0301041100472468, + "grad_norm": 5.929058074951172, + "learning_rate": 4.151686808475204e-05, + "loss": 2.4098, + "step": 1539 + }, + { + "epoch": 1.0307730902705188, + "grad_norm": 4.244700908660889, + "learning_rate": 4.1502244263983894e-05, + "loss": 2.6728, + "step": 1540 + }, + { + "epoch": 1.031442070493791, + "grad_norm": 5.544959545135498, + "learning_rate": 4.148761042970679e-05, + "loss": 2.7866, + "step": 1541 + }, + { + "epoch": 1.0321110507170632, + "grad_norm": 8.907869338989258, + "learning_rate": 4.14729665908005e-05, + "loss": 3.0113, + "step": 1542 + }, + { + "epoch": 1.0327800309403354, + "grad_norm": 4.410710334777832, + "learning_rate": 4.145831275615084e-05, + "loss": 2.7286, + "step": 1543 + }, + { + "epoch": 1.0334490111636074, + "grad_norm": 4.794613361358643, + "learning_rate": 4.144364893464974e-05, + "loss": 2.7656, + "step": 1544 + }, + { + "epoch": 1.0341179913868797, + "grad_norm": 4.0600361824035645, + "learning_rate": 4.142897513519512e-05, + "loss": 2.5502, + "step": 1545 + }, + { + "epoch": 1.0347869716101519, + "grad_norm": 4.491250514984131, + "learning_rate": 4.141429136669103e-05, + "loss": 2.4829, + "step": 1546 + }, + { + "epoch": 1.0354559518334239, + "grad_norm": 5.653236389160156, + "learning_rate": 4.139959763804752e-05, + "loss": 2.6999, + "step": 1547 + }, + { + "epoch": 1.036124932056696, + "grad_norm": 4.625912666320801, + "learning_rate": 4.138489395818069e-05, + "loss": 2.853, + "step": 1548 + }, + { + "epoch": 1.0367939122799683, + "grad_norm": 4.4478254318237305, + "learning_rate": 4.1370180336012696e-05, + "loss": 2.7662, + "step": 1549 + }, + { + "epoch": 1.0374628925032403, + "grad_norm": 6.205041408538818, + "learning_rate": 4.1355456780471716e-05, + "loss": 3.1062, + "step": 1550 + }, + { + "epoch": 1.0381318727265125, + "grad_norm": 4.5253987312316895, + "learning_rate": 4.134072330049195e-05, + "loss": 2.6946, + "step": 1551 + }, + { + "epoch": 1.0388008529497847, + "grad_norm": 5.051163196563721, + "learning_rate": 4.132597990501363e-05, + "loss": 2.7989, + "step": 1552 + }, + { + "epoch": 1.0394698331730567, + "grad_norm": 4.920440673828125, + "learning_rate": 4.131122660298301e-05, + "loss": 3.0435, + "step": 1553 + }, + { + "epoch": 1.040138813396329, + "grad_norm": 3.8905622959136963, + "learning_rate": 4.129646340335234e-05, + "loss": 2.5991, + "step": 1554 + }, + { + "epoch": 1.0408077936196012, + "grad_norm": 4.620569705963135, + "learning_rate": 4.128169031507987e-05, + "loss": 2.7107, + "step": 1555 + }, + { + "epoch": 1.0414767738428732, + "grad_norm": 4.2700581550598145, + "learning_rate": 4.126690734712988e-05, + "loss": 2.9075, + "step": 1556 + }, + { + "epoch": 1.0421457540661454, + "grad_norm": 4.611904621124268, + "learning_rate": 4.1252114508472614e-05, + "loss": 2.6566, + "step": 1557 + }, + { + "epoch": 1.0428147342894176, + "grad_norm": 4.809654235839844, + "learning_rate": 4.1237311808084335e-05, + "loss": 2.7724, + "step": 1558 + }, + { + "epoch": 1.0434837145126896, + "grad_norm": 4.333112716674805, + "learning_rate": 4.122249925494726e-05, + "loss": 2.7317, + "step": 1559 + }, + { + "epoch": 1.0441526947359618, + "grad_norm": 4.872314453125, + "learning_rate": 4.12076768580496e-05, + "loss": 2.8534, + "step": 1560 + }, + { + "epoch": 1.044821674959234, + "grad_norm": 4.6519036293029785, + "learning_rate": 4.119284462638555e-05, + "loss": 2.7633, + "step": 1561 + }, + { + "epoch": 1.045490655182506, + "grad_norm": 4.549657821655273, + "learning_rate": 4.1178002568955246e-05, + "loss": 2.516, + "step": 1562 + }, + { + "epoch": 1.0461596354057783, + "grad_norm": 5.804410457611084, + "learning_rate": 4.116315069476481e-05, + "loss": 2.8158, + "step": 1563 + }, + { + "epoch": 1.0468286156290505, + "grad_norm": 5.040773868560791, + "learning_rate": 4.114828901282631e-05, + "loss": 2.7724, + "step": 1564 + }, + { + "epoch": 1.0474975958523227, + "grad_norm": 5.569967746734619, + "learning_rate": 4.113341753215777e-05, + "loss": 2.8755, + "step": 1565 + }, + { + "epoch": 1.0481665760755947, + "grad_norm": 4.0738444328308105, + "learning_rate": 4.111853626178315e-05, + "loss": 2.7354, + "step": 1566 + }, + { + "epoch": 1.048835556298867, + "grad_norm": 4.854263782501221, + "learning_rate": 4.110364521073236e-05, + "loss": 2.7054, + "step": 1567 + }, + { + "epoch": 1.0495045365221392, + "grad_norm": 7.207700252532959, + "learning_rate": 4.1088744388041235e-05, + "loss": 3.0215, + "step": 1568 + }, + { + "epoch": 1.0501735167454112, + "grad_norm": 6.67018985748291, + "learning_rate": 4.107383380275156e-05, + "loss": 2.6792, + "step": 1569 + }, + { + "epoch": 1.0508424969686834, + "grad_norm": 7.6153130531311035, + "learning_rate": 4.105891346391102e-05, + "loss": 2.8476, + "step": 1570 + }, + { + "epoch": 1.0515114771919556, + "grad_norm": 5.213456153869629, + "learning_rate": 4.1043983380573234e-05, + "loss": 2.7051, + "step": 1571 + }, + { + "epoch": 1.0521804574152276, + "grad_norm": 3.381913423538208, + "learning_rate": 4.1029043561797734e-05, + "loss": 2.6172, + "step": 1572 + }, + { + "epoch": 1.0528494376384998, + "grad_norm": 7.620858669281006, + "learning_rate": 4.101409401664994e-05, + "loss": 2.7223, + "step": 1573 + }, + { + "epoch": 1.053518417861772, + "grad_norm": 4.394527435302734, + "learning_rate": 4.09991347542012e-05, + "loss": 3.0849, + "step": 1574 + }, + { + "epoch": 1.054187398085044, + "grad_norm": 4.2898335456848145, + "learning_rate": 4.0984165783528736e-05, + "loss": 2.5078, + "step": 1575 + }, + { + "epoch": 1.0548563783083162, + "grad_norm": 4.043838977813721, + "learning_rate": 4.096918711371569e-05, + "loss": 2.6517, + "step": 1576 + }, + { + "epoch": 1.0555253585315885, + "grad_norm": 4.759499549865723, + "learning_rate": 4.0954198753851045e-05, + "loss": 2.4175, + "step": 1577 + }, + { + "epoch": 1.0561943387548605, + "grad_norm": 4.681356430053711, + "learning_rate": 4.0939200713029715e-05, + "loss": 2.7955, + "step": 1578 + }, + { + "epoch": 1.0568633189781327, + "grad_norm": 4.3460373878479, + "learning_rate": 4.0924193000352445e-05, + "loss": 2.5704, + "step": 1579 + }, + { + "epoch": 1.057532299201405, + "grad_norm": 7.12529993057251, + "learning_rate": 4.0909175624925875e-05, + "loss": 2.7936, + "step": 1580 + }, + { + "epoch": 1.0582012794246771, + "grad_norm": 9.37367057800293, + "learning_rate": 4.08941485958625e-05, + "loss": 2.9732, + "step": 1581 + }, + { + "epoch": 1.0588702596479491, + "grad_norm": 5.256577491760254, + "learning_rate": 4.087911192228067e-05, + "loss": 2.5803, + "step": 1582 + }, + { + "epoch": 1.0595392398712213, + "grad_norm": 4.99888801574707, + "learning_rate": 4.086406561330459e-05, + "loss": 2.9305, + "step": 1583 + }, + { + "epoch": 1.0602082200944936, + "grad_norm": 5.101346969604492, + "learning_rate": 4.084900967806432e-05, + "loss": 2.5905, + "step": 1584 + }, + { + "epoch": 1.0608772003177656, + "grad_norm": 6.627160549163818, + "learning_rate": 4.083394412569574e-05, + "loss": 2.8194, + "step": 1585 + }, + { + "epoch": 1.0615461805410378, + "grad_norm": 6.262012481689453, + "learning_rate": 4.08188689653406e-05, + "loss": 2.8002, + "step": 1586 + }, + { + "epoch": 1.06221516076431, + "grad_norm": 4.888954162597656, + "learning_rate": 4.0803784206146434e-05, + "loss": 2.9786, + "step": 1587 + }, + { + "epoch": 1.062884140987582, + "grad_norm": 6.124375343322754, + "learning_rate": 4.078868985726665e-05, + "loss": 2.8576, + "step": 1588 + }, + { + "epoch": 1.0635531212108542, + "grad_norm": 5.127064228057861, + "learning_rate": 4.077358592786043e-05, + "loss": 2.9365, + "step": 1589 + }, + { + "epoch": 1.0642221014341264, + "grad_norm": 4.611770153045654, + "learning_rate": 4.0758472427092785e-05, + "loss": 2.7742, + "step": 1590 + }, + { + "epoch": 1.0648910816573984, + "grad_norm": 4.597067356109619, + "learning_rate": 4.0743349364134566e-05, + "loss": 2.6967, + "step": 1591 + }, + { + "epoch": 1.0655600618806707, + "grad_norm": 5.276156425476074, + "learning_rate": 4.072821674816239e-05, + "loss": 2.872, + "step": 1592 + }, + { + "epoch": 1.0662290421039429, + "grad_norm": 3.8604416847229004, + "learning_rate": 4.071307458835866e-05, + "loss": 2.7134, + "step": 1593 + }, + { + "epoch": 1.0668980223272149, + "grad_norm": 4.565550327301025, + "learning_rate": 4.069792289391161e-05, + "loss": 2.753, + "step": 1594 + }, + { + "epoch": 1.067567002550487, + "grad_norm": 5.758005142211914, + "learning_rate": 4.0682761674015236e-05, + "loss": 2.8769, + "step": 1595 + }, + { + "epoch": 1.0682359827737593, + "grad_norm": 6.4277729988098145, + "learning_rate": 4.066759093786931e-05, + "loss": 2.7766, + "step": 1596 + }, + { + "epoch": 1.0689049629970313, + "grad_norm": 6.652651309967041, + "learning_rate": 4.06524106946794e-05, + "loss": 2.823, + "step": 1597 + }, + { + "epoch": 1.0695739432203035, + "grad_norm": 7.791245460510254, + "learning_rate": 4.063722095365682e-05, + "loss": 3.1185, + "step": 1598 + }, + { + "epoch": 1.0702429234435757, + "grad_norm": 4.381209850311279, + "learning_rate": 4.062202172401865e-05, + "loss": 2.646, + "step": 1599 + }, + { + "epoch": 1.0709119036668477, + "grad_norm": 4.1545634269714355, + "learning_rate": 4.060681301498775e-05, + "loss": 2.6542, + "step": 1600 + }, + { + "epoch": 1.07158088389012, + "grad_norm": 5.229256629943848, + "learning_rate": 4.05915948357927e-05, + "loss": 2.8453, + "step": 1601 + }, + { + "epoch": 1.0722498641133922, + "grad_norm": 5.327920436859131, + "learning_rate": 4.057636719566785e-05, + "loss": 2.8317, + "step": 1602 + }, + { + "epoch": 1.0729188443366644, + "grad_norm": 5.403182029724121, + "learning_rate": 4.056113010385329e-05, + "loss": 2.8012, + "step": 1603 + }, + { + "epoch": 1.0735878245599364, + "grad_norm": 4.147651195526123, + "learning_rate": 4.054588356959482e-05, + "loss": 2.7209, + "step": 1604 + }, + { + "epoch": 1.0742568047832086, + "grad_norm": 4.1708784103393555, + "learning_rate": 4.0530627602144015e-05, + "loss": 2.6841, + "step": 1605 + }, + { + "epoch": 1.0749257850064808, + "grad_norm": 5.463420391082764, + "learning_rate": 4.0515362210758126e-05, + "loss": 2.8198, + "step": 1606 + }, + { + "epoch": 1.0755947652297528, + "grad_norm": 6.347439765930176, + "learning_rate": 4.050008740470014e-05, + "loss": 2.6981, + "step": 1607 + }, + { + "epoch": 1.076263745453025, + "grad_norm": 5.56416130065918, + "learning_rate": 4.0484803193238773e-05, + "loss": 2.597, + "step": 1608 + }, + { + "epoch": 1.0769327256762973, + "grad_norm": 4.583149433135986, + "learning_rate": 4.046950958564843e-05, + "loss": 2.6726, + "step": 1609 + }, + { + "epoch": 1.0776017058995693, + "grad_norm": 4.929792881011963, + "learning_rate": 4.045420659120923e-05, + "loss": 2.9779, + "step": 1610 + }, + { + "epoch": 1.0782706861228415, + "grad_norm": 5.686063289642334, + "learning_rate": 4.043889421920698e-05, + "loss": 3.0255, + "step": 1611 + }, + { + "epoch": 1.0789396663461137, + "grad_norm": 6.810145854949951, + "learning_rate": 4.042357247893317e-05, + "loss": 2.8852, + "step": 1612 + }, + { + "epoch": 1.0796086465693857, + "grad_norm": 5.9859724044799805, + "learning_rate": 4.040824137968499e-05, + "loss": 3.1111, + "step": 1613 + }, + { + "epoch": 1.080277626792658, + "grad_norm": 5.715677738189697, + "learning_rate": 4.039290093076529e-05, + "loss": 2.7821, + "step": 1614 + }, + { + "epoch": 1.0809466070159301, + "grad_norm": 3.387718439102173, + "learning_rate": 4.0377551141482614e-05, + "loss": 2.7442, + "step": 1615 + }, + { + "epoch": 1.0816155872392021, + "grad_norm": 5.054075241088867, + "learning_rate": 4.0362192021151174e-05, + "loss": 3.0196, + "step": 1616 + }, + { + "epoch": 1.0822845674624744, + "grad_norm": 3.9575531482696533, + "learning_rate": 4.0346823579090826e-05, + "loss": 2.5998, + "step": 1617 + }, + { + "epoch": 1.0829535476857466, + "grad_norm": 3.741792917251587, + "learning_rate": 4.033144582462709e-05, + "loss": 2.8112, + "step": 1618 + }, + { + "epoch": 1.0836225279090188, + "grad_norm": 4.696652412414551, + "learning_rate": 4.031605876709113e-05, + "loss": 2.9065, + "step": 1619 + }, + { + "epoch": 1.0842915081322908, + "grad_norm": 6.763078689575195, + "learning_rate": 4.030066241581979e-05, + "loss": 2.8761, + "step": 1620 + }, + { + "epoch": 1.084960488355563, + "grad_norm": 5.179434776306152, + "learning_rate": 4.02852567801555e-05, + "loss": 2.9401, + "step": 1621 + }, + { + "epoch": 1.0856294685788352, + "grad_norm": 4.58511209487915, + "learning_rate": 4.0269841869446365e-05, + "loss": 2.6891, + "step": 1622 + }, + { + "epoch": 1.0862984488021072, + "grad_norm": 3.7305219173431396, + "learning_rate": 4.0254417693046096e-05, + "loss": 2.4721, + "step": 1623 + }, + { + "epoch": 1.0869674290253795, + "grad_norm": 4.831632137298584, + "learning_rate": 4.0238984260314036e-05, + "loss": 2.7192, + "step": 1624 + }, + { + "epoch": 1.0876364092486517, + "grad_norm": 5.245891094207764, + "learning_rate": 4.022354158061515e-05, + "loss": 2.7525, + "step": 1625 + }, + { + "epoch": 1.0883053894719237, + "grad_norm": 4.761187553405762, + "learning_rate": 4.0208089663319994e-05, + "loss": 2.9389, + "step": 1626 + }, + { + "epoch": 1.088974369695196, + "grad_norm": 5.587457180023193, + "learning_rate": 4.019262851780474e-05, + "loss": 2.7128, + "step": 1627 + }, + { + "epoch": 1.0896433499184681, + "grad_norm": 6.042799949645996, + "learning_rate": 4.0177158153451176e-05, + "loss": 2.7198, + "step": 1628 + }, + { + "epoch": 1.0903123301417401, + "grad_norm": 5.319640636444092, + "learning_rate": 4.016167857964667e-05, + "loss": 2.5434, + "step": 1629 + }, + { + "epoch": 1.0909813103650123, + "grad_norm": 4.484240531921387, + "learning_rate": 4.014618980578416e-05, + "loss": 2.6542, + "step": 1630 + }, + { + "epoch": 1.0916502905882846, + "grad_norm": 5.133976936340332, + "learning_rate": 4.0130691841262194e-05, + "loss": 2.7771, + "step": 1631 + }, + { + "epoch": 1.0923192708115566, + "grad_norm": 3.1250579357147217, + "learning_rate": 4.01151846954849e-05, + "loss": 2.606, + "step": 1632 + }, + { + "epoch": 1.0929882510348288, + "grad_norm": 5.172818183898926, + "learning_rate": 4.0099668377861944e-05, + "loss": 3.0745, + "step": 1633 + }, + { + "epoch": 1.093657231258101, + "grad_norm": 4.249288082122803, + "learning_rate": 4.008414289780859e-05, + "loss": 2.6543, + "step": 1634 + }, + { + "epoch": 1.094326211481373, + "grad_norm": 8.297806739807129, + "learning_rate": 4.0068608264745636e-05, + "loss": 2.8929, + "step": 1635 + }, + { + "epoch": 1.0949951917046452, + "grad_norm": 3.5871880054473877, + "learning_rate": 4.005306448809946e-05, + "loss": 2.5515, + "step": 1636 + }, + { + "epoch": 1.0956641719279174, + "grad_norm": 5.7941999435424805, + "learning_rate": 4.003751157730198e-05, + "loss": 2.5705, + "step": 1637 + }, + { + "epoch": 1.0963331521511894, + "grad_norm": 3.356236219406128, + "learning_rate": 4.002194954179064e-05, + "loss": 2.4806, + "step": 1638 + }, + { + "epoch": 1.0970021323744616, + "grad_norm": 4.91937780380249, + "learning_rate": 4.000637839100845e-05, + "loss": 2.842, + "step": 1639 + }, + { + "epoch": 1.0976711125977339, + "grad_norm": 5.350463390350342, + "learning_rate": 3.9990798134403906e-05, + "loss": 2.8923, + "step": 1640 + }, + { + "epoch": 1.0983400928210059, + "grad_norm": 4.7535319328308105, + "learning_rate": 3.997520878143109e-05, + "loss": 2.9656, + "step": 1641 + }, + { + "epoch": 1.099009073044278, + "grad_norm": 6.683962345123291, + "learning_rate": 3.9959610341549546e-05, + "loss": 3.0393, + "step": 1642 + }, + { + "epoch": 1.0996780532675503, + "grad_norm": 4.360228061676025, + "learning_rate": 3.994400282422438e-05, + "loss": 2.7038, + "step": 1643 + }, + { + "epoch": 1.1003470334908225, + "grad_norm": 3.990546941757202, + "learning_rate": 3.9928386238926165e-05, + "loss": 2.8572, + "step": 1644 + }, + { + "epoch": 1.1010160137140945, + "grad_norm": 3.997955560684204, + "learning_rate": 3.991276059513099e-05, + "loss": 2.7878, + "step": 1645 + }, + { + "epoch": 1.1016849939373667, + "grad_norm": 7.930902004241943, + "learning_rate": 3.989712590232048e-05, + "loss": 3.067, + "step": 1646 + }, + { + "epoch": 1.102353974160639, + "grad_norm": 5.619079113006592, + "learning_rate": 3.9881482169981676e-05, + "loss": 2.6665, + "step": 1647 + }, + { + "epoch": 1.103022954383911, + "grad_norm": 7.326338291168213, + "learning_rate": 3.986582940760717e-05, + "loss": 2.6139, + "step": 1648 + }, + { + "epoch": 1.1036919346071832, + "grad_norm": 4.203922271728516, + "learning_rate": 3.9850167624694994e-05, + "loss": 2.7178, + "step": 1649 + }, + { + "epoch": 1.1043609148304554, + "grad_norm": 13.013337135314941, + "learning_rate": 3.983449683074868e-05, + "loss": 2.858, + "step": 1650 + }, + { + "epoch": 1.1050298950537274, + "grad_norm": 7.882613658905029, + "learning_rate": 3.981881703527721e-05, + "loss": 2.6311, + "step": 1651 + }, + { + "epoch": 1.1056988752769996, + "grad_norm": 8.829507827758789, + "learning_rate": 3.980312824779503e-05, + "loss": 2.6887, + "step": 1652 + }, + { + "epoch": 1.1063678555002718, + "grad_norm": 6.895455837249756, + "learning_rate": 3.9787430477822046e-05, + "loss": 2.5745, + "step": 1653 + }, + { + "epoch": 1.1070368357235438, + "grad_norm": 5.998924732208252, + "learning_rate": 3.9771723734883624e-05, + "loss": 2.8309, + "step": 1654 + }, + { + "epoch": 1.107705815946816, + "grad_norm": 8.006895065307617, + "learning_rate": 3.975600802851056e-05, + "loss": 2.7938, + "step": 1655 + }, + { + "epoch": 1.1083747961700883, + "grad_norm": 4.172781944274902, + "learning_rate": 3.9740283368239086e-05, + "loss": 2.8041, + "step": 1656 + }, + { + "epoch": 1.1090437763933603, + "grad_norm": 5.066199779510498, + "learning_rate": 3.9724549763610884e-05, + "loss": 2.5142, + "step": 1657 + }, + { + "epoch": 1.1097127566166325, + "grad_norm": 4.01627779006958, + "learning_rate": 3.9708807224173064e-05, + "loss": 2.6579, + "step": 1658 + }, + { + "epoch": 1.1103817368399047, + "grad_norm": 5.281778335571289, + "learning_rate": 3.9693055759478124e-05, + "loss": 2.8503, + "step": 1659 + }, + { + "epoch": 1.111050717063177, + "grad_norm": 5.004472255706787, + "learning_rate": 3.9677295379084026e-05, + "loss": 2.9395, + "step": 1660 + }, + { + "epoch": 1.111719697286449, + "grad_norm": 4.206826686859131, + "learning_rate": 3.96615260925541e-05, + "loss": 2.6226, + "step": 1661 + }, + { + "epoch": 1.1123886775097211, + "grad_norm": 5.367136001586914, + "learning_rate": 3.964574790945712e-05, + "loss": 2.6628, + "step": 1662 + }, + { + "epoch": 1.1130576577329934, + "grad_norm": 4.41979455947876, + "learning_rate": 3.962996083936722e-05, + "loss": 2.6717, + "step": 1663 + }, + { + "epoch": 1.1137266379562654, + "grad_norm": 4.343268394470215, + "learning_rate": 3.961416489186394e-05, + "loss": 2.8632, + "step": 1664 + }, + { + "epoch": 1.1143956181795376, + "grad_norm": 6.543291091918945, + "learning_rate": 3.9598360076532226e-05, + "loss": 2.8875, + "step": 1665 + }, + { + "epoch": 1.1150645984028098, + "grad_norm": 4.79495096206665, + "learning_rate": 3.958254640296238e-05, + "loss": 2.7381, + "step": 1666 + }, + { + "epoch": 1.1157335786260818, + "grad_norm": 3.8318138122558594, + "learning_rate": 3.956672388075009e-05, + "loss": 2.6061, + "step": 1667 + }, + { + "epoch": 1.116402558849354, + "grad_norm": 5.714412689208984, + "learning_rate": 3.955089251949641e-05, + "loss": 2.6708, + "step": 1668 + }, + { + "epoch": 1.1170715390726262, + "grad_norm": 5.564817428588867, + "learning_rate": 3.9535052328807764e-05, + "loss": 2.5249, + "step": 1669 + }, + { + "epoch": 1.1177405192958982, + "grad_norm": 5.668190002441406, + "learning_rate": 3.951920331829593e-05, + "loss": 2.7984, + "step": 1670 + }, + { + "epoch": 1.1184094995191705, + "grad_norm": 3.878696918487549, + "learning_rate": 3.950334549757803e-05, + "loss": 2.6898, + "step": 1671 + }, + { + "epoch": 1.1190784797424427, + "grad_norm": 4.1487274169921875, + "learning_rate": 3.948747887627653e-05, + "loss": 2.5919, + "step": 1672 + }, + { + "epoch": 1.1197474599657147, + "grad_norm": 8.2774019241333, + "learning_rate": 3.947160346401927e-05, + "loss": 3.047, + "step": 1673 + }, + { + "epoch": 1.120416440188987, + "grad_norm": 4.1392974853515625, + "learning_rate": 3.9455719270439386e-05, + "loss": 2.4994, + "step": 1674 + }, + { + "epoch": 1.1210854204122591, + "grad_norm": 4.564013481140137, + "learning_rate": 3.9439826305175345e-05, + "loss": 2.7787, + "step": 1675 + }, + { + "epoch": 1.1217544006355311, + "grad_norm": 4.009519100189209, + "learning_rate": 3.942392457787096e-05, + "loss": 2.4334, + "step": 1676 + }, + { + "epoch": 1.1224233808588033, + "grad_norm": 3.0793025493621826, + "learning_rate": 3.940801409817536e-05, + "loss": 2.5784, + "step": 1677 + }, + { + "epoch": 1.1230923610820756, + "grad_norm": 4.557519912719727, + "learning_rate": 3.939209487574295e-05, + "loss": 2.8636, + "step": 1678 + }, + { + "epoch": 1.1237613413053475, + "grad_norm": 4.885311603546143, + "learning_rate": 3.937616692023347e-05, + "loss": 2.4412, + "step": 1679 + }, + { + "epoch": 1.1244303215286198, + "grad_norm": 4.2733354568481445, + "learning_rate": 3.936023024131196e-05, + "loss": 2.738, + "step": 1680 + }, + { + "epoch": 1.125099301751892, + "grad_norm": 5.4617180824279785, + "learning_rate": 3.934428484864874e-05, + "loss": 2.9871, + "step": 1681 + }, + { + "epoch": 1.125768281975164, + "grad_norm": 4.2067365646362305, + "learning_rate": 3.9328330751919424e-05, + "loss": 2.4473, + "step": 1682 + }, + { + "epoch": 1.1264372621984362, + "grad_norm": 5.401765823364258, + "learning_rate": 3.9312367960804905e-05, + "loss": 2.7275, + "step": 1683 + }, + { + "epoch": 1.1271062424217084, + "grad_norm": 4.645919322967529, + "learning_rate": 3.929639648499136e-05, + "loss": 2.7506, + "step": 1684 + }, + { + "epoch": 1.1277752226449806, + "grad_norm": 6.534749984741211, + "learning_rate": 3.928041633417022e-05, + "loss": 2.3065, + "step": 1685 + }, + { + "epoch": 1.1284442028682526, + "grad_norm": 4.626363754272461, + "learning_rate": 3.926442751803819e-05, + "loss": 2.7914, + "step": 1686 + }, + { + "epoch": 1.1291131830915249, + "grad_norm": 4.61073637008667, + "learning_rate": 3.9248430046297246e-05, + "loss": 2.7046, + "step": 1687 + }, + { + "epoch": 1.129782163314797, + "grad_norm": 3.616173505783081, + "learning_rate": 3.923242392865459e-05, + "loss": 2.6379, + "step": 1688 + }, + { + "epoch": 1.130451143538069, + "grad_norm": 5.434972286224365, + "learning_rate": 3.9216409174822685e-05, + "loss": 2.8262, + "step": 1689 + }, + { + "epoch": 1.1311201237613413, + "grad_norm": 5.812896251678467, + "learning_rate": 3.920038579451923e-05, + "loss": 2.8654, + "step": 1690 + }, + { + "epoch": 1.1317891039846135, + "grad_norm": 4.690445423126221, + "learning_rate": 3.918435379746716e-05, + "loss": 2.8625, + "step": 1691 + }, + { + "epoch": 1.1324580842078855, + "grad_norm": 5.103313446044922, + "learning_rate": 3.9168313193394655e-05, + "loss": 2.8396, + "step": 1692 + }, + { + "epoch": 1.1331270644311577, + "grad_norm": 5.238773822784424, + "learning_rate": 3.915226399203509e-05, + "loss": 2.903, + "step": 1693 + }, + { + "epoch": 1.13379604465443, + "grad_norm": 7.802103519439697, + "learning_rate": 3.913620620312706e-05, + "loss": 3.1013, + "step": 1694 + }, + { + "epoch": 1.1344650248777022, + "grad_norm": 5.691326141357422, + "learning_rate": 3.912013983641439e-05, + "loss": 2.6053, + "step": 1695 + }, + { + "epoch": 1.1351340051009742, + "grad_norm": 5.531711101531982, + "learning_rate": 3.910406490164611e-05, + "loss": 2.7102, + "step": 1696 + }, + { + "epoch": 1.1358029853242464, + "grad_norm": 5.916006088256836, + "learning_rate": 3.908798140857642e-05, + "loss": 2.8587, + "step": 1697 + }, + { + "epoch": 1.1364719655475186, + "grad_norm": 4.010486602783203, + "learning_rate": 3.907188936696475e-05, + "loss": 2.5842, + "step": 1698 + }, + { + "epoch": 1.1371409457707906, + "grad_norm": 4.513060569763184, + "learning_rate": 3.905578878657567e-05, + "loss": 2.7569, + "step": 1699 + }, + { + "epoch": 1.1378099259940628, + "grad_norm": 6.514904499053955, + "learning_rate": 3.9039679677179e-05, + "loss": 2.6758, + "step": 1700 + }, + { + "epoch": 1.138478906217335, + "grad_norm": 5.1382246017456055, + "learning_rate": 3.902356204854967e-05, + "loss": 2.8837, + "step": 1701 + }, + { + "epoch": 1.139147886440607, + "grad_norm": 6.538110256195068, + "learning_rate": 3.900743591046782e-05, + "loss": 2.6701, + "step": 1702 + }, + { + "epoch": 1.1398168666638793, + "grad_norm": 5.245736122131348, + "learning_rate": 3.8991301272718713e-05, + "loss": 2.8015, + "step": 1703 + }, + { + "epoch": 1.1404858468871515, + "grad_norm": 6.4442362785339355, + "learning_rate": 3.8975158145092825e-05, + "loss": 3.0992, + "step": 1704 + }, + { + "epoch": 1.1411548271104235, + "grad_norm": 4.114341735839844, + "learning_rate": 3.8959006537385736e-05, + "loss": 2.6867, + "step": 1705 + }, + { + "epoch": 1.1418238073336957, + "grad_norm": 5.672820568084717, + "learning_rate": 3.8942846459398194e-05, + "loss": 2.6206, + "step": 1706 + }, + { + "epoch": 1.142492787556968, + "grad_norm": 4.0786356925964355, + "learning_rate": 3.8926677920936096e-05, + "loss": 3.0933, + "step": 1707 + }, + { + "epoch": 1.14316176778024, + "grad_norm": 5.188081741333008, + "learning_rate": 3.891050093181044e-05, + "loss": 2.7933, + "step": 1708 + }, + { + "epoch": 1.1438307480035121, + "grad_norm": 5.515549659729004, + "learning_rate": 3.889431550183738e-05, + "loss": 2.8488, + "step": 1709 + }, + { + "epoch": 1.1444997282267844, + "grad_norm": 5.796931743621826, + "learning_rate": 3.8878121640838186e-05, + "loss": 2.9218, + "step": 1710 + }, + { + "epoch": 1.1451687084500564, + "grad_norm": 5.553003787994385, + "learning_rate": 3.886191935863923e-05, + "loss": 2.7191, + "step": 1711 + }, + { + "epoch": 1.1458376886733286, + "grad_norm": 4.528774738311768, + "learning_rate": 3.884570866507202e-05, + "loss": 2.9298, + "step": 1712 + }, + { + "epoch": 1.1465066688966008, + "grad_norm": 4.108309745788574, + "learning_rate": 3.882948956997314e-05, + "loss": 2.8881, + "step": 1713 + }, + { + "epoch": 1.1471756491198728, + "grad_norm": 4.2996439933776855, + "learning_rate": 3.8813262083184286e-05, + "loss": 2.7954, + "step": 1714 + }, + { + "epoch": 1.147844629343145, + "grad_norm": 6.285360813140869, + "learning_rate": 3.879702621455226e-05, + "loss": 2.7149, + "step": 1715 + }, + { + "epoch": 1.1485136095664172, + "grad_norm": 6.409306526184082, + "learning_rate": 3.878078197392891e-05, + "loss": 2.8814, + "step": 1716 + }, + { + "epoch": 1.1491825897896892, + "grad_norm": 4.87045955657959, + "learning_rate": 3.876452937117122e-05, + "loss": 2.7626, + "step": 1717 + }, + { + "epoch": 1.1498515700129615, + "grad_norm": 4.315850734710693, + "learning_rate": 3.8748268416141184e-05, + "loss": 2.8149, + "step": 1718 + }, + { + "epoch": 1.1505205502362337, + "grad_norm": 4.383823394775391, + "learning_rate": 3.8731999118705926e-05, + "loss": 2.732, + "step": 1719 + }, + { + "epoch": 1.1511895304595057, + "grad_norm": 5.145540714263916, + "learning_rate": 3.871572148873759e-05, + "loss": 2.6938, + "step": 1720 + }, + { + "epoch": 1.151858510682778, + "grad_norm": 3.9767284393310547, + "learning_rate": 3.869943553611338e-05, + "loss": 2.4974, + "step": 1721 + }, + { + "epoch": 1.1525274909060501, + "grad_norm": 5.369380950927734, + "learning_rate": 3.868314127071559e-05, + "loss": 2.6343, + "step": 1722 + }, + { + "epoch": 1.1531964711293223, + "grad_norm": 4.192481994628906, + "learning_rate": 3.86668387024315e-05, + "loss": 2.4316, + "step": 1723 + }, + { + "epoch": 1.1538654513525943, + "grad_norm": 6.330613613128662, + "learning_rate": 3.8650527841153454e-05, + "loss": 2.7826, + "step": 1724 + }, + { + "epoch": 1.1545344315758665, + "grad_norm": 5.088796615600586, + "learning_rate": 3.8634208696778856e-05, + "loss": 2.8024, + "step": 1725 + }, + { + "epoch": 1.1552034117991388, + "grad_norm": 5.2484450340271, + "learning_rate": 3.861788127921009e-05, + "loss": 2.7792, + "step": 1726 + }, + { + "epoch": 1.1558723920224108, + "grad_norm": 4.708180904388428, + "learning_rate": 3.8601545598354593e-05, + "loss": 2.9515, + "step": 1727 + }, + { + "epoch": 1.156541372245683, + "grad_norm": 4.836954593658447, + "learning_rate": 3.8585201664124795e-05, + "loss": 3.0762, + "step": 1728 + }, + { + "epoch": 1.1572103524689552, + "grad_norm": 5.8461503982543945, + "learning_rate": 3.856884948643814e-05, + "loss": 2.9477, + "step": 1729 + }, + { + "epoch": 1.1578793326922272, + "grad_norm": 5.0680694580078125, + "learning_rate": 3.8552489075217085e-05, + "loss": 2.9862, + "step": 1730 + }, + { + "epoch": 1.1585483129154994, + "grad_norm": 5.802671432495117, + "learning_rate": 3.853612044038908e-05, + "loss": 2.9056, + "step": 1731 + }, + { + "epoch": 1.1592172931387716, + "grad_norm": 6.276515483856201, + "learning_rate": 3.8519743591886546e-05, + "loss": 2.9355, + "step": 1732 + }, + { + "epoch": 1.1598862733620436, + "grad_norm": 4.246423244476318, + "learning_rate": 3.850335853964692e-05, + "loss": 2.65, + "step": 1733 + }, + { + "epoch": 1.1605552535853159, + "grad_norm": 5.163718223571777, + "learning_rate": 3.848696529361258e-05, + "loss": 2.7763, + "step": 1734 + }, + { + "epoch": 1.161224233808588, + "grad_norm": 6.367431163787842, + "learning_rate": 3.847056386373089e-05, + "loss": 2.9175, + "step": 1735 + }, + { + "epoch": 1.1618932140318603, + "grad_norm": 3.0664758682250977, + "learning_rate": 3.845415425995421e-05, + "loss": 2.6235, + "step": 1736 + }, + { + "epoch": 1.1625621942551323, + "grad_norm": 4.423579216003418, + "learning_rate": 3.843773649223983e-05, + "loss": 2.8541, + "step": 1737 + }, + { + "epoch": 1.1632311744784045, + "grad_norm": 4.328232765197754, + "learning_rate": 3.8421310570549994e-05, + "loss": 2.8183, + "step": 1738 + }, + { + "epoch": 1.1639001547016767, + "grad_norm": 6.539478302001953, + "learning_rate": 3.840487650485189e-05, + "loss": 2.9259, + "step": 1739 + }, + { + "epoch": 1.1645691349249487, + "grad_norm": 5.035600185394287, + "learning_rate": 3.838843430511766e-05, + "loss": 2.638, + "step": 1740 + }, + { + "epoch": 1.165238115148221, + "grad_norm": 4.728442192077637, + "learning_rate": 3.8371983981324395e-05, + "loss": 2.7532, + "step": 1741 + }, + { + "epoch": 1.1659070953714932, + "grad_norm": 4.8734540939331055, + "learning_rate": 3.835552554345407e-05, + "loss": 2.6562, + "step": 1742 + }, + { + "epoch": 1.1665760755947652, + "grad_norm": 4.309051513671875, + "learning_rate": 3.833905900149364e-05, + "loss": 2.823, + "step": 1743 + }, + { + "epoch": 1.1672450558180374, + "grad_norm": 4.245873928070068, + "learning_rate": 3.832258436543494e-05, + "loss": 2.5424, + "step": 1744 + }, + { + "epoch": 1.1679140360413096, + "grad_norm": 5.439658164978027, + "learning_rate": 3.830610164527472e-05, + "loss": 2.7097, + "step": 1745 + }, + { + "epoch": 1.1685830162645816, + "grad_norm": 5.236258506774902, + "learning_rate": 3.828961085101463e-05, + "loss": 2.7249, + "step": 1746 + }, + { + "epoch": 1.1692519964878538, + "grad_norm": 4.445048809051514, + "learning_rate": 3.827311199266127e-05, + "loss": 2.5616, + "step": 1747 + }, + { + "epoch": 1.169920976711126, + "grad_norm": 5.976138114929199, + "learning_rate": 3.8256605080226064e-05, + "loss": 2.9866, + "step": 1748 + }, + { + "epoch": 1.170589956934398, + "grad_norm": 6.202071666717529, + "learning_rate": 3.8240090123725357e-05, + "loss": 2.7178, + "step": 1749 + }, + { + "epoch": 1.1712589371576703, + "grad_norm": 4.172088623046875, + "learning_rate": 3.822356713318038e-05, + "loss": 2.79, + "step": 1750 + }, + { + "epoch": 1.1719279173809425, + "grad_norm": 4.560709476470947, + "learning_rate": 3.820703611861722e-05, + "loss": 2.6545, + "step": 1751 + }, + { + "epoch": 1.1725968976042145, + "grad_norm": 5.1008172035217285, + "learning_rate": 3.819049709006687e-05, + "loss": 2.5919, + "step": 1752 + }, + { + "epoch": 1.1732658778274867, + "grad_norm": 6.211871147155762, + "learning_rate": 3.8173950057565125e-05, + "loss": 2.9597, + "step": 1753 + }, + { + "epoch": 1.173934858050759, + "grad_norm": 5.119531631469727, + "learning_rate": 3.815739503115268e-05, + "loss": 2.7113, + "step": 1754 + }, + { + "epoch": 1.174603838274031, + "grad_norm": 5.102022647857666, + "learning_rate": 3.8140832020875086e-05, + "loss": 2.915, + "step": 1755 + }, + { + "epoch": 1.1752728184973031, + "grad_norm": 4.11543083190918, + "learning_rate": 3.8124261036782714e-05, + "loss": 2.5921, + "step": 1756 + }, + { + "epoch": 1.1759417987205754, + "grad_norm": 3.230837106704712, + "learning_rate": 3.8107682088930794e-05, + "loss": 2.5988, + "step": 1757 + }, + { + "epoch": 1.1766107789438474, + "grad_norm": 3.4689149856567383, + "learning_rate": 3.8091095187379366e-05, + "loss": 2.5246, + "step": 1758 + }, + { + "epoch": 1.1772797591671196, + "grad_norm": 4.02721643447876, + "learning_rate": 3.807450034219332e-05, + "loss": 2.8093, + "step": 1759 + }, + { + "epoch": 1.1779487393903918, + "grad_norm": 5.369773864746094, + "learning_rate": 3.805789756344234e-05, + "loss": 2.6829, + "step": 1760 + }, + { + "epoch": 1.178617719613664, + "grad_norm": 3.211688756942749, + "learning_rate": 3.804128686120095e-05, + "loss": 2.6756, + "step": 1761 + }, + { + "epoch": 1.179286699836936, + "grad_norm": 4.2267165184021, + "learning_rate": 3.802466824554847e-05, + "loss": 2.702, + "step": 1762 + }, + { + "epoch": 1.1799556800602082, + "grad_norm": 7.6689300537109375, + "learning_rate": 3.8008041726569024e-05, + "loss": 2.8305, + "step": 1763 + }, + { + "epoch": 1.1806246602834805, + "grad_norm": 5.8855485916137695, + "learning_rate": 3.799140731435152e-05, + "loss": 2.805, + "step": 1764 + }, + { + "epoch": 1.1812936405067525, + "grad_norm": 3.6822450160980225, + "learning_rate": 3.797476501898968e-05, + "loss": 2.6626, + "step": 1765 + }, + { + "epoch": 1.1819626207300247, + "grad_norm": 6.995255470275879, + "learning_rate": 3.795811485058199e-05, + "loss": 2.5857, + "step": 1766 + }, + { + "epoch": 1.182631600953297, + "grad_norm": 6.011009216308594, + "learning_rate": 3.7941456819231715e-05, + "loss": 2.7998, + "step": 1767 + }, + { + "epoch": 1.1833005811765689, + "grad_norm": 3.980313301086426, + "learning_rate": 3.79247909350469e-05, + "loss": 2.6141, + "step": 1768 + }, + { + "epoch": 1.183969561399841, + "grad_norm": 4.1785759925842285, + "learning_rate": 3.7908117208140346e-05, + "loss": 2.5139, + "step": 1769 + }, + { + "epoch": 1.1846385416231133, + "grad_norm": 5.0990376472473145, + "learning_rate": 3.7891435648629625e-05, + "loss": 2.8435, + "step": 1770 + }, + { + "epoch": 1.1853075218463853, + "grad_norm": 5.530050754547119, + "learning_rate": 3.787474626663705e-05, + "loss": 2.8514, + "step": 1771 + }, + { + "epoch": 1.1859765020696575, + "grad_norm": 4.31281042098999, + "learning_rate": 3.785804907228968e-05, + "loss": 2.8415, + "step": 1772 + }, + { + "epoch": 1.1866454822929298, + "grad_norm": 5.490376949310303, + "learning_rate": 3.784134407571932e-05, + "loss": 2.9642, + "step": 1773 + }, + { + "epoch": 1.187314462516202, + "grad_norm": 5.688234806060791, + "learning_rate": 3.782463128706251e-05, + "loss": 2.6184, + "step": 1774 + }, + { + "epoch": 1.187983442739474, + "grad_norm": 5.56623649597168, + "learning_rate": 3.780791071646052e-05, + "loss": 2.6272, + "step": 1775 + }, + { + "epoch": 1.1886524229627462, + "grad_norm": 4.018519401550293, + "learning_rate": 3.7791182374059334e-05, + "loss": 2.8058, + "step": 1776 + }, + { + "epoch": 1.1893214031860184, + "grad_norm": 6.15987491607666, + "learning_rate": 3.777444627000966e-05, + "loss": 2.7379, + "step": 1777 + }, + { + "epoch": 1.1899903834092904, + "grad_norm": 5.726695537567139, + "learning_rate": 3.7757702414466914e-05, + "loss": 2.7939, + "step": 1778 + }, + { + "epoch": 1.1906593636325626, + "grad_norm": 6.305606365203857, + "learning_rate": 3.77409508175912e-05, + "loss": 2.8558, + "step": 1779 + }, + { + "epoch": 1.1913283438558349, + "grad_norm": 5.644913196563721, + "learning_rate": 3.772419148954735e-05, + "loss": 2.996, + "step": 1780 + }, + { + "epoch": 1.1919973240791069, + "grad_norm": 4.535463809967041, + "learning_rate": 3.770742444050487e-05, + "loss": 2.5719, + "step": 1781 + }, + { + "epoch": 1.192666304302379, + "grad_norm": 5.501713752746582, + "learning_rate": 3.7690649680637935e-05, + "loss": 2.8254, + "step": 1782 + }, + { + "epoch": 1.1933352845256513, + "grad_norm": 4.598433017730713, + "learning_rate": 3.767386722012543e-05, + "loss": 2.7285, + "step": 1783 + }, + { + "epoch": 1.1940042647489233, + "grad_norm": 4.824279308319092, + "learning_rate": 3.76570770691509e-05, + "loss": 2.7039, + "step": 1784 + }, + { + "epoch": 1.1946732449721955, + "grad_norm": 6.458413600921631, + "learning_rate": 3.7640279237902554e-05, + "loss": 2.8277, + "step": 1785 + }, + { + "epoch": 1.1953422251954677, + "grad_norm": 3.8205926418304443, + "learning_rate": 3.762347373657325e-05, + "loss": 2.6138, + "step": 1786 + }, + { + "epoch": 1.1960112054187397, + "grad_norm": 4.0686163902282715, + "learning_rate": 3.760666057536052e-05, + "loss": 2.7758, + "step": 1787 + }, + { + "epoch": 1.196680185642012, + "grad_norm": 6.732685089111328, + "learning_rate": 3.758983976446654e-05, + "loss": 3.061, + "step": 1788 + }, + { + "epoch": 1.1973491658652842, + "grad_norm": 4.614278793334961, + "learning_rate": 3.757301131409812e-05, + "loss": 2.6263, + "step": 1789 + }, + { + "epoch": 1.1980181460885562, + "grad_norm": 5.702500343322754, + "learning_rate": 3.7556175234466705e-05, + "loss": 2.6778, + "step": 1790 + }, + { + "epoch": 1.1986871263118284, + "grad_norm": 4.065542221069336, + "learning_rate": 3.7539331535788387e-05, + "loss": 2.792, + "step": 1791 + }, + { + "epoch": 1.1993561065351006, + "grad_norm": 6.724399566650391, + "learning_rate": 3.752248022828386e-05, + "loss": 2.84, + "step": 1792 + }, + { + "epoch": 1.2000250867583726, + "grad_norm": 6.001861095428467, + "learning_rate": 3.750562132217844e-05, + "loss": 2.7299, + "step": 1793 + }, + { + "epoch": 1.2006940669816448, + "grad_norm": 4.842615604400635, + "learning_rate": 3.748875482770207e-05, + "loss": 2.6922, + "step": 1794 + }, + { + "epoch": 1.201363047204917, + "grad_norm": 5.486823558807373, + "learning_rate": 3.747188075508928e-05, + "loss": 2.8164, + "step": 1795 + }, + { + "epoch": 1.202032027428189, + "grad_norm": 5.144231796264648, + "learning_rate": 3.745499911457919e-05, + "loss": 2.7281, + "step": 1796 + }, + { + "epoch": 1.2027010076514613, + "grad_norm": 5.316722393035889, + "learning_rate": 3.743810991641553e-05, + "loss": 2.7225, + "step": 1797 + }, + { + "epoch": 1.2033699878747335, + "grad_norm": 4.000890254974365, + "learning_rate": 3.742121317084662e-05, + "loss": 2.6897, + "step": 1798 + }, + { + "epoch": 1.2040389680980055, + "grad_norm": 5.847841739654541, + "learning_rate": 3.740430888812536e-05, + "loss": 2.8504, + "step": 1799 + }, + { + "epoch": 1.2047079483212777, + "grad_norm": 4.641834259033203, + "learning_rate": 3.738739707850919e-05, + "loss": 2.8136, + "step": 1800 + }, + { + "epoch": 1.20537692854455, + "grad_norm": 4.454596996307373, + "learning_rate": 3.737047775226017e-05, + "loss": 2.8346, + "step": 1801 + }, + { + "epoch": 1.2060459087678221, + "grad_norm": 5.596829891204834, + "learning_rate": 3.735355091964486e-05, + "loss": 2.8441, + "step": 1802 + }, + { + "epoch": 1.2067148889910941, + "grad_norm": 4.663771152496338, + "learning_rate": 3.7336616590934434e-05, + "loss": 2.7965, + "step": 1803 + }, + { + "epoch": 1.2073838692143664, + "grad_norm": 4.829925060272217, + "learning_rate": 3.731967477640457e-05, + "loss": 2.8794, + "step": 1804 + }, + { + "epoch": 1.2080528494376386, + "grad_norm": 7.195576190948486, + "learning_rate": 3.7302725486335526e-05, + "loss": 2.5335, + "step": 1805 + }, + { + "epoch": 1.2087218296609106, + "grad_norm": 3.178122043609619, + "learning_rate": 3.728576873101207e-05, + "loss": 2.3999, + "step": 1806 + }, + { + "epoch": 1.2093908098841828, + "grad_norm": 6.593000411987305, + "learning_rate": 3.7268804520723495e-05, + "loss": 2.7625, + "step": 1807 + }, + { + "epoch": 1.210059790107455, + "grad_norm": 3.4164230823516846, + "learning_rate": 3.725183286576363e-05, + "loss": 2.6513, + "step": 1808 + }, + { + "epoch": 1.210728770330727, + "grad_norm": 6.249453067779541, + "learning_rate": 3.723485377643084e-05, + "loss": 2.8028, + "step": 1809 + }, + { + "epoch": 1.2113977505539992, + "grad_norm": 4.547549724578857, + "learning_rate": 3.721786726302798e-05, + "loss": 2.8385, + "step": 1810 + }, + { + "epoch": 1.2120667307772715, + "grad_norm": 5.134420394897461, + "learning_rate": 3.72008733358624e-05, + "loss": 2.6617, + "step": 1811 + }, + { + "epoch": 1.2127357110005437, + "grad_norm": 4.485007286071777, + "learning_rate": 3.718387200524596e-05, + "loss": 2.5839, + "step": 1812 + }, + { + "epoch": 1.2134046912238157, + "grad_norm": 3.850969076156616, + "learning_rate": 3.7166863281495005e-05, + "loss": 2.6409, + "step": 1813 + }, + { + "epoch": 1.2140736714470879, + "grad_norm": 4.061034202575684, + "learning_rate": 3.71498471749304e-05, + "loss": 2.8591, + "step": 1814 + }, + { + "epoch": 1.21474265167036, + "grad_norm": 5.947023391723633, + "learning_rate": 3.713282369587745e-05, + "loss": 2.523, + "step": 1815 + }, + { + "epoch": 1.215411631893632, + "grad_norm": 5.140153408050537, + "learning_rate": 3.711579285466594e-05, + "loss": 2.7576, + "step": 1816 + }, + { + "epoch": 1.2160806121169043, + "grad_norm": 6.343984603881836, + "learning_rate": 3.709875466163014e-05, + "loss": 2.8076, + "step": 1817 + }, + { + "epoch": 1.2167495923401765, + "grad_norm": 4.9777750968933105, + "learning_rate": 3.708170912710877e-05, + "loss": 2.6444, + "step": 1818 + }, + { + "epoch": 1.2174185725634485, + "grad_norm": 4.851789474487305, + "learning_rate": 3.7064656261445004e-05, + "loss": 2.623, + "step": 1819 + }, + { + "epoch": 1.2180875527867208, + "grad_norm": 6.258243560791016, + "learning_rate": 3.704759607498646e-05, + "loss": 2.8272, + "step": 1820 + }, + { + "epoch": 1.218756533009993, + "grad_norm": 3.94958758354187, + "learning_rate": 3.703052857808522e-05, + "loss": 2.5172, + "step": 1821 + }, + { + "epoch": 1.219425513233265, + "grad_norm": 4.468460559844971, + "learning_rate": 3.7013453781097774e-05, + "loss": 2.7856, + "step": 1822 + }, + { + "epoch": 1.2200944934565372, + "grad_norm": 4.049428939819336, + "learning_rate": 3.699637169438505e-05, + "loss": 2.7046, + "step": 1823 + }, + { + "epoch": 1.2207634736798094, + "grad_norm": 4.41973352432251, + "learning_rate": 3.6979282328312414e-05, + "loss": 2.6378, + "step": 1824 + }, + { + "epoch": 1.2214324539030814, + "grad_norm": 5.145456790924072, + "learning_rate": 3.6962185693249646e-05, + "loss": 2.7381, + "step": 1825 + }, + { + "epoch": 1.2221014341263536, + "grad_norm": 6.784951686859131, + "learning_rate": 3.694508179957091e-05, + "loss": 3.057, + "step": 1826 + }, + { + "epoch": 1.2227704143496259, + "grad_norm": 3.9923250675201416, + "learning_rate": 3.69279706576548e-05, + "loss": 2.5722, + "step": 1827 + }, + { + "epoch": 1.2234393945728979, + "grad_norm": 5.133220672607422, + "learning_rate": 3.691085227788431e-05, + "loss": 2.8907, + "step": 1828 + }, + { + "epoch": 1.22410837479617, + "grad_norm": 5.521151542663574, + "learning_rate": 3.689372667064681e-05, + "loss": 2.7463, + "step": 1829 + }, + { + "epoch": 1.2247773550194423, + "grad_norm": 6.073947429656982, + "learning_rate": 3.687659384633407e-05, + "loss": 2.5392, + "step": 1830 + }, + { + "epoch": 1.2254463352427143, + "grad_norm": 4.901395320892334, + "learning_rate": 3.685945381534222e-05, + "loss": 2.7601, + "step": 1831 + }, + { + "epoch": 1.2261153154659865, + "grad_norm": 4.377593517303467, + "learning_rate": 3.6842306588071795e-05, + "loss": 2.7779, + "step": 1832 + }, + { + "epoch": 1.2267842956892587, + "grad_norm": 7.987823009490967, + "learning_rate": 3.682515217492766e-05, + "loss": 2.8408, + "step": 1833 + }, + { + "epoch": 1.2274532759125307, + "grad_norm": 5.431763172149658, + "learning_rate": 3.6807990586319076e-05, + "loss": 2.6746, + "step": 1834 + }, + { + "epoch": 1.228122256135803, + "grad_norm": 4.709609508514404, + "learning_rate": 3.6790821832659616e-05, + "loss": 2.8218, + "step": 1835 + }, + { + "epoch": 1.2287912363590752, + "grad_norm": 6.702217102050781, + "learning_rate": 3.677364592436725e-05, + "loss": 2.9065, + "step": 1836 + }, + { + "epoch": 1.2294602165823472, + "grad_norm": 5.945807456970215, + "learning_rate": 3.675646287186425e-05, + "loss": 2.6951, + "step": 1837 + }, + { + "epoch": 1.2301291968056194, + "grad_norm": 3.4071404933929443, + "learning_rate": 3.673927268557724e-05, + "loss": 2.562, + "step": 1838 + }, + { + "epoch": 1.2307981770288916, + "grad_norm": 4.416913032531738, + "learning_rate": 3.6722075375937166e-05, + "loss": 2.8139, + "step": 1839 + }, + { + "epoch": 1.2314671572521638, + "grad_norm": 3.2345592975616455, + "learning_rate": 3.670487095337931e-05, + "loss": 2.3068, + "step": 1840 + }, + { + "epoch": 1.2321361374754358, + "grad_norm": 4.063303470611572, + "learning_rate": 3.668765942834324e-05, + "loss": 2.7729, + "step": 1841 + }, + { + "epoch": 1.232805117698708, + "grad_norm": 5.84446382522583, + "learning_rate": 3.667044081127288e-05, + "loss": 2.7892, + "step": 1842 + }, + { + "epoch": 1.2334740979219803, + "grad_norm": 5.9814252853393555, + "learning_rate": 3.665321511261642e-05, + "loss": 2.6805, + "step": 1843 + }, + { + "epoch": 1.2341430781452523, + "grad_norm": 4.754057884216309, + "learning_rate": 3.663598234282636e-05, + "loss": 2.8755, + "step": 1844 + }, + { + "epoch": 1.2348120583685245, + "grad_norm": 4.782750129699707, + "learning_rate": 3.6618742512359487e-05, + "loss": 2.8039, + "step": 1845 + }, + { + "epoch": 1.2354810385917967, + "grad_norm": 5.0642409324646, + "learning_rate": 3.660149563167687e-05, + "loss": 2.5444, + "step": 1846 + }, + { + "epoch": 1.2361500188150687, + "grad_norm": 5.08937406539917, + "learning_rate": 3.658424171124388e-05, + "loss": 2.8231, + "step": 1847 + }, + { + "epoch": 1.236818999038341, + "grad_norm": 4.314083576202393, + "learning_rate": 3.656698076153013e-05, + "loss": 2.4211, + "step": 1848 + }, + { + "epoch": 1.2374879792616131, + "grad_norm": 5.273926734924316, + "learning_rate": 3.65497127930095e-05, + "loss": 2.7676, + "step": 1849 + }, + { + "epoch": 1.2381569594848851, + "grad_norm": 6.733925819396973, + "learning_rate": 3.6532437816160145e-05, + "loss": 2.7285, + "step": 1850 + }, + { + "epoch": 1.2388259397081574, + "grad_norm": 4.898383617401123, + "learning_rate": 3.651515584146447e-05, + "loss": 2.6919, + "step": 1851 + }, + { + "epoch": 1.2394949199314296, + "grad_norm": 5.283299922943115, + "learning_rate": 3.649786687940911e-05, + "loss": 3.0118, + "step": 1852 + }, + { + "epoch": 1.2401639001547018, + "grad_norm": 3.843108892440796, + "learning_rate": 3.6480570940484956e-05, + "loss": 2.5973, + "step": 1853 + }, + { + "epoch": 1.2408328803779738, + "grad_norm": 4.891538143157959, + "learning_rate": 3.646326803518715e-05, + "loss": 2.78, + "step": 1854 + }, + { + "epoch": 1.241501860601246, + "grad_norm": 6.736623287200928, + "learning_rate": 3.644595817401501e-05, + "loss": 2.905, + "step": 1855 + }, + { + "epoch": 1.2421708408245182, + "grad_norm": 6.363578796386719, + "learning_rate": 3.6428641367472116e-05, + "loss": 2.9187, + "step": 1856 + }, + { + "epoch": 1.2428398210477902, + "grad_norm": 3.4829938411712646, + "learning_rate": 3.641131762606626e-05, + "loss": 2.5329, + "step": 1857 + }, + { + "epoch": 1.2435088012710624, + "grad_norm": 5.517954349517822, + "learning_rate": 3.639398696030941e-05, + "loss": 2.9501, + "step": 1858 + }, + { + "epoch": 1.2441777814943347, + "grad_norm": 9.04926586151123, + "learning_rate": 3.637664938071777e-05, + "loss": 3.0881, + "step": 1859 + }, + { + "epoch": 1.2448467617176067, + "grad_norm": 3.5141665935516357, + "learning_rate": 3.635930489781173e-05, + "loss": 2.5798, + "step": 1860 + }, + { + "epoch": 1.2455157419408789, + "grad_norm": 4.416510105133057, + "learning_rate": 3.6341953522115876e-05, + "loss": 2.7676, + "step": 1861 + }, + { + "epoch": 1.246184722164151, + "grad_norm": 6.0014238357543945, + "learning_rate": 3.6324595264158955e-05, + "loss": 2.7161, + "step": 1862 + }, + { + "epoch": 1.246853702387423, + "grad_norm": 4.988345623016357, + "learning_rate": 3.63072301344739e-05, + "loss": 2.8456, + "step": 1863 + }, + { + "epoch": 1.2475226826106953, + "grad_norm": 6.99977970123291, + "learning_rate": 3.6289858143597826e-05, + "loss": 2.8185, + "step": 1864 + }, + { + "epoch": 1.2481916628339675, + "grad_norm": 3.997215509414673, + "learning_rate": 3.6272479302072e-05, + "loss": 2.5535, + "step": 1865 + }, + { + "epoch": 1.2488606430572395, + "grad_norm": 5.643239974975586, + "learning_rate": 3.6255093620441834e-05, + "loss": 2.9158, + "step": 1866 + }, + { + "epoch": 1.2495296232805118, + "grad_norm": 5.907159805297852, + "learning_rate": 3.623770110925692e-05, + "loss": 2.9065, + "step": 1867 + }, + { + "epoch": 1.250198603503784, + "grad_norm": 3.4002277851104736, + "learning_rate": 3.6220301779070966e-05, + "loss": 2.548, + "step": 1868 + }, + { + "epoch": 1.250867583727056, + "grad_norm": 4.918555736541748, + "learning_rate": 3.620289564044183e-05, + "loss": 2.7617, + "step": 1869 + }, + { + "epoch": 1.2515365639503282, + "grad_norm": 4.701360702514648, + "learning_rate": 3.618548270393152e-05, + "loss": 2.7191, + "step": 1870 + }, + { + "epoch": 1.2522055441736004, + "grad_norm": 5.712444305419922, + "learning_rate": 3.6168062980106126e-05, + "loss": 2.9235, + "step": 1871 + }, + { + "epoch": 1.2528745243968724, + "grad_norm": 5.300381183624268, + "learning_rate": 3.61506364795359e-05, + "loss": 2.8037, + "step": 1872 + }, + { + "epoch": 1.2535435046201446, + "grad_norm": 4.37861442565918, + "learning_rate": 3.613320321279518e-05, + "loss": 2.7889, + "step": 1873 + }, + { + "epoch": 1.2542124848434169, + "grad_norm": 5.089453220367432, + "learning_rate": 3.61157631904624e-05, + "loss": 3.0934, + "step": 1874 + }, + { + "epoch": 1.2548814650666889, + "grad_norm": 4.397032260894775, + "learning_rate": 3.6098316423120133e-05, + "loss": 2.7309, + "step": 1875 + }, + { + "epoch": 1.255550445289961, + "grad_norm": 4.115896224975586, + "learning_rate": 3.608086292135501e-05, + "loss": 2.7652, + "step": 1876 + }, + { + "epoch": 1.2562194255132333, + "grad_norm": 4.964057445526123, + "learning_rate": 3.6063402695757765e-05, + "loss": 2.8038, + "step": 1877 + }, + { + "epoch": 1.2568884057365053, + "grad_norm": 4.864250659942627, + "learning_rate": 3.60459357569232e-05, + "loss": 2.685, + "step": 1878 + }, + { + "epoch": 1.2575573859597775, + "grad_norm": 4.61652135848999, + "learning_rate": 3.602846211545021e-05, + "loss": 2.9624, + "step": 1879 + }, + { + "epoch": 1.2582263661830497, + "grad_norm": 3.5036540031433105, + "learning_rate": 3.601098178194173e-05, + "loss": 2.8418, + "step": 1880 + }, + { + "epoch": 1.2588953464063217, + "grad_norm": 5.311166286468506, + "learning_rate": 3.599349476700478e-05, + "loss": 2.6239, + "step": 1881 + }, + { + "epoch": 1.259564326629594, + "grad_norm": 5.348335266113281, + "learning_rate": 3.5976001081250414e-05, + "loss": 2.7304, + "step": 1882 + }, + { + "epoch": 1.2602333068528662, + "grad_norm": 6.328084945678711, + "learning_rate": 3.595850073529377e-05, + "loss": 2.6692, + "step": 1883 + }, + { + "epoch": 1.2609022870761384, + "grad_norm": 5.2390007972717285, + "learning_rate": 3.594099373975397e-05, + "loss": 2.7312, + "step": 1884 + }, + { + "epoch": 1.2615712672994104, + "grad_norm": 7.658189296722412, + "learning_rate": 3.592348010525421e-05, + "loss": 3.1415, + "step": 1885 + }, + { + "epoch": 1.2622402475226826, + "grad_norm": 4.922848701477051, + "learning_rate": 3.5905959842421726e-05, + "loss": 2.7902, + "step": 1886 + }, + { + "epoch": 1.2629092277459548, + "grad_norm": 4.480006217956543, + "learning_rate": 3.588843296188775e-05, + "loss": 2.6461, + "step": 1887 + }, + { + "epoch": 1.263578207969227, + "grad_norm": 4.117129802703857, + "learning_rate": 3.587089947428752e-05, + "loss": 2.6328, + "step": 1888 + }, + { + "epoch": 1.264247188192499, + "grad_norm": 5.596131801605225, + "learning_rate": 3.585335939026032e-05, + "loss": 2.6906, + "step": 1889 + }, + { + "epoch": 1.2649161684157713, + "grad_norm": 7.91386604309082, + "learning_rate": 3.583581272044941e-05, + "loss": 2.7841, + "step": 1890 + }, + { + "epoch": 1.2655851486390435, + "grad_norm": 5.925399303436279, + "learning_rate": 3.581825947550205e-05, + "loss": 2.723, + "step": 1891 + }, + { + "epoch": 1.2662541288623155, + "grad_norm": 6.4818806648254395, + "learning_rate": 3.580069966606949e-05, + "loss": 2.8101, + "step": 1892 + }, + { + "epoch": 1.2669231090855877, + "grad_norm": 4.033934593200684, + "learning_rate": 3.578313330280698e-05, + "loss": 2.5151, + "step": 1893 + }, + { + "epoch": 1.26759208930886, + "grad_norm": 6.469705104827881, + "learning_rate": 3.576556039637372e-05, + "loss": 2.708, + "step": 1894 + }, + { + "epoch": 1.268261069532132, + "grad_norm": 5.2010955810546875, + "learning_rate": 3.57479809574329e-05, + "loss": 2.589, + "step": 1895 + }, + { + "epoch": 1.2689300497554041, + "grad_norm": 4.195599555969238, + "learning_rate": 3.5730394996651664e-05, + "loss": 2.5701, + "step": 1896 + }, + { + "epoch": 1.2695990299786764, + "grad_norm": 3.8891968727111816, + "learning_rate": 3.571280252470111e-05, + "loss": 2.5916, + "step": 1897 + }, + { + "epoch": 1.2702680102019483, + "grad_norm": 4.924985885620117, + "learning_rate": 3.569520355225631e-05, + "loss": 2.7964, + "step": 1898 + }, + { + "epoch": 1.2709369904252206, + "grad_norm": 5.558375835418701, + "learning_rate": 3.5677598089996254e-05, + "loss": 2.8219, + "step": 1899 + }, + { + "epoch": 1.2716059706484928, + "grad_norm": 4.105373382568359, + "learning_rate": 3.565998614860388e-05, + "loss": 2.8599, + "step": 1900 + }, + { + "epoch": 1.2722749508717648, + "grad_norm": 5.856644153594971, + "learning_rate": 3.564236773876606e-05, + "loss": 2.814, + "step": 1901 + }, + { + "epoch": 1.272943931095037, + "grad_norm": 3.8691678047180176, + "learning_rate": 3.562474287117359e-05, + "loss": 2.7608, + "step": 1902 + }, + { + "epoch": 1.2736129113183092, + "grad_norm": 6.936550140380859, + "learning_rate": 3.5607111556521175e-05, + "loss": 3.1666, + "step": 1903 + }, + { + "epoch": 1.2742818915415812, + "grad_norm": 4.588903903961182, + "learning_rate": 3.558947380550744e-05, + "loss": 2.5422, + "step": 1904 + }, + { + "epoch": 1.2749508717648534, + "grad_norm": 5.447290897369385, + "learning_rate": 3.557182962883494e-05, + "loss": 2.8046, + "step": 1905 + }, + { + "epoch": 1.2756198519881257, + "grad_norm": 5.050882339477539, + "learning_rate": 3.555417903721008e-05, + "loss": 2.8847, + "step": 1906 + }, + { + "epoch": 1.2762888322113977, + "grad_norm": 4.748391151428223, + "learning_rate": 3.5536522041343185e-05, + "loss": 2.832, + "step": 1907 + }, + { + "epoch": 1.2769578124346699, + "grad_norm": 5.2018890380859375, + "learning_rate": 3.551885865194847e-05, + "loss": 2.8506, + "step": 1908 + }, + { + "epoch": 1.277626792657942, + "grad_norm": 5.160276412963867, + "learning_rate": 3.550118887974402e-05, + "loss": 2.6371, + "step": 1909 + }, + { + "epoch": 1.278295772881214, + "grad_norm": 4.636349201202393, + "learning_rate": 3.54835127354518e-05, + "loss": 2.7996, + "step": 1910 + }, + { + "epoch": 1.2789647531044863, + "grad_norm": 4.72723913192749, + "learning_rate": 3.5465830229797623e-05, + "loss": 2.6162, + "step": 1911 + }, + { + "epoch": 1.2796337333277585, + "grad_norm": 4.232020854949951, + "learning_rate": 3.54481413735112e-05, + "loss": 2.7203, + "step": 1912 + }, + { + "epoch": 1.2803027135510305, + "grad_norm": 4.942488193511963, + "learning_rate": 3.543044617732606e-05, + "loss": 2.8028, + "step": 1913 + }, + { + "epoch": 1.2809716937743028, + "grad_norm": 6.861090660095215, + "learning_rate": 3.541274465197959e-05, + "loss": 2.8319, + "step": 1914 + }, + { + "epoch": 1.281640673997575, + "grad_norm": 9.18194580078125, + "learning_rate": 3.539503680821302e-05, + "loss": 3.4554, + "step": 1915 + }, + { + "epoch": 1.282309654220847, + "grad_norm": 5.122642517089844, + "learning_rate": 3.537732265677142e-05, + "loss": 2.5247, + "step": 1916 + }, + { + "epoch": 1.2829786344441192, + "grad_norm": 5.579825401306152, + "learning_rate": 3.5359602208403666e-05, + "loss": 2.7803, + "step": 1917 + }, + { + "epoch": 1.2836476146673914, + "grad_norm": 5.433165550231934, + "learning_rate": 3.5341875473862485e-05, + "loss": 2.8789, + "step": 1918 + }, + { + "epoch": 1.2843165948906634, + "grad_norm": 5.925359725952148, + "learning_rate": 3.5324142463904385e-05, + "loss": 3.0326, + "step": 1919 + }, + { + "epoch": 1.2849855751139356, + "grad_norm": 5.1778340339660645, + "learning_rate": 3.5306403189289725e-05, + "loss": 2.5914, + "step": 1920 + }, + { + "epoch": 1.2856545553372078, + "grad_norm": 5.54917049407959, + "learning_rate": 3.5288657660782615e-05, + "loss": 2.7516, + "step": 1921 + }, + { + "epoch": 1.28632353556048, + "grad_norm": 5.052066802978516, + "learning_rate": 3.5270905889151e-05, + "loss": 2.8725, + "step": 1922 + }, + { + "epoch": 1.286992515783752, + "grad_norm": 3.5192253589630127, + "learning_rate": 3.525314788516659e-05, + "loss": 2.695, + "step": 1923 + }, + { + "epoch": 1.2876614960070243, + "grad_norm": 4.60258674621582, + "learning_rate": 3.523538365960489e-05, + "loss": 2.8105, + "step": 1924 + }, + { + "epoch": 1.2883304762302965, + "grad_norm": 5.980016708374023, + "learning_rate": 3.5217613223245164e-05, + "loss": 2.8577, + "step": 1925 + }, + { + "epoch": 1.2889994564535687, + "grad_norm": 4.692354679107666, + "learning_rate": 3.519983658687047e-05, + "loss": 3.007, + "step": 1926 + }, + { + "epoch": 1.2896684366768407, + "grad_norm": 6.644775867462158, + "learning_rate": 3.518205376126762e-05, + "loss": 2.5909, + "step": 1927 + }, + { + "epoch": 1.290337416900113, + "grad_norm": 4.538536071777344, + "learning_rate": 3.516426475722715e-05, + "loss": 2.6458, + "step": 1928 + }, + { + "epoch": 1.2910063971233852, + "grad_norm": 4.779280662536621, + "learning_rate": 3.514646958554339e-05, + "loss": 2.7819, + "step": 1929 + }, + { + "epoch": 1.2916753773466572, + "grad_norm": 2.987003803253174, + "learning_rate": 3.512866825701439e-05, + "loss": 2.5834, + "step": 1930 + }, + { + "epoch": 1.2923443575699294, + "grad_norm": 4.973797798156738, + "learning_rate": 3.511086078244194e-05, + "loss": 2.9792, + "step": 1931 + }, + { + "epoch": 1.2930133377932016, + "grad_norm": 4.948237895965576, + "learning_rate": 3.5093047172631555e-05, + "loss": 3.0585, + "step": 1932 + }, + { + "epoch": 1.2936823180164736, + "grad_norm": 4.105443954467773, + "learning_rate": 3.507522743839247e-05, + "loss": 2.7073, + "step": 1933 + }, + { + "epoch": 1.2943512982397458, + "grad_norm": 5.659438610076904, + "learning_rate": 3.505740159053766e-05, + "loss": 2.785, + "step": 1934 + }, + { + "epoch": 1.295020278463018, + "grad_norm": 5.185451984405518, + "learning_rate": 3.5039569639883773e-05, + "loss": 2.9656, + "step": 1935 + }, + { + "epoch": 1.29568925868629, + "grad_norm": 5.640125751495361, + "learning_rate": 3.502173159725119e-05, + "loss": 2.9592, + "step": 1936 + }, + { + "epoch": 1.2963582389095623, + "grad_norm": 3.587552785873413, + "learning_rate": 3.5003887473463984e-05, + "loss": 2.4749, + "step": 1937 + }, + { + "epoch": 1.2970272191328345, + "grad_norm": 4.075209617614746, + "learning_rate": 3.498603727934991e-05, + "loss": 2.7787, + "step": 1938 + }, + { + "epoch": 1.2976961993561065, + "grad_norm": 5.543696403503418, + "learning_rate": 3.496818102574039e-05, + "loss": 2.8827, + "step": 1939 + }, + { + "epoch": 1.2983651795793787, + "grad_norm": 9.334135055541992, + "learning_rate": 3.4950318723470565e-05, + "loss": 2.8724, + "step": 1940 + }, + { + "epoch": 1.299034159802651, + "grad_norm": 5.707855701446533, + "learning_rate": 3.493245038337921e-05, + "loss": 2.9266, + "step": 1941 + }, + { + "epoch": 1.299703140025923, + "grad_norm": 7.300833702087402, + "learning_rate": 3.491457601630878e-05, + "loss": 3.1611, + "step": 1942 + }, + { + "epoch": 1.3003721202491951, + "grad_norm": 5.916547775268555, + "learning_rate": 3.489669563310538e-05, + "loss": 2.5979, + "step": 1943 + }, + { + "epoch": 1.3010411004724673, + "grad_norm": 5.269513130187988, + "learning_rate": 3.487880924461878e-05, + "loss": 2.723, + "step": 1944 + }, + { + "epoch": 1.3017100806957393, + "grad_norm": 5.020068645477295, + "learning_rate": 3.486091686170237e-05, + "loss": 2.7228, + "step": 1945 + }, + { + "epoch": 1.3023790609190116, + "grad_norm": 3.875847339630127, + "learning_rate": 3.48430184952132e-05, + "loss": 2.6033, + "step": 1946 + }, + { + "epoch": 1.3030480411422838, + "grad_norm": 4.772490501403809, + "learning_rate": 3.4825114156011934e-05, + "loss": 2.6238, + "step": 1947 + }, + { + "epoch": 1.3037170213655558, + "grad_norm": 4.842042922973633, + "learning_rate": 3.480720385496287e-05, + "loss": 2.5166, + "step": 1948 + }, + { + "epoch": 1.304386001588828, + "grad_norm": 4.124337196350098, + "learning_rate": 3.4789287602933936e-05, + "loss": 2.757, + "step": 1949 + }, + { + "epoch": 1.3050549818121002, + "grad_norm": 7.956783294677734, + "learning_rate": 3.477136541079663e-05, + "loss": 2.9116, + "step": 1950 + }, + { + "epoch": 1.3057239620353722, + "grad_norm": 8.2808837890625, + "learning_rate": 3.47534372894261e-05, + "loss": 3.0217, + "step": 1951 + }, + { + "epoch": 1.3063929422586444, + "grad_norm": 4.387228965759277, + "learning_rate": 3.4735503249701065e-05, + "loss": 2.8402, + "step": 1952 + }, + { + "epoch": 1.3070619224819167, + "grad_norm": 5.978759765625, + "learning_rate": 3.4717563302503844e-05, + "loss": 2.9344, + "step": 1953 + }, + { + "epoch": 1.3077309027051887, + "grad_norm": 6.487181186676025, + "learning_rate": 3.469961745872034e-05, + "loss": 2.7791, + "step": 1954 + }, + { + "epoch": 1.3083998829284609, + "grad_norm": 3.971789836883545, + "learning_rate": 3.4681665729240034e-05, + "loss": 2.5318, + "step": 1955 + }, + { + "epoch": 1.309068863151733, + "grad_norm": 6.075745105743408, + "learning_rate": 3.466370812495598e-05, + "loss": 2.8359, + "step": 1956 + }, + { + "epoch": 1.309737843375005, + "grad_norm": 5.87401819229126, + "learning_rate": 3.464574465676479e-05, + "loss": 2.86, + "step": 1957 + }, + { + "epoch": 1.3104068235982773, + "grad_norm": 9.17673110961914, + "learning_rate": 3.4627775335566636e-05, + "loss": 2.9832, + "step": 1958 + }, + { + "epoch": 1.3110758038215495, + "grad_norm": 7.885680198669434, + "learning_rate": 3.460980017226525e-05, + "loss": 3.0542, + "step": 1959 + }, + { + "epoch": 1.3117447840448218, + "grad_norm": 4.909152507781982, + "learning_rate": 3.459181917776792e-05, + "loss": 2.7076, + "step": 1960 + }, + { + "epoch": 1.3124137642680938, + "grad_norm": 7.326097011566162, + "learning_rate": 3.4573832362985424e-05, + "loss": 2.7299, + "step": 1961 + }, + { + "epoch": 1.313082744491366, + "grad_norm": 5.211129188537598, + "learning_rate": 3.455583973883212e-05, + "loss": 2.5845, + "step": 1962 + }, + { + "epoch": 1.3137517247146382, + "grad_norm": 7.045957088470459, + "learning_rate": 3.4537841316225885e-05, + "loss": 2.6112, + "step": 1963 + }, + { + "epoch": 1.3144207049379104, + "grad_norm": 6.861929416656494, + "learning_rate": 3.4519837106088074e-05, + "loss": 2.642, + "step": 1964 + }, + { + "epoch": 1.3150896851611824, + "grad_norm": 4.8646321296691895, + "learning_rate": 3.450182711934361e-05, + "loss": 2.6681, + "step": 1965 + }, + { + "epoch": 1.3157586653844546, + "grad_norm": 4.6669721603393555, + "learning_rate": 3.448381136692089e-05, + "loss": 2.7032, + "step": 1966 + }, + { + "epoch": 1.3164276456077268, + "grad_norm": 6.643599510192871, + "learning_rate": 3.446578985975182e-05, + "loss": 2.6507, + "step": 1967 + }, + { + "epoch": 1.3170966258309988, + "grad_norm": 4.169411659240723, + "learning_rate": 3.444776260877177e-05, + "loss": 2.8854, + "step": 1968 + }, + { + "epoch": 1.317765606054271, + "grad_norm": 6.282775402069092, + "learning_rate": 3.4429729624919644e-05, + "loss": 2.892, + "step": 1969 + }, + { + "epoch": 1.3184345862775433, + "grad_norm": 5.78385066986084, + "learning_rate": 3.4411690919137786e-05, + "loss": 2.689, + "step": 1970 + }, + { + "epoch": 1.3191035665008153, + "grad_norm": 5.781465530395508, + "learning_rate": 3.439364650237203e-05, + "loss": 2.8935, + "step": 1971 + }, + { + "epoch": 1.3197725467240875, + "grad_norm": 12.037712097167969, + "learning_rate": 3.437559638557166e-05, + "loss": 2.8092, + "step": 1972 + }, + { + "epoch": 1.3204415269473597, + "grad_norm": 5.438355445861816, + "learning_rate": 3.435754057968945e-05, + "loss": 2.7404, + "step": 1973 + }, + { + "epoch": 1.3211105071706317, + "grad_norm": 4.841614246368408, + "learning_rate": 3.433947909568158e-05, + "loss": 2.4909, + "step": 1974 + }, + { + "epoch": 1.321779487393904, + "grad_norm": 6.428897857666016, + "learning_rate": 3.432141194450772e-05, + "loss": 2.564, + "step": 1975 + }, + { + "epoch": 1.3224484676171762, + "grad_norm": 7.659292221069336, + "learning_rate": 3.430333913713095e-05, + "loss": 3.0917, + "step": 1976 + }, + { + "epoch": 1.3231174478404482, + "grad_norm": 3.677401065826416, + "learning_rate": 3.428526068451778e-05, + "loss": 2.3395, + "step": 1977 + }, + { + "epoch": 1.3237864280637204, + "grad_norm": 4.640537261962891, + "learning_rate": 3.4267176597638194e-05, + "loss": 2.8815, + "step": 1978 + }, + { + "epoch": 1.3244554082869926, + "grad_norm": 5.220210075378418, + "learning_rate": 3.424908688746552e-05, + "loss": 2.84, + "step": 1979 + }, + { + "epoch": 1.3251243885102646, + "grad_norm": 5.86066198348999, + "learning_rate": 3.423099156497655e-05, + "loss": 2.781, + "step": 1980 + }, + { + "epoch": 1.3257933687335368, + "grad_norm": 4.961976528167725, + "learning_rate": 3.421289064115147e-05, + "loss": 2.841, + "step": 1981 + }, + { + "epoch": 1.326462348956809, + "grad_norm": 5.830520153045654, + "learning_rate": 3.419478412697388e-05, + "loss": 3.0078, + "step": 1982 + }, + { + "epoch": 1.327131329180081, + "grad_norm": 5.7861762046813965, + "learning_rate": 3.4176672033430714e-05, + "loss": 2.804, + "step": 1983 + }, + { + "epoch": 1.3278003094033533, + "grad_norm": 4.089806079864502, + "learning_rate": 3.415855437151237e-05, + "loss": 2.4129, + "step": 1984 + }, + { + "epoch": 1.3284692896266255, + "grad_norm": 2.5153582096099854, + "learning_rate": 3.414043115221256e-05, + "loss": 2.461, + "step": 1985 + }, + { + "epoch": 1.3291382698498975, + "grad_norm": 6.85758638381958, + "learning_rate": 3.4122302386528404e-05, + "loss": 2.7807, + "step": 1986 + }, + { + "epoch": 1.3298072500731697, + "grad_norm": 5.164159774780273, + "learning_rate": 3.410416808546039e-05, + "loss": 2.868, + "step": 1987 + }, + { + "epoch": 1.330476230296442, + "grad_norm": 7.103829860687256, + "learning_rate": 3.4086028260012344e-05, + "loss": 2.7594, + "step": 1988 + }, + { + "epoch": 1.331145210519714, + "grad_norm": 3.77983021736145, + "learning_rate": 3.406788292119146e-05, + "loss": 2.5318, + "step": 1989 + }, + { + "epoch": 1.3318141907429861, + "grad_norm": 4.805266380310059, + "learning_rate": 3.404973208000826e-05, + "loss": 2.7652, + "step": 1990 + }, + { + "epoch": 1.3324831709662583, + "grad_norm": 4.950036525726318, + "learning_rate": 3.4031575747476624e-05, + "loss": 2.427, + "step": 1991 + }, + { + "epoch": 1.3331521511895303, + "grad_norm": 6.413376331329346, + "learning_rate": 3.401341393461376e-05, + "loss": 3.0278, + "step": 1992 + }, + { + "epoch": 1.3338211314128026, + "grad_norm": 3.3853354454040527, + "learning_rate": 3.3995246652440194e-05, + "loss": 2.5914, + "step": 1993 + }, + { + "epoch": 1.3344901116360748, + "grad_norm": 6.072537422180176, + "learning_rate": 3.397707391197977e-05, + "loss": 2.8304, + "step": 1994 + }, + { + "epoch": 1.3351590918593468, + "grad_norm": 7.784000396728516, + "learning_rate": 3.395889572425965e-05, + "loss": 2.7672, + "step": 1995 + }, + { + "epoch": 1.335828072082619, + "grad_norm": 4.891425609588623, + "learning_rate": 3.3940712100310315e-05, + "loss": 2.6594, + "step": 1996 + }, + { + "epoch": 1.3364970523058912, + "grad_norm": 4.242300510406494, + "learning_rate": 3.3922523051165515e-05, + "loss": 2.6352, + "step": 1997 + }, + { + "epoch": 1.3371660325291634, + "grad_norm": 6.101925849914551, + "learning_rate": 3.39043285878623e-05, + "loss": 3.0095, + "step": 1998 + }, + { + "epoch": 1.3378350127524354, + "grad_norm": 4.063016891479492, + "learning_rate": 3.388612872144104e-05, + "loss": 2.6412, + "step": 1999 + }, + { + "epoch": 1.3385039929757077, + "grad_norm": 5.656928062438965, + "learning_rate": 3.386792346294532e-05, + "loss": 2.8988, + "step": 2000 + }, + { + "epoch": 1.3391729731989799, + "grad_norm": 4.359591007232666, + "learning_rate": 3.384971282342206e-05, + "loss": 2.7349, + "step": 2001 + }, + { + "epoch": 1.3398419534222519, + "grad_norm": 4.76541805267334, + "learning_rate": 3.38314968139214e-05, + "loss": 2.8656, + "step": 2002 + }, + { + "epoch": 1.340510933645524, + "grad_norm": 6.233043193817139, + "learning_rate": 3.3813275445496764e-05, + "loss": 2.7234, + "step": 2003 + }, + { + "epoch": 1.3411799138687963, + "grad_norm": 4.522028923034668, + "learning_rate": 3.379504872920483e-05, + "loss": 2.6152, + "step": 2004 + }, + { + "epoch": 1.3418488940920685, + "grad_norm": 5.168178081512451, + "learning_rate": 3.3776816676105495e-05, + "loss": 2.8025, + "step": 2005 + }, + { + "epoch": 1.3425178743153405, + "grad_norm": 4.653557300567627, + "learning_rate": 3.375857929726191e-05, + "loss": 2.8364, + "step": 2006 + }, + { + "epoch": 1.3431868545386128, + "grad_norm": 5.8500261306762695, + "learning_rate": 3.374033660374047e-05, + "loss": 2.8903, + "step": 2007 + }, + { + "epoch": 1.343855834761885, + "grad_norm": 5.529808521270752, + "learning_rate": 3.3722088606610784e-05, + "loss": 2.8225, + "step": 2008 + }, + { + "epoch": 1.344524814985157, + "grad_norm": 4.344649314880371, + "learning_rate": 3.3703835316945665e-05, + "loss": 2.6386, + "step": 2009 + }, + { + "epoch": 1.3451937952084292, + "grad_norm": 5.259212970733643, + "learning_rate": 3.368557674582116e-05, + "loss": 2.8751, + "step": 2010 + }, + { + "epoch": 1.3458627754317014, + "grad_norm": 5.934889793395996, + "learning_rate": 3.3667312904316506e-05, + "loss": 2.7374, + "step": 2011 + }, + { + "epoch": 1.3465317556549734, + "grad_norm": 5.568259239196777, + "learning_rate": 3.364904380351415e-05, + "loss": 2.824, + "step": 2012 + }, + { + "epoch": 1.3472007358782456, + "grad_norm": 7.427395820617676, + "learning_rate": 3.363076945449971e-05, + "loss": 2.8405, + "step": 2013 + }, + { + "epoch": 1.3478697161015178, + "grad_norm": 6.825466632843018, + "learning_rate": 3.3612489868362017e-05, + "loss": 2.716, + "step": 2014 + }, + { + "epoch": 1.3485386963247898, + "grad_norm": 5.038285732269287, + "learning_rate": 3.3594205056193065e-05, + "loss": 2.5977, + "step": 2015 + }, + { + "epoch": 1.349207676548062, + "grad_norm": 5.154592037200928, + "learning_rate": 3.357591502908802e-05, + "loss": 2.791, + "step": 2016 + }, + { + "epoch": 1.3498766567713343, + "grad_norm": 4.823801517486572, + "learning_rate": 3.35576197981452e-05, + "loss": 2.9221, + "step": 2017 + }, + { + "epoch": 1.3505456369946063, + "grad_norm": 5.911928653717041, + "learning_rate": 3.35393193744661e-05, + "loss": 2.8327, + "step": 2018 + }, + { + "epoch": 1.3512146172178785, + "grad_norm": 4.692645072937012, + "learning_rate": 3.352101376915536e-05, + "loss": 2.6108, + "step": 2019 + }, + { + "epoch": 1.3518835974411507, + "grad_norm": 4.444061756134033, + "learning_rate": 3.3502702993320754e-05, + "loss": 2.6292, + "step": 2020 + }, + { + "epoch": 1.3525525776644227, + "grad_norm": 6.115309715270996, + "learning_rate": 3.348438705807322e-05, + "loss": 2.9655, + "step": 2021 + }, + { + "epoch": 1.353221557887695, + "grad_norm": 5.134885311126709, + "learning_rate": 3.3466065974526794e-05, + "loss": 2.711, + "step": 2022 + }, + { + "epoch": 1.3538905381109672, + "grad_norm": 5.172490119934082, + "learning_rate": 3.344773975379865e-05, + "loss": 2.5484, + "step": 2023 + }, + { + "epoch": 1.3545595183342392, + "grad_norm": 3.98433256149292, + "learning_rate": 3.3429408407009086e-05, + "loss": 2.672, + "step": 2024 + }, + { + "epoch": 1.3552284985575114, + "grad_norm": 5.418702125549316, + "learning_rate": 3.3411071945281515e-05, + "loss": 2.7459, + "step": 2025 + }, + { + "epoch": 1.3558974787807836, + "grad_norm": 6.298463344573975, + "learning_rate": 3.339273037974241e-05, + "loss": 2.7903, + "step": 2026 + }, + { + "epoch": 1.3565664590040556, + "grad_norm": 6.452141761779785, + "learning_rate": 3.337438372152141e-05, + "loss": 2.7773, + "step": 2027 + }, + { + "epoch": 1.3572354392273278, + "grad_norm": 3.5536160469055176, + "learning_rate": 3.335603198175119e-05, + "loss": 2.4353, + "step": 2028 + }, + { + "epoch": 1.3579044194506, + "grad_norm": 6.420649528503418, + "learning_rate": 3.333767517156754e-05, + "loss": 2.7223, + "step": 2029 + }, + { + "epoch": 1.358573399673872, + "grad_norm": 6.5228095054626465, + "learning_rate": 3.33193133021093e-05, + "loss": 3.1202, + "step": 2030 + }, + { + "epoch": 1.3592423798971442, + "grad_norm": 5.087987422943115, + "learning_rate": 3.330094638451839e-05, + "loss": 2.9592, + "step": 2031 + }, + { + "epoch": 1.3599113601204165, + "grad_norm": 5.699573040008545, + "learning_rate": 3.32825744299398e-05, + "loss": 2.7894, + "step": 2032 + }, + { + "epoch": 1.3605803403436885, + "grad_norm": 3.749000310897827, + "learning_rate": 3.32641974495216e-05, + "loss": 2.6215, + "step": 2033 + }, + { + "epoch": 1.3612493205669607, + "grad_norm": 4.384515285491943, + "learning_rate": 3.324581545441485e-05, + "loss": 2.53, + "step": 2034 + }, + { + "epoch": 1.361918300790233, + "grad_norm": 4.807569980621338, + "learning_rate": 3.3227428455773694e-05, + "loss": 2.8052, + "step": 2035 + }, + { + "epoch": 1.362587281013505, + "grad_norm": 5.514527797698975, + "learning_rate": 3.320903646475531e-05, + "loss": 2.7406, + "step": 2036 + }, + { + "epoch": 1.3632562612367771, + "grad_norm": 3.7961368560791016, + "learning_rate": 3.319063949251989e-05, + "loss": 2.6709, + "step": 2037 + }, + { + "epoch": 1.3639252414600493, + "grad_norm": 4.431958198547363, + "learning_rate": 3.3172237550230666e-05, + "loss": 2.756, + "step": 2038 + }, + { + "epoch": 1.3645942216833216, + "grad_norm": 5.05426025390625, + "learning_rate": 3.315383064905388e-05, + "loss": 2.9408, + "step": 2039 + }, + { + "epoch": 1.3652632019065936, + "grad_norm": 6.362969398498535, + "learning_rate": 3.313541880015877e-05, + "loss": 2.4547, + "step": 2040 + }, + { + "epoch": 1.3659321821298658, + "grad_norm": 6.6766252517700195, + "learning_rate": 3.3117002014717604e-05, + "loss": 2.9067, + "step": 2041 + }, + { + "epoch": 1.366601162353138, + "grad_norm": 4.779316425323486, + "learning_rate": 3.30985803039056e-05, + "loss": 2.83, + "step": 2042 + }, + { + "epoch": 1.3672701425764102, + "grad_norm": 4.906229019165039, + "learning_rate": 3.308015367890102e-05, + "loss": 2.6688, + "step": 2043 + }, + { + "epoch": 1.3679391227996822, + "grad_norm": 6.010954856872559, + "learning_rate": 3.306172215088508e-05, + "loss": 2.6788, + "step": 2044 + }, + { + "epoch": 1.3686081030229544, + "grad_norm": 6.692445278167725, + "learning_rate": 3.304328573104195e-05, + "loss": 2.8227, + "step": 2045 + }, + { + "epoch": 1.3692770832462267, + "grad_norm": 5.441335678100586, + "learning_rate": 3.302484443055881e-05, + "loss": 2.6127, + "step": 2046 + }, + { + "epoch": 1.3699460634694987, + "grad_norm": 7.512343883514404, + "learning_rate": 3.3006398260625774e-05, + "loss": 2.8606, + "step": 2047 + }, + { + "epoch": 1.3706150436927709, + "grad_norm": 5.280481815338135, + "learning_rate": 3.298794723243592e-05, + "loss": 2.6278, + "step": 2048 + }, + { + "epoch": 1.371284023916043, + "grad_norm": 5.495764255523682, + "learning_rate": 3.2969491357185275e-05, + "loss": 2.771, + "step": 2049 + }, + { + "epoch": 1.371953004139315, + "grad_norm": 8.037247657775879, + "learning_rate": 3.295103064607281e-05, + "loss": 2.725, + "step": 2050 + }, + { + "epoch": 1.3726219843625873, + "grad_norm": 4.578863620758057, + "learning_rate": 3.2932565110300415e-05, + "loss": 2.8497, + "step": 2051 + }, + { + "epoch": 1.3732909645858595, + "grad_norm": 4.044233798980713, + "learning_rate": 3.2914094761072914e-05, + "loss": 2.6499, + "step": 2052 + }, + { + "epoch": 1.3739599448091315, + "grad_norm": 5.420017242431641, + "learning_rate": 3.2895619609598075e-05, + "loss": 3.1044, + "step": 2053 + }, + { + "epoch": 1.3746289250324037, + "grad_norm": 5.717971324920654, + "learning_rate": 3.2877139667086534e-05, + "loss": 2.674, + "step": 2054 + }, + { + "epoch": 1.375297905255676, + "grad_norm": 5.100498199462891, + "learning_rate": 3.285865494475189e-05, + "loss": 2.5895, + "step": 2055 + }, + { + "epoch": 1.375966885478948, + "grad_norm": 4.620347023010254, + "learning_rate": 3.28401654538106e-05, + "loss": 2.6014, + "step": 2056 + }, + { + "epoch": 1.3766358657022202, + "grad_norm": 6.425408840179443, + "learning_rate": 3.2821671205482026e-05, + "loss": 2.7834, + "step": 2057 + }, + { + "epoch": 1.3773048459254924, + "grad_norm": 4.689121723175049, + "learning_rate": 3.280317221098842e-05, + "loss": 2.7513, + "step": 2058 + }, + { + "epoch": 1.3779738261487644, + "grad_norm": 4.472986221313477, + "learning_rate": 3.278466848155491e-05, + "loss": 2.7845, + "step": 2059 + }, + { + "epoch": 1.3786428063720366, + "grad_norm": 5.076809406280518, + "learning_rate": 3.27661600284095e-05, + "loss": 2.7951, + "step": 2060 + }, + { + "epoch": 1.3793117865953088, + "grad_norm": 5.643988132476807, + "learning_rate": 3.274764686278307e-05, + "loss": 2.959, + "step": 2061 + }, + { + "epoch": 1.3799807668185808, + "grad_norm": 5.873498439788818, + "learning_rate": 3.272912899590934e-05, + "loss": 2.6622, + "step": 2062 + }, + { + "epoch": 1.380649747041853, + "grad_norm": 5.192942142486572, + "learning_rate": 3.2710606439024896e-05, + "loss": 2.8687, + "step": 2063 + }, + { + "epoch": 1.3813187272651253, + "grad_norm": 4.184119701385498, + "learning_rate": 3.2692079203369156e-05, + "loss": 2.8474, + "step": 2064 + }, + { + "epoch": 1.3819877074883973, + "grad_norm": 4.882514476776123, + "learning_rate": 3.2673547300184404e-05, + "loss": 2.6392, + "step": 2065 + }, + { + "epoch": 1.3826566877116695, + "grad_norm": 6.144156455993652, + "learning_rate": 3.2655010740715736e-05, + "loss": 2.7037, + "step": 2066 + }, + { + "epoch": 1.3833256679349417, + "grad_norm": 4.782991409301758, + "learning_rate": 3.263646953621106e-05, + "loss": 2.7595, + "step": 2067 + }, + { + "epoch": 1.3839946481582137, + "grad_norm": 4.191819190979004, + "learning_rate": 3.261792369792114e-05, + "loss": 2.5515, + "step": 2068 + }, + { + "epoch": 1.384663628381486, + "grad_norm": 5.204425811767578, + "learning_rate": 3.259937323709952e-05, + "loss": 2.8186, + "step": 2069 + }, + { + "epoch": 1.3853326086047582, + "grad_norm": 4.756034851074219, + "learning_rate": 3.258081816500257e-05, + "loss": 2.7698, + "step": 2070 + }, + { + "epoch": 1.3860015888280302, + "grad_norm": 5.133122444152832, + "learning_rate": 3.256225849288943e-05, + "loss": 2.8698, + "step": 2071 + }, + { + "epoch": 1.3866705690513024, + "grad_norm": 6.972858905792236, + "learning_rate": 3.254369423202207e-05, + "loss": 2.9405, + "step": 2072 + }, + { + "epoch": 1.3873395492745746, + "grad_norm": 4.848676681518555, + "learning_rate": 3.2525125393665216e-05, + "loss": 2.7072, + "step": 2073 + }, + { + "epoch": 1.3880085294978466, + "grad_norm": 5.23881196975708, + "learning_rate": 3.2506551989086374e-05, + "loss": 2.8509, + "step": 2074 + }, + { + "epoch": 1.3886775097211188, + "grad_norm": 4.224327564239502, + "learning_rate": 3.248797402955583e-05, + "loss": 2.6614, + "step": 2075 + }, + { + "epoch": 1.389346489944391, + "grad_norm": 6.613017559051514, + "learning_rate": 3.246939152634664e-05, + "loss": 2.9236, + "step": 2076 + }, + { + "epoch": 1.3900154701676632, + "grad_norm": 5.5714874267578125, + "learning_rate": 3.245080449073459e-05, + "loss": 2.7773, + "step": 2077 + }, + { + "epoch": 1.3906844503909352, + "grad_norm": 5.062618732452393, + "learning_rate": 3.243221293399825e-05, + "loss": 2.9224, + "step": 2078 + }, + { + "epoch": 1.3913534306142075, + "grad_norm": 4.741576671600342, + "learning_rate": 3.2413616867418904e-05, + "loss": 2.5638, + "step": 2079 + }, + { + "epoch": 1.3920224108374797, + "grad_norm": 6.06392240524292, + "learning_rate": 3.23950163022806e-05, + "loss": 2.7167, + "step": 2080 + }, + { + "epoch": 1.392691391060752, + "grad_norm": 6.1369733810424805, + "learning_rate": 3.2376411249870085e-05, + "loss": 2.5687, + "step": 2081 + }, + { + "epoch": 1.393360371284024, + "grad_norm": 6.010268211364746, + "learning_rate": 3.2357801721476854e-05, + "loss": 3.0348, + "step": 2082 + }, + { + "epoch": 1.3940293515072961, + "grad_norm": 5.525701522827148, + "learning_rate": 3.23391877283931e-05, + "loss": 2.7513, + "step": 2083 + }, + { + "epoch": 1.3946983317305683, + "grad_norm": 5.653034687042236, + "learning_rate": 3.232056928191376e-05, + "loss": 2.5086, + "step": 2084 + }, + { + "epoch": 1.3953673119538403, + "grad_norm": 5.4909772872924805, + "learning_rate": 3.230194639333642e-05, + "loss": 2.8214, + "step": 2085 + }, + { + "epoch": 1.3960362921771126, + "grad_norm": 4.054011344909668, + "learning_rate": 3.228331907396141e-05, + "loss": 2.6106, + "step": 2086 + }, + { + "epoch": 1.3967052724003848, + "grad_norm": 5.332735061645508, + "learning_rate": 3.2264687335091696e-05, + "loss": 2.8631, + "step": 2087 + }, + { + "epoch": 1.3973742526236568, + "grad_norm": 7.368805885314941, + "learning_rate": 3.2246051188033e-05, + "loss": 2.9556, + "step": 2088 + }, + { + "epoch": 1.398043232846929, + "grad_norm": 4.063880443572998, + "learning_rate": 3.222741064409364e-05, + "loss": 2.7505, + "step": 2089 + }, + { + "epoch": 1.3987122130702012, + "grad_norm": 7.633145332336426, + "learning_rate": 3.220876571458466e-05, + "loss": 2.8162, + "step": 2090 + }, + { + "epoch": 1.3993811932934732, + "grad_norm": 4.736606121063232, + "learning_rate": 3.219011641081974e-05, + "loss": 2.4496, + "step": 2091 + }, + { + "epoch": 1.4000501735167454, + "grad_norm": 6.102734565734863, + "learning_rate": 3.217146274411521e-05, + "loss": 3.0237, + "step": 2092 + }, + { + "epoch": 1.4007191537400177, + "grad_norm": 3.788506507873535, + "learning_rate": 3.215280472579006e-05, + "loss": 2.3751, + "step": 2093 + }, + { + "epoch": 1.4013881339632897, + "grad_norm": 4.817192077636719, + "learning_rate": 3.2134142367165916e-05, + "loss": 2.875, + "step": 2094 + }, + { + "epoch": 1.4020571141865619, + "grad_norm": 5.458423137664795, + "learning_rate": 3.211547567956704e-05, + "loss": 2.9478, + "step": 2095 + }, + { + "epoch": 1.402726094409834, + "grad_norm": 3.884122610092163, + "learning_rate": 3.2096804674320305e-05, + "loss": 2.6837, + "step": 2096 + }, + { + "epoch": 1.403395074633106, + "grad_norm": 4.676953315734863, + "learning_rate": 3.2078129362755236e-05, + "loss": 2.759, + "step": 2097 + }, + { + "epoch": 1.4040640548563783, + "grad_norm": 4.61336088180542, + "learning_rate": 3.205944975620394e-05, + "loss": 2.8484, + "step": 2098 + }, + { + "epoch": 1.4047330350796505, + "grad_norm": 4.984833240509033, + "learning_rate": 3.2040765866001157e-05, + "loss": 3.1608, + "step": 2099 + }, + { + "epoch": 1.4054020153029225, + "grad_norm": 8.212212562561035, + "learning_rate": 3.202207770348419e-05, + "loss": 2.7521, + "step": 2100 + }, + { + "epoch": 1.4060709955261947, + "grad_norm": 4.495399475097656, + "learning_rate": 3.200338527999296e-05, + "loss": 2.6543, + "step": 2101 + }, + { + "epoch": 1.406739975749467, + "grad_norm": 4.785201072692871, + "learning_rate": 3.198468860686999e-05, + "loss": 2.6804, + "step": 2102 + }, + { + "epoch": 1.407408955972739, + "grad_norm": 5.067458152770996, + "learning_rate": 3.196598769546034e-05, + "loss": 2.5417, + "step": 2103 + }, + { + "epoch": 1.4080779361960112, + "grad_norm": 6.252809047698975, + "learning_rate": 3.194728255711167e-05, + "loss": 2.792, + "step": 2104 + }, + { + "epoch": 1.4087469164192834, + "grad_norm": 5.277778625488281, + "learning_rate": 3.1928573203174206e-05, + "loss": 2.7697, + "step": 2105 + }, + { + "epoch": 1.4094158966425554, + "grad_norm": 4.683997631072998, + "learning_rate": 3.1909859645000714e-05, + "loss": 2.873, + "step": 2106 + }, + { + "epoch": 1.4100848768658276, + "grad_norm": 4.702674388885498, + "learning_rate": 3.189114189394653e-05, + "loss": 2.8581, + "step": 2107 + }, + { + "epoch": 1.4107538570890998, + "grad_norm": 5.353943347930908, + "learning_rate": 3.187241996136951e-05, + "loss": 2.6926, + "step": 2108 + }, + { + "epoch": 1.4114228373123718, + "grad_norm": 4.439987659454346, + "learning_rate": 3.185369385863007e-05, + "loss": 2.7103, + "step": 2109 + }, + { + "epoch": 1.412091817535644, + "grad_norm": 5.188424110412598, + "learning_rate": 3.1834963597091165e-05, + "loss": 2.7763, + "step": 2110 + }, + { + "epoch": 1.4127607977589163, + "grad_norm": 3.740800380706787, + "learning_rate": 3.181622918811824e-05, + "loss": 2.5914, + "step": 2111 + }, + { + "epoch": 1.4134297779821883, + "grad_norm": 5.433175563812256, + "learning_rate": 3.179749064307927e-05, + "loss": 2.6397, + "step": 2112 + }, + { + "epoch": 1.4140987582054605, + "grad_norm": 6.00779390335083, + "learning_rate": 3.177874797334477e-05, + "loss": 2.5472, + "step": 2113 + }, + { + "epoch": 1.4147677384287327, + "grad_norm": 5.701888084411621, + "learning_rate": 3.17600011902877e-05, + "loss": 2.8456, + "step": 2114 + }, + { + "epoch": 1.415436718652005, + "grad_norm": 5.572426795959473, + "learning_rate": 3.1741250305283566e-05, + "loss": 2.7787, + "step": 2115 + }, + { + "epoch": 1.416105698875277, + "grad_norm": 5.159278869628906, + "learning_rate": 3.172249532971033e-05, + "loss": 2.7681, + "step": 2116 + }, + { + "epoch": 1.4167746790985492, + "grad_norm": 7.181278705596924, + "learning_rate": 3.170373627494848e-05, + "loss": 2.7483, + "step": 2117 + }, + { + "epoch": 1.4174436593218214, + "grad_norm": 5.110281944274902, + "learning_rate": 3.1684973152380934e-05, + "loss": 2.6859, + "step": 2118 + }, + { + "epoch": 1.4181126395450934, + "grad_norm": 6.126352310180664, + "learning_rate": 3.1666205973393084e-05, + "loss": 2.5781, + "step": 2119 + }, + { + "epoch": 1.4187816197683656, + "grad_norm": 4.8328118324279785, + "learning_rate": 3.1647434749372804e-05, + "loss": 2.6383, + "step": 2120 + }, + { + "epoch": 1.4194505999916378, + "grad_norm": 4.447200298309326, + "learning_rate": 3.162865949171042e-05, + "loss": 2.5225, + "step": 2121 + }, + { + "epoch": 1.42011958021491, + "grad_norm": 4.1558074951171875, + "learning_rate": 3.160988021179868e-05, + "loss": 2.7313, + "step": 2122 + }, + { + "epoch": 1.420788560438182, + "grad_norm": 3.8075015544891357, + "learning_rate": 3.15910969210328e-05, + "loss": 2.6495, + "step": 2123 + }, + { + "epoch": 1.4214575406614542, + "grad_norm": 6.061837196350098, + "learning_rate": 3.1572309630810434e-05, + "loss": 2.8919, + "step": 2124 + }, + { + "epoch": 1.4221265208847265, + "grad_norm": 4.573391437530518, + "learning_rate": 3.155351835253163e-05, + "loss": 2.7806, + "step": 2125 + }, + { + "epoch": 1.4227955011079985, + "grad_norm": 4.304897308349609, + "learning_rate": 3.153472309759888e-05, + "loss": 2.613, + "step": 2126 + }, + { + "epoch": 1.4234644813312707, + "grad_norm": 4.302021026611328, + "learning_rate": 3.15159238774171e-05, + "loss": 2.7939, + "step": 2127 + }, + { + "epoch": 1.424133461554543, + "grad_norm": 5.093643665313721, + "learning_rate": 3.1497120703393576e-05, + "loss": 2.6664, + "step": 2128 + }, + { + "epoch": 1.424802441777815, + "grad_norm": 6.002468109130859, + "learning_rate": 3.1478313586938025e-05, + "loss": 2.543, + "step": 2129 + }, + { + "epoch": 1.4254714220010871, + "grad_norm": 6.307436466217041, + "learning_rate": 3.1459502539462536e-05, + "loss": 2.7632, + "step": 2130 + }, + { + "epoch": 1.4261404022243593, + "grad_norm": 5.699851036071777, + "learning_rate": 3.14406875723816e-05, + "loss": 2.5711, + "step": 2131 + }, + { + "epoch": 1.4268093824476313, + "grad_norm": 6.780220985412598, + "learning_rate": 3.1421868697112084e-05, + "loss": 2.6382, + "step": 2132 + }, + { + "epoch": 1.4274783626709036, + "grad_norm": 6.741306304931641, + "learning_rate": 3.140304592507321e-05, + "loss": 2.6952, + "step": 2133 + }, + { + "epoch": 1.4281473428941758, + "grad_norm": 5.3856096267700195, + "learning_rate": 3.138421926768658e-05, + "loss": 2.6026, + "step": 2134 + }, + { + "epoch": 1.4288163231174478, + "grad_norm": 4.4347310066223145, + "learning_rate": 3.136538873637615e-05, + "loss": 2.4124, + "step": 2135 + }, + { + "epoch": 1.42948530334072, + "grad_norm": 5.228308200836182, + "learning_rate": 3.134655434256822e-05, + "loss": 3.0497, + "step": 2136 + }, + { + "epoch": 1.4301542835639922, + "grad_norm": 3.872063636779785, + "learning_rate": 3.132771609769145e-05, + "loss": 2.684, + "step": 2137 + }, + { + "epoch": 1.4308232637872642, + "grad_norm": 4.839834690093994, + "learning_rate": 3.130887401317682e-05, + "loss": 2.8611, + "step": 2138 + }, + { + "epoch": 1.4314922440105364, + "grad_norm": 3.6270556449890137, + "learning_rate": 3.129002810045765e-05, + "loss": 2.8794, + "step": 2139 + }, + { + "epoch": 1.4321612242338086, + "grad_norm": 4.208321571350098, + "learning_rate": 3.127117837096958e-05, + "loss": 2.8055, + "step": 2140 + }, + { + "epoch": 1.4328302044570806, + "grad_norm": 6.322272777557373, + "learning_rate": 3.125232483615056e-05, + "loss": 2.6562, + "step": 2141 + }, + { + "epoch": 1.4334991846803529, + "grad_norm": 5.087634563446045, + "learning_rate": 3.123346750744086e-05, + "loss": 2.8283, + "step": 2142 + }, + { + "epoch": 1.434168164903625, + "grad_norm": 5.131560802459717, + "learning_rate": 3.1214606396283044e-05, + "loss": 3.0144, + "step": 2143 + }, + { + "epoch": 1.434837145126897, + "grad_norm": 6.122884273529053, + "learning_rate": 3.119574151412197e-05, + "loss": 2.7595, + "step": 2144 + }, + { + "epoch": 1.4355061253501693, + "grad_norm": 7.18737268447876, + "learning_rate": 3.11768728724048e-05, + "loss": 3.0811, + "step": 2145 + }, + { + "epoch": 1.4361751055734415, + "grad_norm": 4.818100452423096, + "learning_rate": 3.115800048258096e-05, + "loss": 2.6299, + "step": 2146 + }, + { + "epoch": 1.4368440857967135, + "grad_norm": 4.766518592834473, + "learning_rate": 3.1139124356102145e-05, + "loss": 2.8003, + "step": 2147 + }, + { + "epoch": 1.4375130660199857, + "grad_norm": 7.5392937660217285, + "learning_rate": 3.112024450442234e-05, + "loss": 2.7066, + "step": 2148 + }, + { + "epoch": 1.438182046243258, + "grad_norm": 6.259847640991211, + "learning_rate": 3.110136093899777e-05, + "loss": 3.0412, + "step": 2149 + }, + { + "epoch": 1.43885102646653, + "grad_norm": 5.854325771331787, + "learning_rate": 3.108247367128694e-05, + "loss": 2.7265, + "step": 2150 + }, + { + "epoch": 1.4395200066898022, + "grad_norm": 8.203265190124512, + "learning_rate": 3.106358271275056e-05, + "loss": 2.8515, + "step": 2151 + }, + { + "epoch": 1.4401889869130744, + "grad_norm": 4.087404727935791, + "learning_rate": 3.1044688074851615e-05, + "loss": 2.6054, + "step": 2152 + }, + { + "epoch": 1.4408579671363464, + "grad_norm": 4.96695613861084, + "learning_rate": 3.102578976905531e-05, + "loss": 2.6986, + "step": 2153 + }, + { + "epoch": 1.4415269473596186, + "grad_norm": 4.936301231384277, + "learning_rate": 3.1006887806829085e-05, + "loss": 2.6312, + "step": 2154 + }, + { + "epoch": 1.4421959275828908, + "grad_norm": 9.694040298461914, + "learning_rate": 3.098798219964257e-05, + "loss": 2.9009, + "step": 2155 + }, + { + "epoch": 1.442864907806163, + "grad_norm": 8.300200462341309, + "learning_rate": 3.096907295896764e-05, + "loss": 2.7605, + "step": 2156 + }, + { + "epoch": 1.443533888029435, + "grad_norm": 4.935064792633057, + "learning_rate": 3.0950160096278364e-05, + "loss": 2.8497, + "step": 2157 + }, + { + "epoch": 1.4442028682527073, + "grad_norm": 5.755708694458008, + "learning_rate": 3.0931243623051e-05, + "loss": 2.9343, + "step": 2158 + }, + { + "epoch": 1.4448718484759795, + "grad_norm": 5.971693515777588, + "learning_rate": 3.0912323550764e-05, + "loss": 2.7599, + "step": 2159 + }, + { + "epoch": 1.4455408286992517, + "grad_norm": 5.377431392669678, + "learning_rate": 3.0893399890898014e-05, + "loss": 2.7569, + "step": 2160 + }, + { + "epoch": 1.4462098089225237, + "grad_norm": 5.563959121704102, + "learning_rate": 3.087447265493586e-05, + "loss": 2.6611, + "step": 2161 + }, + { + "epoch": 1.446878789145796, + "grad_norm": 8.300984382629395, + "learning_rate": 3.08555418543625e-05, + "loss": 2.5863, + "step": 2162 + }, + { + "epoch": 1.4475477693690681, + "grad_norm": 4.10382604598999, + "learning_rate": 3.083660750066511e-05, + "loss": 2.8028, + "step": 2163 + }, + { + "epoch": 1.4482167495923401, + "grad_norm": 5.85797119140625, + "learning_rate": 3.081766960533299e-05, + "loss": 2.8329, + "step": 2164 + }, + { + "epoch": 1.4488857298156124, + "grad_norm": 6.761934280395508, + "learning_rate": 3.0798728179857584e-05, + "loss": 2.7883, + "step": 2165 + }, + { + "epoch": 1.4495547100388846, + "grad_norm": 5.8419976234436035, + "learning_rate": 3.0779783235732495e-05, + "loss": 2.7093, + "step": 2166 + }, + { + "epoch": 1.4502236902621566, + "grad_norm": 6.216766357421875, + "learning_rate": 3.0760834784453453e-05, + "loss": 2.6303, + "step": 2167 + }, + { + "epoch": 1.4508926704854288, + "grad_norm": 7.233863830566406, + "learning_rate": 3.074188283751832e-05, + "loss": 2.5462, + "step": 2168 + }, + { + "epoch": 1.451561650708701, + "grad_norm": 7.057400226593018, + "learning_rate": 3.072292740642707e-05, + "loss": 2.6736, + "step": 2169 + }, + { + "epoch": 1.452230630931973, + "grad_norm": 5.978362083435059, + "learning_rate": 3.070396850268181e-05, + "loss": 2.7756, + "step": 2170 + }, + { + "epoch": 1.4528996111552452, + "grad_norm": 6.943479537963867, + "learning_rate": 3.0685006137786726e-05, + "loss": 3.0649, + "step": 2171 + }, + { + "epoch": 1.4535685913785175, + "grad_norm": 7.135985374450684, + "learning_rate": 3.066604032324813e-05, + "loss": 2.8992, + "step": 2172 + }, + { + "epoch": 1.4542375716017895, + "grad_norm": 3.9003190994262695, + "learning_rate": 3.064707107057443e-05, + "loss": 2.4526, + "step": 2173 + }, + { + "epoch": 1.4549065518250617, + "grad_norm": 7.062236785888672, + "learning_rate": 3.062809839127607e-05, + "loss": 3.088, + "step": 2174 + }, + { + "epoch": 1.455575532048334, + "grad_norm": 5.225010395050049, + "learning_rate": 3.060912229686565e-05, + "loss": 2.6068, + "step": 2175 + }, + { + "epoch": 1.456244512271606, + "grad_norm": 5.395065784454346, + "learning_rate": 3.059014279885779e-05, + "loss": 2.7208, + "step": 2176 + }, + { + "epoch": 1.4569134924948781, + "grad_norm": 5.0800275802612305, + "learning_rate": 3.057115990876918e-05, + "loss": 2.6391, + "step": 2177 + }, + { + "epoch": 1.4575824727181503, + "grad_norm": 8.030608177185059, + "learning_rate": 3.055217363811859e-05, + "loss": 2.9492, + "step": 2178 + }, + { + "epoch": 1.4582514529414223, + "grad_norm": 5.007354259490967, + "learning_rate": 3.053318399842682e-05, + "loss": 2.3849, + "step": 2179 + }, + { + "epoch": 1.4589204331646946, + "grad_norm": 5.231847763061523, + "learning_rate": 3.0514191001216724e-05, + "loss": 2.831, + "step": 2180 + }, + { + "epoch": 1.4595894133879668, + "grad_norm": 5.7164306640625, + "learning_rate": 3.0495194658013194e-05, + "loss": 2.7224, + "step": 2181 + }, + { + "epoch": 1.4602583936112388, + "grad_norm": 5.191862106323242, + "learning_rate": 3.047619498034314e-05, + "loss": 2.7955, + "step": 2182 + }, + { + "epoch": 1.460927373834511, + "grad_norm": 5.349247932434082, + "learning_rate": 3.0457191979735528e-05, + "loss": 2.9513, + "step": 2183 + }, + { + "epoch": 1.4615963540577832, + "grad_norm": 4.896350383758545, + "learning_rate": 3.04381856677213e-05, + "loss": 2.7237, + "step": 2184 + }, + { + "epoch": 1.4622653342810552, + "grad_norm": 5.641765117645264, + "learning_rate": 3.0419176055833426e-05, + "loss": 2.5212, + "step": 2185 + }, + { + "epoch": 1.4629343145043274, + "grad_norm": 6.158010959625244, + "learning_rate": 3.0400163155606887e-05, + "loss": 2.9407, + "step": 2186 + }, + { + "epoch": 1.4636032947275996, + "grad_norm": 4.178887367248535, + "learning_rate": 3.0381146978578633e-05, + "loss": 2.5664, + "step": 2187 + }, + { + "epoch": 1.4642722749508716, + "grad_norm": 5.496964931488037, + "learning_rate": 3.0362127536287637e-05, + "loss": 2.9001, + "step": 2188 + }, + { + "epoch": 1.4649412551741439, + "grad_norm": 5.312406539916992, + "learning_rate": 3.034310484027483e-05, + "loss": 2.5888, + "step": 2189 + }, + { + "epoch": 1.465610235397416, + "grad_norm": 4.688312530517578, + "learning_rate": 3.032407890208312e-05, + "loss": 2.5179, + "step": 2190 + }, + { + "epoch": 1.466279215620688, + "grad_norm": 4.950438499450684, + "learning_rate": 3.0305049733257384e-05, + "loss": 2.5942, + "step": 2191 + }, + { + "epoch": 1.4669481958439603, + "grad_norm": 5.423849582672119, + "learning_rate": 3.0286017345344465e-05, + "loss": 2.7217, + "step": 2192 + }, + { + "epoch": 1.4676171760672325, + "grad_norm": 4.332028865814209, + "learning_rate": 3.0266981749893157e-05, + "loss": 2.5761, + "step": 2193 + }, + { + "epoch": 1.4682861562905047, + "grad_norm": 5.536228656768799, + "learning_rate": 3.0247942958454196e-05, + "loss": 2.6629, + "step": 2194 + }, + { + "epoch": 1.4689551365137767, + "grad_norm": 5.129214763641357, + "learning_rate": 3.0228900982580256e-05, + "loss": 2.5893, + "step": 2195 + }, + { + "epoch": 1.469624116737049, + "grad_norm": 3.6339681148529053, + "learning_rate": 3.0209855833825952e-05, + "loss": 2.2444, + "step": 2196 + }, + { + "epoch": 1.4702930969603212, + "grad_norm": 4.836607933044434, + "learning_rate": 3.0190807523747822e-05, + "loss": 2.6725, + "step": 2197 + }, + { + "epoch": 1.4709620771835934, + "grad_norm": 3.9454689025878906, + "learning_rate": 3.0171756063904303e-05, + "loss": 2.6341, + "step": 2198 + }, + { + "epoch": 1.4716310574068654, + "grad_norm": 4.593690395355225, + "learning_rate": 3.0152701465855778e-05, + "loss": 2.6333, + "step": 2199 + }, + { + "epoch": 1.4723000376301376, + "grad_norm": 8.685956954956055, + "learning_rate": 3.0133643741164508e-05, + "loss": 2.9083, + "step": 2200 + }, + { + "epoch": 1.4729690178534098, + "grad_norm": 5.938663005828857, + "learning_rate": 3.0114582901394667e-05, + "loss": 2.6169, + "step": 2201 + }, + { + "epoch": 1.4736379980766818, + "grad_norm": 4.498161315917969, + "learning_rate": 3.0095518958112295e-05, + "loss": 2.7415, + "step": 2202 + }, + { + "epoch": 1.474306978299954, + "grad_norm": 6.615968227386475, + "learning_rate": 3.0076451922885346e-05, + "loss": 2.9558, + "step": 2203 + }, + { + "epoch": 1.4749759585232263, + "grad_norm": 7.6475725173950195, + "learning_rate": 3.0057381807283642e-05, + "loss": 2.8374, + "step": 2204 + }, + { + "epoch": 1.4756449387464983, + "grad_norm": 5.680813789367676, + "learning_rate": 3.0038308622878846e-05, + "loss": 2.741, + "step": 2205 + }, + { + "epoch": 1.4763139189697705, + "grad_norm": 5.488100051879883, + "learning_rate": 3.0019232381244515e-05, + "loss": 2.6979, + "step": 2206 + }, + { + "epoch": 1.4769828991930427, + "grad_norm": 4.570145606994629, + "learning_rate": 3.000015309395606e-05, + "loss": 2.8078, + "step": 2207 + }, + { + "epoch": 1.4776518794163147, + "grad_norm": 3.7472822666168213, + "learning_rate": 2.998107077259073e-05, + "loss": 2.6571, + "step": 2208 + }, + { + "epoch": 1.478320859639587, + "grad_norm": 4.0875773429870605, + "learning_rate": 2.99619854287276e-05, + "loss": 2.9654, + "step": 2209 + }, + { + "epoch": 1.4789898398628591, + "grad_norm": 6.403375148773193, + "learning_rate": 2.9942897073947612e-05, + "loss": 2.7213, + "step": 2210 + }, + { + "epoch": 1.4796588200861311, + "grad_norm": 4.437558650970459, + "learning_rate": 2.9923805719833515e-05, + "loss": 2.6668, + "step": 2211 + }, + { + "epoch": 1.4803278003094034, + "grad_norm": 7.018237113952637, + "learning_rate": 2.9904711377969884e-05, + "loss": 2.8256, + "step": 2212 + }, + { + "epoch": 1.4809967805326756, + "grad_norm": 4.830729961395264, + "learning_rate": 2.988561405994309e-05, + "loss": 2.8145, + "step": 2213 + }, + { + "epoch": 1.4816657607559476, + "grad_norm": 6.391282081604004, + "learning_rate": 2.986651377734134e-05, + "loss": 2.9568, + "step": 2214 + }, + { + "epoch": 1.4823347409792198, + "grad_norm": 5.865687847137451, + "learning_rate": 2.984741054175463e-05, + "loss": 2.9988, + "step": 2215 + }, + { + "epoch": 1.483003721202492, + "grad_norm": 6.1040544509887695, + "learning_rate": 2.9828304364774713e-05, + "loss": 2.7426, + "step": 2216 + }, + { + "epoch": 1.483672701425764, + "grad_norm": 4.569874286651611, + "learning_rate": 2.9809195257995182e-05, + "loss": 2.6406, + "step": 2217 + }, + { + "epoch": 1.4843416816490362, + "grad_norm": 5.783868312835693, + "learning_rate": 2.9790083233011372e-05, + "loss": 2.9414, + "step": 2218 + }, + { + "epoch": 1.4850106618723085, + "grad_norm": 5.311706066131592, + "learning_rate": 2.977096830142041e-05, + "loss": 2.7498, + "step": 2219 + }, + { + "epoch": 1.4856796420955805, + "grad_norm": 5.672433376312256, + "learning_rate": 2.9751850474821153e-05, + "loss": 2.8065, + "step": 2220 + }, + { + "epoch": 1.4863486223188527, + "grad_norm": 9.245611190795898, + "learning_rate": 2.9732729764814254e-05, + "loss": 3.0967, + "step": 2221 + }, + { + "epoch": 1.487017602542125, + "grad_norm": 5.793460845947266, + "learning_rate": 2.9713606183002098e-05, + "loss": 2.8664, + "step": 2222 + }, + { + "epoch": 1.487686582765397, + "grad_norm": 3.8744213581085205, + "learning_rate": 2.969447974098881e-05, + "loss": 2.504, + "step": 2223 + }, + { + "epoch": 1.4883555629886691, + "grad_norm": 4.165330410003662, + "learning_rate": 2.9675350450380253e-05, + "loss": 2.8091, + "step": 2224 + }, + { + "epoch": 1.4890245432119413, + "grad_norm": 4.632652759552002, + "learning_rate": 2.9656218322784014e-05, + "loss": 2.662, + "step": 2225 + }, + { + "epoch": 1.4896935234352133, + "grad_norm": 4.595365047454834, + "learning_rate": 2.963708336980942e-05, + "loss": 2.8775, + "step": 2226 + }, + { + "epoch": 1.4903625036584855, + "grad_norm": 3.8869335651397705, + "learning_rate": 2.961794560306749e-05, + "loss": 2.6281, + "step": 2227 + }, + { + "epoch": 1.4910314838817578, + "grad_norm": 6.43788480758667, + "learning_rate": 2.959880503417095e-05, + "loss": 2.6576, + "step": 2228 + }, + { + "epoch": 1.4917004641050298, + "grad_norm": 4.685789108276367, + "learning_rate": 2.9579661674734256e-05, + "loss": 2.6969, + "step": 2229 + }, + { + "epoch": 1.492369444328302, + "grad_norm": 3.7685611248016357, + "learning_rate": 2.9560515536373534e-05, + "loss": 2.6655, + "step": 2230 + }, + { + "epoch": 1.4930384245515742, + "grad_norm": 6.255924701690674, + "learning_rate": 2.9541366630706586e-05, + "loss": 2.7999, + "step": 2231 + }, + { + "epoch": 1.4937074047748464, + "grad_norm": 4.318514347076416, + "learning_rate": 2.9522214969352912e-05, + "loss": 2.6507, + "step": 2232 + }, + { + "epoch": 1.4943763849981184, + "grad_norm": 4.408954620361328, + "learning_rate": 2.9503060563933682e-05, + "loss": 2.5212, + "step": 2233 + }, + { + "epoch": 1.4950453652213906, + "grad_norm": 6.041835784912109, + "learning_rate": 2.9483903426071734e-05, + "loss": 2.9122, + "step": 2234 + }, + { + "epoch": 1.4957143454446629, + "grad_norm": 5.376307964324951, + "learning_rate": 2.9464743567391546e-05, + "loss": 2.6712, + "step": 2235 + }, + { + "epoch": 1.4963833256679349, + "grad_norm": 6.752755641937256, + "learning_rate": 2.944558099951926e-05, + "loss": 2.8976, + "step": 2236 + }, + { + "epoch": 1.497052305891207, + "grad_norm": 4.995867729187012, + "learning_rate": 2.942641573408267e-05, + "loss": 2.8559, + "step": 2237 + }, + { + "epoch": 1.4977212861144793, + "grad_norm": 5.833775043487549, + "learning_rate": 2.940724778271119e-05, + "loss": 2.711, + "step": 2238 + }, + { + "epoch": 1.4983902663377515, + "grad_norm": 6.020483016967773, + "learning_rate": 2.938807715703587e-05, + "loss": 2.5386, + "step": 2239 + }, + { + "epoch": 1.4990592465610235, + "grad_norm": 9.630859375, + "learning_rate": 2.9368903868689392e-05, + "loss": 2.8233, + "step": 2240 + }, + { + "epoch": 1.4997282267842957, + "grad_norm": 5.620861053466797, + "learning_rate": 2.9349727929306042e-05, + "loss": 2.6048, + "step": 2241 + }, + { + "epoch": 1.500397207007568, + "grad_norm": 7.493035316467285, + "learning_rate": 2.933054935052172e-05, + "loss": 2.6908, + "step": 2242 + }, + { + "epoch": 1.50106618723084, + "grad_norm": 6.487952709197998, + "learning_rate": 2.9311368143973915e-05, + "loss": 2.7721, + "step": 2243 + }, + { + "epoch": 1.5017351674541122, + "grad_norm": 6.43418025970459, + "learning_rate": 2.9292184321301742e-05, + "loss": 2.9517, + "step": 2244 + }, + { + "epoch": 1.5024041476773844, + "grad_norm": 4.627255916595459, + "learning_rate": 2.927299789414587e-05, + "loss": 2.6272, + "step": 2245 + }, + { + "epoch": 1.5030731279006564, + "grad_norm": 5.553348064422607, + "learning_rate": 2.925380887414856e-05, + "loss": 2.777, + "step": 2246 + }, + { + "epoch": 1.5037421081239286, + "grad_norm": 4.826366424560547, + "learning_rate": 2.9234617272953653e-05, + "loss": 2.5868, + "step": 2247 + }, + { + "epoch": 1.5044110883472008, + "grad_norm": 4.4571213722229, + "learning_rate": 2.921542310220655e-05, + "loss": 2.5002, + "step": 2248 + }, + { + "epoch": 1.5050800685704728, + "grad_norm": 5.0305376052856445, + "learning_rate": 2.9196226373554213e-05, + "loss": 2.7349, + "step": 2249 + }, + { + "epoch": 1.505749048793745, + "grad_norm": 4.984687805175781, + "learning_rate": 2.9177027098645155e-05, + "loss": 2.7713, + "step": 2250 + }, + { + "epoch": 1.5064180290170173, + "grad_norm": 5.873697280883789, + "learning_rate": 2.915782528912943e-05, + "loss": 2.7877, + "step": 2251 + }, + { + "epoch": 1.5070870092402893, + "grad_norm": 7.105508327484131, + "learning_rate": 2.9138620956658645e-05, + "loss": 2.6506, + "step": 2252 + }, + { + "epoch": 1.5077559894635615, + "grad_norm": 4.614120960235596, + "learning_rate": 2.9119414112885917e-05, + "loss": 2.834, + "step": 2253 + }, + { + "epoch": 1.5084249696868337, + "grad_norm": 4.375961780548096, + "learning_rate": 2.91002047694659e-05, + "loss": 2.5743, + "step": 2254 + }, + { + "epoch": 1.5090939499101057, + "grad_norm": 4.351882457733154, + "learning_rate": 2.908099293805477e-05, + "loss": 2.6655, + "step": 2255 + }, + { + "epoch": 1.509762930133378, + "grad_norm": 6.840676784515381, + "learning_rate": 2.9061778630310193e-05, + "loss": 2.9516, + "step": 2256 + }, + { + "epoch": 1.5104319103566501, + "grad_norm": 7.174767017364502, + "learning_rate": 2.9042561857891353e-05, + "loss": 3.0367, + "step": 2257 + }, + { + "epoch": 1.5111008905799221, + "grad_norm": 5.582670211791992, + "learning_rate": 2.9023342632458934e-05, + "loss": 2.7125, + "step": 2258 + }, + { + "epoch": 1.5117698708031944, + "grad_norm": 5.117880344390869, + "learning_rate": 2.900412096567509e-05, + "loss": 3.0327, + "step": 2259 + }, + { + "epoch": 1.5124388510264666, + "grad_norm": 7.351419925689697, + "learning_rate": 2.8984896869203472e-05, + "loss": 2.5291, + "step": 2260 + }, + { + "epoch": 1.5131078312497386, + "grad_norm": 4.509941577911377, + "learning_rate": 2.8965670354709196e-05, + "loss": 2.6116, + "step": 2261 + }, + { + "epoch": 1.5137768114730108, + "grad_norm": 5.0681986808776855, + "learning_rate": 2.894644143385885e-05, + "loss": 2.6985, + "step": 2262 + }, + { + "epoch": 1.514445791696283, + "grad_norm": 5.417664051055908, + "learning_rate": 2.892721011832049e-05, + "loss": 2.6398, + "step": 2263 + }, + { + "epoch": 1.515114771919555, + "grad_norm": 7.193685531616211, + "learning_rate": 2.8907976419763605e-05, + "loss": 3.3176, + "step": 2264 + }, + { + "epoch": 1.5157837521428272, + "grad_norm": 6.769975662231445, + "learning_rate": 2.888874034985915e-05, + "loss": 3.1323, + "step": 2265 + }, + { + "epoch": 1.5164527323660995, + "grad_norm": 3.5006496906280518, + "learning_rate": 2.8869501920279506e-05, + "loss": 2.4759, + "step": 2266 + }, + { + "epoch": 1.5171217125893715, + "grad_norm": 4.2594990730285645, + "learning_rate": 2.88502611426985e-05, + "loss": 2.6502, + "step": 2267 + }, + { + "epoch": 1.5177906928126437, + "grad_norm": 4.369060039520264, + "learning_rate": 2.8831018028791356e-05, + "loss": 2.5705, + "step": 2268 + }, + { + "epoch": 1.518459673035916, + "grad_norm": 5.244803428649902, + "learning_rate": 2.8811772590234754e-05, + "loss": 2.7474, + "step": 2269 + }, + { + "epoch": 1.5191286532591879, + "grad_norm": 4.564072132110596, + "learning_rate": 2.8792524838706754e-05, + "loss": 2.8363, + "step": 2270 + }, + { + "epoch": 1.5197976334824603, + "grad_norm": 2.9686384201049805, + "learning_rate": 2.877327478588682e-05, + "loss": 2.5589, + "step": 2271 + }, + { + "epoch": 1.5204666137057323, + "grad_norm": 8.189765930175781, + "learning_rate": 2.8754022443455842e-05, + "loss": 2.48, + "step": 2272 + }, + { + "epoch": 1.5211355939290043, + "grad_norm": 5.444003582000732, + "learning_rate": 2.8734767823096065e-05, + "loss": 2.8464, + "step": 2273 + }, + { + "epoch": 1.5218045741522768, + "grad_norm": 5.806658744812012, + "learning_rate": 2.8715510936491147e-05, + "loss": 2.7004, + "step": 2274 + }, + { + "epoch": 1.5224735543755488, + "grad_norm": 6.077183723449707, + "learning_rate": 2.8696251795326083e-05, + "loss": 2.7294, + "step": 2275 + }, + { + "epoch": 1.5231425345988208, + "grad_norm": 5.352018356323242, + "learning_rate": 2.8676990411287276e-05, + "loss": 2.7559, + "step": 2276 + }, + { + "epoch": 1.5238115148220932, + "grad_norm": 5.8964948654174805, + "learning_rate": 2.8657726796062468e-05, + "loss": 2.7667, + "step": 2277 + }, + { + "epoch": 1.5244804950453652, + "grad_norm": 6.853862762451172, + "learning_rate": 2.8638460961340756e-05, + "loss": 2.5696, + "step": 2278 + }, + { + "epoch": 1.5251494752686374, + "grad_norm": 4.766753673553467, + "learning_rate": 2.8619192918812586e-05, + "loss": 2.8655, + "step": 2279 + }, + { + "epoch": 1.5258184554919096, + "grad_norm": 5.381381034851074, + "learning_rate": 2.8599922680169744e-05, + "loss": 3.0158, + "step": 2280 + }, + { + "epoch": 1.5264874357151816, + "grad_norm": 5.1292405128479, + "learning_rate": 2.8580650257105357e-05, + "loss": 2.8744, + "step": 2281 + }, + { + "epoch": 1.5271564159384539, + "grad_norm": 4.248069763183594, + "learning_rate": 2.856137566131386e-05, + "loss": 2.5012, + "step": 2282 + }, + { + "epoch": 1.527825396161726, + "grad_norm": 8.12672233581543, + "learning_rate": 2.854209890449102e-05, + "loss": 3.2941, + "step": 2283 + }, + { + "epoch": 1.528494376384998, + "grad_norm": 6.2802863121032715, + "learning_rate": 2.8522819998333915e-05, + "loss": 2.6625, + "step": 2284 + }, + { + "epoch": 1.5291633566082703, + "grad_norm": 5.26627254486084, + "learning_rate": 2.850353895454092e-05, + "loss": 2.7865, + "step": 2285 + }, + { + "epoch": 1.5298323368315425, + "grad_norm": 5.914576530456543, + "learning_rate": 2.8484255784811707e-05, + "loss": 2.6483, + "step": 2286 + }, + { + "epoch": 1.5305013170548145, + "grad_norm": 4.533624649047852, + "learning_rate": 2.846497050084725e-05, + "loss": 2.7012, + "step": 2287 + }, + { + "epoch": 1.5311702972780867, + "grad_norm": 6.660671234130859, + "learning_rate": 2.844568311434979e-05, + "loss": 2.8513, + "step": 2288 + }, + { + "epoch": 1.531839277501359, + "grad_norm": 5.66995906829834, + "learning_rate": 2.8426393637022858e-05, + "loss": 2.7217, + "step": 2289 + }, + { + "epoch": 1.532508257724631, + "grad_norm": 6.058644771575928, + "learning_rate": 2.8407102080571234e-05, + "loss": 2.7753, + "step": 2290 + }, + { + "epoch": 1.5331772379479032, + "grad_norm": 5.899769306182861, + "learning_rate": 2.8387808456700986e-05, + "loss": 2.7172, + "step": 2291 + }, + { + "epoch": 1.5338462181711754, + "grad_norm": 5.000714302062988, + "learning_rate": 2.8368512777119428e-05, + "loss": 3.0024, + "step": 2292 + }, + { + "epoch": 1.5345151983944474, + "grad_norm": 4.8641252517700195, + "learning_rate": 2.8349215053535095e-05, + "loss": 2.8792, + "step": 2293 + }, + { + "epoch": 1.5351841786177196, + "grad_norm": 4.457775115966797, + "learning_rate": 2.83299152976578e-05, + "loss": 2.6964, + "step": 2294 + }, + { + "epoch": 1.5358531588409918, + "grad_norm": 8.015108108520508, + "learning_rate": 2.8310613521198565e-05, + "loss": 3.0513, + "step": 2295 + }, + { + "epoch": 1.5365221390642638, + "grad_norm": 5.425005912780762, + "learning_rate": 2.829130973586966e-05, + "loss": 2.8324, + "step": 2296 + }, + { + "epoch": 1.537191119287536, + "grad_norm": 4.353015422821045, + "learning_rate": 2.8272003953384545e-05, + "loss": 2.5468, + "step": 2297 + }, + { + "epoch": 1.5378600995108083, + "grad_norm": 4.845297813415527, + "learning_rate": 2.825269618545791e-05, + "loss": 2.6662, + "step": 2298 + }, + { + "epoch": 1.5385290797340803, + "grad_norm": 4.563453197479248, + "learning_rate": 2.823338644380566e-05, + "loss": 2.7973, + "step": 2299 + }, + { + "epoch": 1.5391980599573525, + "grad_norm": 4.57271146774292, + "learning_rate": 2.8214074740144864e-05, + "loss": 2.6008, + "step": 2300 + }, + { + "epoch": 1.5398670401806247, + "grad_norm": 4.428469181060791, + "learning_rate": 2.819476108619381e-05, + "loss": 2.9012, + "step": 2301 + }, + { + "epoch": 1.5405360204038967, + "grad_norm": 6.25092077255249, + "learning_rate": 2.8175445493671972e-05, + "loss": 2.9164, + "step": 2302 + }, + { + "epoch": 1.541205000627169, + "grad_norm": 5.633223533630371, + "learning_rate": 2.8156127974299973e-05, + "loss": 2.7299, + "step": 2303 + }, + { + "epoch": 1.5418739808504411, + "grad_norm": 3.5399703979492188, + "learning_rate": 2.813680853979963e-05, + "loss": 2.8754, + "step": 2304 + }, + { + "epoch": 1.5425429610737131, + "grad_norm": 5.035823822021484, + "learning_rate": 2.8117487201893916e-05, + "loss": 2.6865, + "step": 2305 + }, + { + "epoch": 1.5432119412969854, + "grad_norm": 3.8767831325531006, + "learning_rate": 2.8098163972306952e-05, + "loss": 2.7932, + "step": 2306 + }, + { + "epoch": 1.5438809215202576, + "grad_norm": 7.477834224700928, + "learning_rate": 2.8078838862764016e-05, + "loss": 2.6505, + "step": 2307 + }, + { + "epoch": 1.5445499017435296, + "grad_norm": 5.698312282562256, + "learning_rate": 2.8059511884991524e-05, + "loss": 2.727, + "step": 2308 + }, + { + "epoch": 1.545218881966802, + "grad_norm": 4.2874932289123535, + "learning_rate": 2.8040183050717016e-05, + "loss": 2.7379, + "step": 2309 + }, + { + "epoch": 1.545887862190074, + "grad_norm": 5.99888277053833, + "learning_rate": 2.8020852371669177e-05, + "loss": 2.9435, + "step": 2310 + }, + { + "epoch": 1.546556842413346, + "grad_norm": 7.154504776000977, + "learning_rate": 2.800151985957779e-05, + "loss": 2.9904, + "step": 2311 + }, + { + "epoch": 1.5472258226366185, + "grad_norm": 6.446628570556641, + "learning_rate": 2.7982185526173767e-05, + "loss": 2.7681, + "step": 2312 + }, + { + "epoch": 1.5478948028598905, + "grad_norm": 4.669741153717041, + "learning_rate": 2.796284938318912e-05, + "loss": 2.6973, + "step": 2313 + }, + { + "epoch": 1.5485637830831624, + "grad_norm": 4.283153057098389, + "learning_rate": 2.7943511442356968e-05, + "loss": 2.5826, + "step": 2314 + }, + { + "epoch": 1.549232763306435, + "grad_norm": 6.789968490600586, + "learning_rate": 2.792417171541149e-05, + "loss": 2.7476, + "step": 2315 + }, + { + "epoch": 1.5499017435297069, + "grad_norm": 4.725489139556885, + "learning_rate": 2.7904830214087984e-05, + "loss": 2.9157, + "step": 2316 + }, + { + "epoch": 1.550570723752979, + "grad_norm": 5.4655232429504395, + "learning_rate": 2.7885486950122814e-05, + "loss": 2.6649, + "step": 2317 + }, + { + "epoch": 1.5512397039762513, + "grad_norm": 4.8952531814575195, + "learning_rate": 2.7866141935253404e-05, + "loss": 2.7492, + "step": 2318 + }, + { + "epoch": 1.5519086841995233, + "grad_norm": 3.991525173187256, + "learning_rate": 2.784679518121825e-05, + "loss": 2.7339, + "step": 2319 + }, + { + "epoch": 1.5525776644227955, + "grad_norm": 5.529263973236084, + "learning_rate": 2.78274466997569e-05, + "loss": 2.7831, + "step": 2320 + }, + { + "epoch": 1.5532466446460678, + "grad_norm": 6.294371604919434, + "learning_rate": 2.780809650260995e-05, + "loss": 2.8431, + "step": 2321 + }, + { + "epoch": 1.5539156248693398, + "grad_norm": 4.6729021072387695, + "learning_rate": 2.7788744601519035e-05, + "loss": 2.5727, + "step": 2322 + }, + { + "epoch": 1.554584605092612, + "grad_norm": 7.0208916664123535, + "learning_rate": 2.7769391008226825e-05, + "loss": 2.8229, + "step": 2323 + }, + { + "epoch": 1.5552535853158842, + "grad_norm": 6.4686126708984375, + "learning_rate": 2.7750035734477027e-05, + "loss": 2.7945, + "step": 2324 + }, + { + "epoch": 1.5559225655391562, + "grad_norm": 7.349049091339111, + "learning_rate": 2.7730678792014358e-05, + "loss": 2.9645, + "step": 2325 + }, + { + "epoch": 1.5565915457624284, + "grad_norm": 6.848609924316406, + "learning_rate": 2.7711320192584543e-05, + "loss": 2.8493, + "step": 2326 + }, + { + "epoch": 1.5572605259857006, + "grad_norm": 4.186678886413574, + "learning_rate": 2.769195994793432e-05, + "loss": 2.7836, + "step": 2327 + }, + { + "epoch": 1.5579295062089726, + "grad_norm": 5.591568470001221, + "learning_rate": 2.7672598069811423e-05, + "loss": 2.8385, + "step": 2328 + }, + { + "epoch": 1.5585984864322449, + "grad_norm": 4.2852702140808105, + "learning_rate": 2.7653234569964582e-05, + "loss": 2.7116, + "step": 2329 + }, + { + "epoch": 1.559267466655517, + "grad_norm": 3.834301710128784, + "learning_rate": 2.76338694601435e-05, + "loss": 2.6921, + "step": 2330 + }, + { + "epoch": 1.559936446878789, + "grad_norm": 5.2634358406066895, + "learning_rate": 2.761450275209887e-05, + "loss": 2.8392, + "step": 2331 + }, + { + "epoch": 1.5606054271020613, + "grad_norm": 6.76492977142334, + "learning_rate": 2.7595134457582346e-05, + "loss": 2.62, + "step": 2332 + }, + { + "epoch": 1.5612744073253335, + "grad_norm": 6.453455448150635, + "learning_rate": 2.7575764588346543e-05, + "loss": 2.8073, + "step": 2333 + }, + { + "epoch": 1.5619433875486055, + "grad_norm": 5.887423515319824, + "learning_rate": 2.7556393156145032e-05, + "loss": 2.8194, + "step": 2334 + }, + { + "epoch": 1.5626123677718777, + "grad_norm": 6.127762794494629, + "learning_rate": 2.753702017273235e-05, + "loss": 2.6971, + "step": 2335 + }, + { + "epoch": 1.56328134799515, + "grad_norm": 5.559381008148193, + "learning_rate": 2.751764564986396e-05, + "loss": 2.7486, + "step": 2336 + }, + { + "epoch": 1.563950328218422, + "grad_norm": 5.403796195983887, + "learning_rate": 2.749826959929625e-05, + "loss": 2.7299, + "step": 2337 + }, + { + "epoch": 1.5646193084416942, + "grad_norm": 4.958734035491943, + "learning_rate": 2.7478892032786545e-05, + "loss": 2.6112, + "step": 2338 + }, + { + "epoch": 1.5652882886649664, + "grad_norm": 4.9305195808410645, + "learning_rate": 2.74595129620931e-05, + "loss": 2.8485, + "step": 2339 + }, + { + "epoch": 1.5659572688882384, + "grad_norm": 6.793489456176758, + "learning_rate": 2.7440132398975072e-05, + "loss": 2.7815, + "step": 2340 + }, + { + "epoch": 1.5666262491115106, + "grad_norm": 5.763112545013428, + "learning_rate": 2.7420750355192516e-05, + "loss": 2.806, + "step": 2341 + }, + { + "epoch": 1.5672952293347828, + "grad_norm": 4.670166492462158, + "learning_rate": 2.74013668425064e-05, + "loss": 2.7168, + "step": 2342 + }, + { + "epoch": 1.5679642095580548, + "grad_norm": 4.048379421234131, + "learning_rate": 2.7381981872678575e-05, + "loss": 2.5713, + "step": 2343 + }, + { + "epoch": 1.568633189781327, + "grad_norm": 8.03201961517334, + "learning_rate": 2.7362595457471786e-05, + "loss": 2.8947, + "step": 2344 + }, + { + "epoch": 1.5693021700045993, + "grad_norm": 5.185798168182373, + "learning_rate": 2.7343207608649634e-05, + "loss": 2.8595, + "step": 2345 + }, + { + "epoch": 1.5699711502278713, + "grad_norm": 6.56360387802124, + "learning_rate": 2.732381833797661e-05, + "loss": 2.8782, + "step": 2346 + }, + { + "epoch": 1.5706401304511437, + "grad_norm": 5.290652275085449, + "learning_rate": 2.7304427657218074e-05, + "loss": 2.6202, + "step": 2347 + }, + { + "epoch": 1.5713091106744157, + "grad_norm": 4.337584495544434, + "learning_rate": 2.7285035578140207e-05, + "loss": 2.7512, + "step": 2348 + }, + { + "epoch": 1.5719780908976877, + "grad_norm": 4.621699810028076, + "learning_rate": 2.726564211251007e-05, + "loss": 2.7962, + "step": 2349 + }, + { + "epoch": 1.5726470711209601, + "grad_norm": 8.32238483428955, + "learning_rate": 2.7246247272095564e-05, + "loss": 2.7817, + "step": 2350 + }, + { + "epoch": 1.5733160513442321, + "grad_norm": 4.816995620727539, + "learning_rate": 2.722685106866541e-05, + "loss": 2.7522, + "step": 2351 + }, + { + "epoch": 1.5739850315675041, + "grad_norm": 4.7260518074035645, + "learning_rate": 2.7207453513989155e-05, + "loss": 2.5109, + "step": 2352 + }, + { + "epoch": 1.5746540117907766, + "grad_norm": 5.570161819458008, + "learning_rate": 2.7188054619837178e-05, + "loss": 2.9705, + "step": 2353 + }, + { + "epoch": 1.5753229920140486, + "grad_norm": 5.326760768890381, + "learning_rate": 2.7168654397980674e-05, + "loss": 2.6054, + "step": 2354 + }, + { + "epoch": 1.5759919722373206, + "grad_norm": 6.1734466552734375, + "learning_rate": 2.7149252860191628e-05, + "loss": 2.705, + "step": 2355 + }, + { + "epoch": 1.576660952460593, + "grad_norm": 5.643980979919434, + "learning_rate": 2.7129850018242826e-05, + "loss": 2.9079, + "step": 2356 + }, + { + "epoch": 1.577329932683865, + "grad_norm": 5.271204471588135, + "learning_rate": 2.7110445883907853e-05, + "loss": 2.5904, + "step": 2357 + }, + { + "epoch": 1.5779989129071372, + "grad_norm": 5.08945894241333, + "learning_rate": 2.7091040468961087e-05, + "loss": 2.7199, + "step": 2358 + }, + { + "epoch": 1.5786678931304094, + "grad_norm": 6.384211540222168, + "learning_rate": 2.707163378517766e-05, + "loss": 2.8164, + "step": 2359 + }, + { + "epoch": 1.5793368733536814, + "grad_norm": 4.800456523895264, + "learning_rate": 2.7052225844333477e-05, + "loss": 2.686, + "step": 2360 + }, + { + "epoch": 1.5800058535769537, + "grad_norm": 5.3238959312438965, + "learning_rate": 2.703281665820524e-05, + "loss": 2.6533, + "step": 2361 + }, + { + "epoch": 1.5806748338002259, + "grad_norm": 4.153467655181885, + "learning_rate": 2.7013406238570364e-05, + "loss": 2.5651, + "step": 2362 + }, + { + "epoch": 1.5813438140234979, + "grad_norm": 4.877929210662842, + "learning_rate": 2.699399459720703e-05, + "loss": 2.6007, + "step": 2363 + }, + { + "epoch": 1.58201279424677, + "grad_norm": 4.141830921173096, + "learning_rate": 2.6974581745894163e-05, + "loss": 2.6136, + "step": 2364 + }, + { + "epoch": 1.5826817744700423, + "grad_norm": 5.560862064361572, + "learning_rate": 2.6955167696411416e-05, + "loss": 2.8333, + "step": 2365 + }, + { + "epoch": 1.5833507546933143, + "grad_norm": 5.879995822906494, + "learning_rate": 2.6935752460539175e-05, + "loss": 2.8919, + "step": 2366 + }, + { + "epoch": 1.5840197349165865, + "grad_norm": 4.493490219116211, + "learning_rate": 2.691633605005854e-05, + "loss": 2.7167, + "step": 2367 + }, + { + "epoch": 1.5846887151398588, + "grad_norm": 6.017796516418457, + "learning_rate": 2.689691847675133e-05, + "loss": 2.9247, + "step": 2368 + }, + { + "epoch": 1.5853576953631308, + "grad_norm": 4.717609405517578, + "learning_rate": 2.687749975240007e-05, + "loss": 2.6592, + "step": 2369 + }, + { + "epoch": 1.586026675586403, + "grad_norm": 7.9234185218811035, + "learning_rate": 2.6858079888787967e-05, + "loss": 2.8465, + "step": 2370 + }, + { + "epoch": 1.5866956558096752, + "grad_norm": 6.025661468505859, + "learning_rate": 2.6838658897698944e-05, + "loss": 2.8172, + "step": 2371 + }, + { + "epoch": 1.5873646360329472, + "grad_norm": 5.558853626251221, + "learning_rate": 2.681923679091759e-05, + "loss": 2.5836, + "step": 2372 + }, + { + "epoch": 1.5880336162562194, + "grad_norm": 6.307706356048584, + "learning_rate": 2.6799813580229176e-05, + "loss": 2.8743, + "step": 2373 + }, + { + "epoch": 1.5887025964794916, + "grad_norm": 8.418734550476074, + "learning_rate": 2.6780389277419653e-05, + "loss": 3.1143, + "step": 2374 + }, + { + "epoch": 1.5893715767027636, + "grad_norm": 4.946826457977295, + "learning_rate": 2.6760963894275616e-05, + "loss": 2.8108, + "step": 2375 + }, + { + "epoch": 1.5900405569260359, + "grad_norm": 5.7382354736328125, + "learning_rate": 2.674153744258433e-05, + "loss": 2.6312, + "step": 2376 + }, + { + "epoch": 1.590709537149308, + "grad_norm": 6.723291873931885, + "learning_rate": 2.6722109934133705e-05, + "loss": 2.8463, + "step": 2377 + }, + { + "epoch": 1.59137851737258, + "grad_norm": 4.454161643981934, + "learning_rate": 2.670268138071228e-05, + "loss": 2.7235, + "step": 2378 + }, + { + "epoch": 1.5920474975958523, + "grad_norm": 3.8878867626190186, + "learning_rate": 2.6683251794109253e-05, + "loss": 2.9071, + "step": 2379 + }, + { + "epoch": 1.5927164778191245, + "grad_norm": 5.427379131317139, + "learning_rate": 2.6663821186114434e-05, + "loss": 2.776, + "step": 2380 + }, + { + "epoch": 1.5933854580423965, + "grad_norm": 6.1245527267456055, + "learning_rate": 2.6644389568518247e-05, + "loss": 2.6846, + "step": 2381 + }, + { + "epoch": 1.5940544382656687, + "grad_norm": 3.53409743309021, + "learning_rate": 2.6624956953111735e-05, + "loss": 2.681, + "step": 2382 + }, + { + "epoch": 1.594723418488941, + "grad_norm": 6.1705002784729, + "learning_rate": 2.6605523351686553e-05, + "loss": 2.7252, + "step": 2383 + }, + { + "epoch": 1.595392398712213, + "grad_norm": 4.879063606262207, + "learning_rate": 2.658608877603494e-05, + "loss": 2.8257, + "step": 2384 + }, + { + "epoch": 1.5960613789354852, + "grad_norm": 5.19885778427124, + "learning_rate": 2.6566653237949735e-05, + "loss": 2.7644, + "step": 2385 + }, + { + "epoch": 1.5967303591587574, + "grad_norm": 6.000220775604248, + "learning_rate": 2.6547216749224368e-05, + "loss": 2.7559, + "step": 2386 + }, + { + "epoch": 1.5973993393820294, + "grad_norm": 4.362799644470215, + "learning_rate": 2.652777932165284e-05, + "loss": 2.6174, + "step": 2387 + }, + { + "epoch": 1.5980683196053018, + "grad_norm": 6.249029636383057, + "learning_rate": 2.6508340967029704e-05, + "loss": 2.7535, + "step": 2388 + }, + { + "epoch": 1.5987372998285738, + "grad_norm": 6.7952141761779785, + "learning_rate": 2.6488901697150104e-05, + "loss": 2.7719, + "step": 2389 + }, + { + "epoch": 1.5994062800518458, + "grad_norm": 6.393811225891113, + "learning_rate": 2.6469461523809723e-05, + "loss": 2.6891, + "step": 2390 + }, + { + "epoch": 1.6000752602751183, + "grad_norm": 6.876070976257324, + "learning_rate": 2.6450020458804802e-05, + "loss": 2.6072, + "step": 2391 + }, + { + "epoch": 1.6007442404983903, + "grad_norm": 5.015986442565918, + "learning_rate": 2.6430578513932108e-05, + "loss": 2.4877, + "step": 2392 + }, + { + "epoch": 1.6014132207216623, + "grad_norm": 5.196181774139404, + "learning_rate": 2.6411135700988954e-05, + "loss": 2.6933, + "step": 2393 + }, + { + "epoch": 1.6020822009449347, + "grad_norm": 4.305976390838623, + "learning_rate": 2.639169203177318e-05, + "loss": 2.5628, + "step": 2394 + }, + { + "epoch": 1.6027511811682067, + "grad_norm": 5.467509746551514, + "learning_rate": 2.637224751808313e-05, + "loss": 2.7082, + "step": 2395 + }, + { + "epoch": 1.603420161391479, + "grad_norm": 5.646378993988037, + "learning_rate": 2.635280217171769e-05, + "loss": 2.5309, + "step": 2396 + }, + { + "epoch": 1.6040891416147511, + "grad_norm": 4.870308876037598, + "learning_rate": 2.6333356004476224e-05, + "loss": 2.7018, + "step": 2397 + }, + { + "epoch": 1.6047581218380231, + "grad_norm": 5.6591877937316895, + "learning_rate": 2.6313909028158608e-05, + "loss": 2.7843, + "step": 2398 + }, + { + "epoch": 1.6054271020612954, + "grad_norm": 7.571524620056152, + "learning_rate": 2.62944612545652e-05, + "loss": 2.4803, + "step": 2399 + }, + { + "epoch": 1.6060960822845676, + "grad_norm": 6.610879898071289, + "learning_rate": 2.6275012695496847e-05, + "loss": 2.904, + "step": 2400 + }, + { + "epoch": 1.6067650625078396, + "grad_norm": 7.075892448425293, + "learning_rate": 2.6255563362754875e-05, + "loss": 2.7231, + "step": 2401 + }, + { + "epoch": 1.6074340427311118, + "grad_norm": 6.549261569976807, + "learning_rate": 2.6236113268141083e-05, + "loss": 2.8158, + "step": 2402 + }, + { + "epoch": 1.608103022954384, + "grad_norm": 5.817134380340576, + "learning_rate": 2.6216662423457715e-05, + "loss": 2.9392, + "step": 2403 + }, + { + "epoch": 1.608772003177656, + "grad_norm": 6.896299839019775, + "learning_rate": 2.6197210840507485e-05, + "loss": 2.7464, + "step": 2404 + }, + { + "epoch": 1.6094409834009282, + "grad_norm": 4.643448829650879, + "learning_rate": 2.617775853109356e-05, + "loss": 2.7464, + "step": 2405 + }, + { + "epoch": 1.6101099636242004, + "grad_norm": 4.982819557189941, + "learning_rate": 2.615830550701952e-05, + "loss": 2.9396, + "step": 2406 + }, + { + "epoch": 1.6107789438474724, + "grad_norm": 4.733968257904053, + "learning_rate": 2.6138851780089413e-05, + "loss": 2.7082, + "step": 2407 + }, + { + "epoch": 1.6114479240707447, + "grad_norm": 6.8186421394348145, + "learning_rate": 2.6119397362107694e-05, + "loss": 2.7159, + "step": 2408 + }, + { + "epoch": 1.6121169042940169, + "grad_norm": 5.55661153793335, + "learning_rate": 2.6099942264879246e-05, + "loss": 2.6459, + "step": 2409 + }, + { + "epoch": 1.6127858845172889, + "grad_norm": 4.055827617645264, + "learning_rate": 2.608048650020935e-05, + "loss": 2.6294, + "step": 2410 + }, + { + "epoch": 1.613454864740561, + "grad_norm": 5.903413772583008, + "learning_rate": 2.606103007990371e-05, + "loss": 2.8888, + "step": 2411 + }, + { + "epoch": 1.6141238449638333, + "grad_norm": 3.653301477432251, + "learning_rate": 2.604157301576841e-05, + "loss": 2.6241, + "step": 2412 + }, + { + "epoch": 1.6147928251871053, + "grad_norm": 4.215085506439209, + "learning_rate": 2.6022115319609953e-05, + "loss": 2.7616, + "step": 2413 + }, + { + "epoch": 1.6154618054103775, + "grad_norm": 3.908628463745117, + "learning_rate": 2.600265700323518e-05, + "loss": 2.5136, + "step": 2414 + }, + { + "epoch": 1.6161307856336498, + "grad_norm": 4.34727668762207, + "learning_rate": 2.5983198078451355e-05, + "loss": 2.5217, + "step": 2415 + }, + { + "epoch": 1.6167997658569218, + "grad_norm": 3.767536163330078, + "learning_rate": 2.5963738557066092e-05, + "loss": 2.5968, + "step": 2416 + }, + { + "epoch": 1.617468746080194, + "grad_norm": 4.341113567352295, + "learning_rate": 2.594427845088735e-05, + "loss": 2.534, + "step": 2417 + }, + { + "epoch": 1.6181377263034662, + "grad_norm": 5.488800048828125, + "learning_rate": 2.5924817771723474e-05, + "loss": 2.6849, + "step": 2418 + }, + { + "epoch": 1.6188067065267382, + "grad_norm": 5.409704208374023, + "learning_rate": 2.5905356531383135e-05, + "loss": 2.858, + "step": 2419 + }, + { + "epoch": 1.6194756867500104, + "grad_norm": 4.850921154022217, + "learning_rate": 2.5885894741675353e-05, + "loss": 2.8339, + "step": 2420 + }, + { + "epoch": 1.6201446669732826, + "grad_norm": 7.075712203979492, + "learning_rate": 2.586643241440948e-05, + "loss": 2.7669, + "step": 2421 + }, + { + "epoch": 1.6208136471965546, + "grad_norm": 3.867203950881958, + "learning_rate": 2.5846969561395196e-05, + "loss": 2.6405, + "step": 2422 + }, + { + "epoch": 1.6214826274198268, + "grad_norm": 5.741216659545898, + "learning_rate": 2.582750619444249e-05, + "loss": 2.9541, + "step": 2423 + }, + { + "epoch": 1.622151607643099, + "grad_norm": 7.5297369956970215, + "learning_rate": 2.580804232536168e-05, + "loss": 2.8333, + "step": 2424 + }, + { + "epoch": 1.622820587866371, + "grad_norm": 5.421424865722656, + "learning_rate": 2.5788577965963372e-05, + "loss": 2.5568, + "step": 2425 + }, + { + "epoch": 1.6234895680896435, + "grad_norm": 4.421302318572998, + "learning_rate": 2.5769113128058486e-05, + "loss": 2.8412, + "step": 2426 + }, + { + "epoch": 1.6241585483129155, + "grad_norm": 4.942965030670166, + "learning_rate": 2.5749647823458218e-05, + "loss": 2.7317, + "step": 2427 + }, + { + "epoch": 1.6248275285361875, + "grad_norm": 5.792942047119141, + "learning_rate": 2.5730182063974044e-05, + "loss": 2.8114, + "step": 2428 + }, + { + "epoch": 1.62549650875946, + "grad_norm": 5.889285087585449, + "learning_rate": 2.571071586141774e-05, + "loss": 3.1153, + "step": 2429 + }, + { + "epoch": 1.626165488982732, + "grad_norm": 6.261662006378174, + "learning_rate": 2.569124922760132e-05, + "loss": 2.8418, + "step": 2430 + }, + { + "epoch": 1.626834469206004, + "grad_norm": 5.267799377441406, + "learning_rate": 2.5671782174337095e-05, + "loss": 2.5839, + "step": 2431 + }, + { + "epoch": 1.6275034494292764, + "grad_norm": 6.704413890838623, + "learning_rate": 2.5652314713437586e-05, + "loss": 2.7482, + "step": 2432 + }, + { + "epoch": 1.6281724296525484, + "grad_norm": 7.405350685119629, + "learning_rate": 2.5632846856715603e-05, + "loss": 2.865, + "step": 2433 + }, + { + "epoch": 1.6288414098758206, + "grad_norm": 4.24365234375, + "learning_rate": 2.561337861598418e-05, + "loss": 2.6506, + "step": 2434 + }, + { + "epoch": 1.6295103900990928, + "grad_norm": 4.277185440063477, + "learning_rate": 2.5593910003056576e-05, + "loss": 2.8607, + "step": 2435 + }, + { + "epoch": 1.6301793703223648, + "grad_norm": 6.50390625, + "learning_rate": 2.557444102974628e-05, + "loss": 2.8742, + "step": 2436 + }, + { + "epoch": 1.630848350545637, + "grad_norm": 5.022792816162109, + "learning_rate": 2.555497170786701e-05, + "loss": 2.7886, + "step": 2437 + }, + { + "epoch": 1.6315173307689093, + "grad_norm": 6.039017200469971, + "learning_rate": 2.5535502049232684e-05, + "loss": 2.9595, + "step": 2438 + }, + { + "epoch": 1.6321863109921813, + "grad_norm": 4.410915851593018, + "learning_rate": 2.551603206565743e-05, + "loss": 2.6244, + "step": 2439 + }, + { + "epoch": 1.6328552912154535, + "grad_norm": 5.923748016357422, + "learning_rate": 2.5496561768955574e-05, + "loss": 2.8382, + "step": 2440 + }, + { + "epoch": 1.6335242714387257, + "grad_norm": 7.241328716278076, + "learning_rate": 2.5477091170941626e-05, + "loss": 3.0066, + "step": 2441 + }, + { + "epoch": 1.6341932516619977, + "grad_norm": 5.229269504547119, + "learning_rate": 2.5457620283430285e-05, + "loss": 2.865, + "step": 2442 + }, + { + "epoch": 1.63486223188527, + "grad_norm": 5.172996997833252, + "learning_rate": 2.543814911823642e-05, + "loss": 2.6209, + "step": 2443 + }, + { + "epoch": 1.6355312121085421, + "grad_norm": 6.428318500518799, + "learning_rate": 2.541867768717507e-05, + "loss": 2.8982, + "step": 2444 + }, + { + "epoch": 1.6362001923318141, + "grad_norm": 4.812302589416504, + "learning_rate": 2.5399206002061443e-05, + "loss": 2.5506, + "step": 2445 + }, + { + "epoch": 1.6368691725550863, + "grad_norm": 6.246057510375977, + "learning_rate": 2.5379734074710898e-05, + "loss": 2.6993, + "step": 2446 + }, + { + "epoch": 1.6375381527783586, + "grad_norm": 5.78316068649292, + "learning_rate": 2.536026191693893e-05, + "loss": 2.9312, + "step": 2447 + }, + { + "epoch": 1.6382071330016306, + "grad_norm": 5.317570686340332, + "learning_rate": 2.5340789540561183e-05, + "loss": 2.9874, + "step": 2448 + }, + { + "epoch": 1.6388761132249028, + "grad_norm": 5.024258613586426, + "learning_rate": 2.5321316957393437e-05, + "loss": 2.7485, + "step": 2449 + }, + { + "epoch": 1.639545093448175, + "grad_norm": 4.547504425048828, + "learning_rate": 2.5301844179251588e-05, + "loss": 2.5821, + "step": 2450 + }, + { + "epoch": 1.640214073671447, + "grad_norm": 6.373087406158447, + "learning_rate": 2.528237121795166e-05, + "loss": 2.9019, + "step": 2451 + }, + { + "epoch": 1.6408830538947192, + "grad_norm": 4.163835525512695, + "learning_rate": 2.5262898085309777e-05, + "loss": 2.7396, + "step": 2452 + }, + { + "epoch": 1.6415520341179914, + "grad_norm": 5.687963008880615, + "learning_rate": 2.524342479314219e-05, + "loss": 2.6711, + "step": 2453 + }, + { + "epoch": 1.6422210143412634, + "grad_norm": 5.724370956420898, + "learning_rate": 2.5223951353265208e-05, + "loss": 2.7225, + "step": 2454 + }, + { + "epoch": 1.6428899945645357, + "grad_norm": 4.851588249206543, + "learning_rate": 2.5204477777495262e-05, + "loss": 2.823, + "step": 2455 + }, + { + "epoch": 1.6435589747878079, + "grad_norm": 8.624882698059082, + "learning_rate": 2.518500407764886e-05, + "loss": 2.996, + "step": 2456 + }, + { + "epoch": 1.6442279550110799, + "grad_norm": 4.683487892150879, + "learning_rate": 2.5165530265542576e-05, + "loss": 2.5252, + "step": 2457 + }, + { + "epoch": 1.644896935234352, + "grad_norm": 5.397092819213867, + "learning_rate": 2.5146056352993052e-05, + "loss": 3.0858, + "step": 2458 + }, + { + "epoch": 1.6455659154576243, + "grad_norm": 7.78914737701416, + "learning_rate": 2.5126582351817003e-05, + "loss": 2.6707, + "step": 2459 + }, + { + "epoch": 1.6462348956808963, + "grad_norm": 5.520780086517334, + "learning_rate": 2.5107108273831194e-05, + "loss": 2.7152, + "step": 2460 + }, + { + "epoch": 1.6469038759041685, + "grad_norm": 5.705004692077637, + "learning_rate": 2.508763413085242e-05, + "loss": 2.8296, + "step": 2461 + }, + { + "epoch": 1.6475728561274408, + "grad_norm": 5.346718788146973, + "learning_rate": 2.506815993469754e-05, + "loss": 3.0625, + "step": 2462 + }, + { + "epoch": 1.6482418363507128, + "grad_norm": 5.675193786621094, + "learning_rate": 2.5048685697183425e-05, + "loss": 2.6163, + "step": 2463 + }, + { + "epoch": 1.6489108165739852, + "grad_norm": 8.798904418945312, + "learning_rate": 2.502921143012699e-05, + "loss": 2.8633, + "step": 2464 + }, + { + "epoch": 1.6495797967972572, + "grad_norm": 6.258532524108887, + "learning_rate": 2.500973714534515e-05, + "loss": 2.7428, + "step": 2465 + }, + { + "epoch": 1.6502487770205292, + "grad_norm": 6.154701232910156, + "learning_rate": 2.499026285465485e-05, + "loss": 2.7623, + "step": 2466 + }, + { + "epoch": 1.6509177572438016, + "grad_norm": 6.404314041137695, + "learning_rate": 2.497078856987301e-05, + "loss": 2.805, + "step": 2467 + }, + { + "epoch": 1.6515867374670736, + "grad_norm": 5.8303914070129395, + "learning_rate": 2.495131430281658e-05, + "loss": 3.0137, + "step": 2468 + }, + { + "epoch": 1.6522557176903456, + "grad_norm": 4.913717746734619, + "learning_rate": 2.4931840065302463e-05, + "loss": 2.7482, + "step": 2469 + }, + { + "epoch": 1.652924697913618, + "grad_norm": 3.7035882472991943, + "learning_rate": 2.4912365869147585e-05, + "loss": 2.2987, + "step": 2470 + }, + { + "epoch": 1.65359367813689, + "grad_norm": 5.4154839515686035, + "learning_rate": 2.4892891726168812e-05, + "loss": 3.0513, + "step": 2471 + }, + { + "epoch": 1.654262658360162, + "grad_norm": 5.4690842628479, + "learning_rate": 2.4873417648183003e-05, + "loss": 2.4291, + "step": 2472 + }, + { + "epoch": 1.6549316385834345, + "grad_norm": 4.671787261962891, + "learning_rate": 2.485394364700695e-05, + "loss": 2.6835, + "step": 2473 + }, + { + "epoch": 1.6556006188067065, + "grad_norm": 5.292201995849609, + "learning_rate": 2.4834469734457433e-05, + "loss": 2.7257, + "step": 2474 + }, + { + "epoch": 1.6562695990299787, + "grad_norm": 4.135709762573242, + "learning_rate": 2.4814995922351146e-05, + "loss": 2.5343, + "step": 2475 + }, + { + "epoch": 1.656938579253251, + "grad_norm": 5.647322654724121, + "learning_rate": 2.4795522222504744e-05, + "loss": 2.8363, + "step": 2476 + }, + { + "epoch": 1.657607559476523, + "grad_norm": 6.4661970138549805, + "learning_rate": 2.4776048646734794e-05, + "loss": 2.9677, + "step": 2477 + }, + { + "epoch": 1.6582765396997952, + "grad_norm": 4.214534759521484, + "learning_rate": 2.4756575206857817e-05, + "loss": 2.5381, + "step": 2478 + }, + { + "epoch": 1.6589455199230674, + "grad_norm": 5.281362533569336, + "learning_rate": 2.4737101914690222e-05, + "loss": 2.8645, + "step": 2479 + }, + { + "epoch": 1.6596145001463394, + "grad_norm": 4.652920722961426, + "learning_rate": 2.4717628782048348e-05, + "loss": 2.5176, + "step": 2480 + }, + { + "epoch": 1.6602834803696116, + "grad_norm": 5.855139255523682, + "learning_rate": 2.4698155820748407e-05, + "loss": 2.8235, + "step": 2481 + }, + { + "epoch": 1.6609524605928838, + "grad_norm": 5.996157169342041, + "learning_rate": 2.4678683042606565e-05, + "loss": 2.8375, + "step": 2482 + }, + { + "epoch": 1.6616214408161558, + "grad_norm": 5.081485271453857, + "learning_rate": 2.4659210459438816e-05, + "loss": 2.8121, + "step": 2483 + }, + { + "epoch": 1.662290421039428, + "grad_norm": 5.23148775100708, + "learning_rate": 2.4639738083061075e-05, + "loss": 2.7627, + "step": 2484 + }, + { + "epoch": 1.6629594012627003, + "grad_norm": 5.417628288269043, + "learning_rate": 2.4620265925289098e-05, + "loss": 2.7727, + "step": 2485 + }, + { + "epoch": 1.6636283814859723, + "grad_norm": 7.604246139526367, + "learning_rate": 2.4600793997938563e-05, + "loss": 2.7401, + "step": 2486 + }, + { + "epoch": 1.6642973617092445, + "grad_norm": 5.298999309539795, + "learning_rate": 2.458132231282493e-05, + "loss": 2.683, + "step": 2487 + }, + { + "epoch": 1.6649663419325167, + "grad_norm": 7.451618194580078, + "learning_rate": 2.4561850881763588e-05, + "loss": 2.9445, + "step": 2488 + }, + { + "epoch": 1.6656353221557887, + "grad_norm": 5.242423057556152, + "learning_rate": 2.454237971656972e-05, + "loss": 2.6389, + "step": 2489 + }, + { + "epoch": 1.666304302379061, + "grad_norm": 5.074806213378906, + "learning_rate": 2.4522908829058383e-05, + "loss": 2.7597, + "step": 2490 + }, + { + "epoch": 1.6669732826023331, + "grad_norm": 6.158017158508301, + "learning_rate": 2.4503438231044425e-05, + "loss": 2.8183, + "step": 2491 + }, + { + "epoch": 1.6676422628256051, + "grad_norm": 6.835614204406738, + "learning_rate": 2.4483967934342574e-05, + "loss": 2.8358, + "step": 2492 + }, + { + "epoch": 1.6683112430488773, + "grad_norm": 4.561537742614746, + "learning_rate": 2.446449795076732e-05, + "loss": 2.5523, + "step": 2493 + }, + { + "epoch": 1.6689802232721496, + "grad_norm": 5.291999340057373, + "learning_rate": 2.4445028292132997e-05, + "loss": 2.6435, + "step": 2494 + }, + { + "epoch": 1.6696492034954216, + "grad_norm": 4.522041320800781, + "learning_rate": 2.442555897025372e-05, + "loss": 2.5627, + "step": 2495 + }, + { + "epoch": 1.6703181837186938, + "grad_norm": 3.6309473514556885, + "learning_rate": 2.4406089996943433e-05, + "loss": 2.6634, + "step": 2496 + }, + { + "epoch": 1.670987163941966, + "grad_norm": 9.647355079650879, + "learning_rate": 2.4386621384015823e-05, + "loss": 2.4431, + "step": 2497 + }, + { + "epoch": 1.671656144165238, + "grad_norm": 5.466341495513916, + "learning_rate": 2.43671531432844e-05, + "loss": 2.7964, + "step": 2498 + }, + { + "epoch": 1.6723251243885102, + "grad_norm": 4.108764171600342, + "learning_rate": 2.434768528656241e-05, + "loss": 2.6217, + "step": 2499 + }, + { + "epoch": 1.6729941046117824, + "grad_norm": 5.70421028137207, + "learning_rate": 2.432821782566291e-05, + "loss": 2.7245, + "step": 2500 + }, + { + "epoch": 1.6736630848350544, + "grad_norm": 6.602444648742676, + "learning_rate": 2.430875077239868e-05, + "loss": 2.9086, + "step": 2501 + }, + { + "epoch": 1.6743320650583269, + "grad_norm": 4.562328815460205, + "learning_rate": 2.428928413858227e-05, + "loss": 2.7498, + "step": 2502 + }, + { + "epoch": 1.6750010452815989, + "grad_norm": 7.803415775299072, + "learning_rate": 2.4269817936025955e-05, + "loss": 2.5058, + "step": 2503 + }, + { + "epoch": 1.6756700255048709, + "grad_norm": 5.827186107635498, + "learning_rate": 2.4250352176541788e-05, + "loss": 2.9237, + "step": 2504 + }, + { + "epoch": 1.6763390057281433, + "grad_norm": 4.48921537399292, + "learning_rate": 2.4230886871941517e-05, + "loss": 2.6325, + "step": 2505 + }, + { + "epoch": 1.6770079859514153, + "grad_norm": 5.492165565490723, + "learning_rate": 2.421142203403663e-05, + "loss": 2.8539, + "step": 2506 + }, + { + "epoch": 1.6776769661746873, + "grad_norm": 4.681061744689941, + "learning_rate": 2.4191957674638317e-05, + "loss": 2.7593, + "step": 2507 + }, + { + "epoch": 1.6783459463979598, + "grad_norm": 5.666809558868408, + "learning_rate": 2.4172493805557515e-05, + "loss": 2.9197, + "step": 2508 + }, + { + "epoch": 1.6790149266212318, + "grad_norm": 5.163647174835205, + "learning_rate": 2.4153030438604806e-05, + "loss": 2.7285, + "step": 2509 + }, + { + "epoch": 1.6796839068445037, + "grad_norm": 6.088483810424805, + "learning_rate": 2.4133567585590524e-05, + "loss": 2.6448, + "step": 2510 + }, + { + "epoch": 1.6803528870677762, + "grad_norm": 6.308938980102539, + "learning_rate": 2.411410525832465e-05, + "loss": 2.8534, + "step": 2511 + }, + { + "epoch": 1.6810218672910482, + "grad_norm": 8.010528564453125, + "learning_rate": 2.4094643468616874e-05, + "loss": 3.0351, + "step": 2512 + }, + { + "epoch": 1.6816908475143204, + "grad_norm": 4.884311676025391, + "learning_rate": 2.4075182228276528e-05, + "loss": 2.7731, + "step": 2513 + }, + { + "epoch": 1.6823598277375926, + "grad_norm": 5.070252895355225, + "learning_rate": 2.4055721549112654e-05, + "loss": 2.6083, + "step": 2514 + }, + { + "epoch": 1.6830288079608646, + "grad_norm": 5.1491241455078125, + "learning_rate": 2.4036261442933913e-05, + "loss": 2.544, + "step": 2515 + }, + { + "epoch": 1.6836977881841368, + "grad_norm": 5.936621189117432, + "learning_rate": 2.4016801921548648e-05, + "loss": 2.6634, + "step": 2516 + }, + { + "epoch": 1.684366768407409, + "grad_norm": 5.348430156707764, + "learning_rate": 2.3997342996764818e-05, + "loss": 2.4732, + "step": 2517 + }, + { + "epoch": 1.685035748630681, + "grad_norm": 5.629446983337402, + "learning_rate": 2.3977884680390056e-05, + "loss": 2.8167, + "step": 2518 + }, + { + "epoch": 1.6857047288539533, + "grad_norm": 4.8856024742126465, + "learning_rate": 2.395842698423159e-05, + "loss": 2.6797, + "step": 2519 + }, + { + "epoch": 1.6863737090772255, + "grad_norm": 6.2281494140625, + "learning_rate": 2.39389699200963e-05, + "loss": 2.5547, + "step": 2520 + }, + { + "epoch": 1.6870426893004975, + "grad_norm": 5.2909836769104, + "learning_rate": 2.3919513499790648e-05, + "loss": 2.8618, + "step": 2521 + }, + { + "epoch": 1.6877116695237697, + "grad_norm": 6.354259014129639, + "learning_rate": 2.390005773512076e-05, + "loss": 2.5831, + "step": 2522 + }, + { + "epoch": 1.688380649747042, + "grad_norm": 3.8957390785217285, + "learning_rate": 2.3880602637892305e-05, + "loss": 2.5017, + "step": 2523 + }, + { + "epoch": 1.689049629970314, + "grad_norm": 4.465644359588623, + "learning_rate": 2.386114821991059e-05, + "loss": 2.4734, + "step": 2524 + }, + { + "epoch": 1.6897186101935862, + "grad_norm": 5.646853923797607, + "learning_rate": 2.3841694492980476e-05, + "loss": 2.7715, + "step": 2525 + }, + { + "epoch": 1.6903875904168584, + "grad_norm": 4.684238433837891, + "learning_rate": 2.3822241468906448e-05, + "loss": 2.8278, + "step": 2526 + }, + { + "epoch": 1.6910565706401304, + "grad_norm": 5.539403438568115, + "learning_rate": 2.3802789159492514e-05, + "loss": 2.6784, + "step": 2527 + }, + { + "epoch": 1.6917255508634026, + "grad_norm": 4.852906703948975, + "learning_rate": 2.378333757654229e-05, + "loss": 2.6266, + "step": 2528 + }, + { + "epoch": 1.6923945310866748, + "grad_norm": 6.845240116119385, + "learning_rate": 2.3763886731858916e-05, + "loss": 2.9784, + "step": 2529 + }, + { + "epoch": 1.6930635113099468, + "grad_norm": 5.903868675231934, + "learning_rate": 2.374443663724513e-05, + "loss": 2.7955, + "step": 2530 + }, + { + "epoch": 1.693732491533219, + "grad_norm": 6.892399311065674, + "learning_rate": 2.3724987304503152e-05, + "loss": 2.7213, + "step": 2531 + }, + { + "epoch": 1.6944014717564913, + "grad_norm": 4.848674774169922, + "learning_rate": 2.3705538745434804e-05, + "loss": 2.8574, + "step": 2532 + }, + { + "epoch": 1.6950704519797632, + "grad_norm": 4.339311599731445, + "learning_rate": 2.3686090971841395e-05, + "loss": 2.878, + "step": 2533 + }, + { + "epoch": 1.6957394322030355, + "grad_norm": 7.817746639251709, + "learning_rate": 2.366664399552378e-05, + "loss": 2.6093, + "step": 2534 + }, + { + "epoch": 1.6964084124263077, + "grad_norm": 5.182209491729736, + "learning_rate": 2.364719782828231e-05, + "loss": 2.8362, + "step": 2535 + }, + { + "epoch": 1.6970773926495797, + "grad_norm": 5.601119518280029, + "learning_rate": 2.362775248191687e-05, + "loss": 2.7463, + "step": 2536 + }, + { + "epoch": 1.697746372872852, + "grad_norm": 6.002682685852051, + "learning_rate": 2.3608307968226826e-05, + "loss": 2.8956, + "step": 2537 + }, + { + "epoch": 1.6984153530961241, + "grad_norm": 7.065710067749023, + "learning_rate": 2.3588864299011055e-05, + "loss": 2.5411, + "step": 2538 + }, + { + "epoch": 1.6990843333193961, + "grad_norm": 5.531703948974609, + "learning_rate": 2.3569421486067894e-05, + "loss": 2.5599, + "step": 2539 + }, + { + "epoch": 1.6997533135426683, + "grad_norm": 7.126420497894287, + "learning_rate": 2.3549979541195204e-05, + "loss": 2.8409, + "step": 2540 + }, + { + "epoch": 1.7004222937659406, + "grad_norm": 5.211402893066406, + "learning_rate": 2.3530538476190276e-05, + "loss": 2.695, + "step": 2541 + }, + { + "epoch": 1.7010912739892126, + "grad_norm": 7.738270282745361, + "learning_rate": 2.3511098302849902e-05, + "loss": 2.6719, + "step": 2542 + }, + { + "epoch": 1.701760254212485, + "grad_norm": 4.948130130767822, + "learning_rate": 2.3491659032970295e-05, + "loss": 2.6603, + "step": 2543 + }, + { + "epoch": 1.702429234435757, + "grad_norm": 5.112238883972168, + "learning_rate": 2.347222067834717e-05, + "loss": 2.7037, + "step": 2544 + }, + { + "epoch": 1.703098214659029, + "grad_norm": 4.915751934051514, + "learning_rate": 2.345278325077563e-05, + "loss": 2.7256, + "step": 2545 + }, + { + "epoch": 1.7037671948823014, + "grad_norm": 6.279435634613037, + "learning_rate": 2.3433346762050268e-05, + "loss": 2.9779, + "step": 2546 + }, + { + "epoch": 1.7044361751055734, + "grad_norm": 3.470745801925659, + "learning_rate": 2.3413911223965062e-05, + "loss": 2.4658, + "step": 2547 + }, + { + "epoch": 1.7051051553288454, + "grad_norm": 4.6231160163879395, + "learning_rate": 2.3394476648313453e-05, + "loss": 2.7526, + "step": 2548 + }, + { + "epoch": 1.7057741355521179, + "grad_norm": 6.271169185638428, + "learning_rate": 2.3375043046888268e-05, + "loss": 2.7204, + "step": 2549 + }, + { + "epoch": 1.7064431157753899, + "grad_norm": 6.299276351928711, + "learning_rate": 2.335561043148176e-05, + "loss": 2.6145, + "step": 2550 + }, + { + "epoch": 1.707112095998662, + "grad_norm": 7.203403949737549, + "learning_rate": 2.333617881388557e-05, + "loss": 2.9738, + "step": 2551 + }, + { + "epoch": 1.7077810762219343, + "grad_norm": 6.858221054077148, + "learning_rate": 2.3316748205890753e-05, + "loss": 2.9029, + "step": 2552 + }, + { + "epoch": 1.7084500564452063, + "grad_norm": 5.399645805358887, + "learning_rate": 2.3297318619287716e-05, + "loss": 2.6019, + "step": 2553 + }, + { + "epoch": 1.7091190366684785, + "grad_norm": 4.9139204025268555, + "learning_rate": 2.3277890065866304e-05, + "loss": 2.5849, + "step": 2554 + }, + { + "epoch": 1.7097880168917508, + "grad_norm": 6.575718402862549, + "learning_rate": 2.3258462557415673e-05, + "loss": 2.7456, + "step": 2555 + }, + { + "epoch": 1.7104569971150227, + "grad_norm": 5.743617057800293, + "learning_rate": 2.323903610572439e-05, + "loss": 2.9058, + "step": 2556 + }, + { + "epoch": 1.711125977338295, + "grad_norm": 5.065006256103516, + "learning_rate": 2.321961072258035e-05, + "loss": 2.811, + "step": 2557 + }, + { + "epoch": 1.7117949575615672, + "grad_norm": 3.897226095199585, + "learning_rate": 2.3200186419770826e-05, + "loss": 2.5951, + "step": 2558 + }, + { + "epoch": 1.7124639377848392, + "grad_norm": 4.879258632659912, + "learning_rate": 2.3180763209082415e-05, + "loss": 2.7645, + "step": 2559 + }, + { + "epoch": 1.7131329180081114, + "grad_norm": 5.2255072593688965, + "learning_rate": 2.3161341102301065e-05, + "loss": 2.7062, + "step": 2560 + }, + { + "epoch": 1.7138018982313836, + "grad_norm": 5.900786399841309, + "learning_rate": 2.3141920111212035e-05, + "loss": 2.7575, + "step": 2561 + }, + { + "epoch": 1.7144708784546556, + "grad_norm": 5.447509288787842, + "learning_rate": 2.312250024759994e-05, + "loss": 2.7415, + "step": 2562 + }, + { + "epoch": 1.7151398586779278, + "grad_norm": 6.256686687469482, + "learning_rate": 2.310308152324867e-05, + "loss": 3.0382, + "step": 2563 + }, + { + "epoch": 1.7158088389012, + "grad_norm": 4.430761814117432, + "learning_rate": 2.3083663949941463e-05, + "loss": 2.7999, + "step": 2564 + }, + { + "epoch": 1.716477819124472, + "grad_norm": 5.721620559692383, + "learning_rate": 2.3064247539460827e-05, + "loss": 2.8862, + "step": 2565 + }, + { + "epoch": 1.7171467993477443, + "grad_norm": 5.0352935791015625, + "learning_rate": 2.3044832303588586e-05, + "loss": 2.8294, + "step": 2566 + }, + { + "epoch": 1.7178157795710165, + "grad_norm": 5.007111072540283, + "learning_rate": 2.302541825410584e-05, + "loss": 2.7377, + "step": 2567 + }, + { + "epoch": 1.7184847597942885, + "grad_norm": 5.62821102142334, + "learning_rate": 2.3006005402792976e-05, + "loss": 2.6374, + "step": 2568 + }, + { + "epoch": 1.7191537400175607, + "grad_norm": 9.058024406433105, + "learning_rate": 2.2986593761429638e-05, + "loss": 2.7939, + "step": 2569 + }, + { + "epoch": 1.719822720240833, + "grad_norm": 4.320191383361816, + "learning_rate": 2.2967183341794768e-05, + "loss": 2.831, + "step": 2570 + }, + { + "epoch": 1.720491700464105, + "grad_norm": 4.51514196395874, + "learning_rate": 2.294777415566652e-05, + "loss": 2.7715, + "step": 2571 + }, + { + "epoch": 1.7211606806873772, + "grad_norm": 7.908797740936279, + "learning_rate": 2.292836621482235e-05, + "loss": 2.7382, + "step": 2572 + }, + { + "epoch": 1.7218296609106494, + "grad_norm": 5.191076755523682, + "learning_rate": 2.290895953103892e-05, + "loss": 2.753, + "step": 2573 + }, + { + "epoch": 1.7224986411339214, + "grad_norm": 5.839506149291992, + "learning_rate": 2.2889554116092153e-05, + "loss": 2.7941, + "step": 2574 + }, + { + "epoch": 1.7231676213571936, + "grad_norm": 4.96605920791626, + "learning_rate": 2.2870149981757177e-05, + "loss": 2.588, + "step": 2575 + }, + { + "epoch": 1.7238366015804658, + "grad_norm": 5.3897833824157715, + "learning_rate": 2.285074713980838e-05, + "loss": 2.7056, + "step": 2576 + }, + { + "epoch": 1.7245055818037378, + "grad_norm": 4.577094554901123, + "learning_rate": 2.283134560201933e-05, + "loss": 2.8574, + "step": 2577 + }, + { + "epoch": 1.72517456202701, + "grad_norm": 8.089116096496582, + "learning_rate": 2.2811945380162824e-05, + "loss": 2.7452, + "step": 2578 + }, + { + "epoch": 1.7258435422502822, + "grad_norm": 10.091802597045898, + "learning_rate": 2.2792546486010847e-05, + "loss": 2.9093, + "step": 2579 + }, + { + "epoch": 1.7265125224735542, + "grad_norm": 8.744396209716797, + "learning_rate": 2.2773148931334594e-05, + "loss": 2.7463, + "step": 2580 + }, + { + "epoch": 1.7271815026968267, + "grad_norm": 5.300685405731201, + "learning_rate": 2.275375272790444e-05, + "loss": 2.885, + "step": 2581 + }, + { + "epoch": 1.7278504829200987, + "grad_norm": 4.582159042358398, + "learning_rate": 2.2734357887489934e-05, + "loss": 2.3113, + "step": 2582 + }, + { + "epoch": 1.7285194631433707, + "grad_norm": 4.907561302185059, + "learning_rate": 2.2714964421859792e-05, + "loss": 2.7512, + "step": 2583 + }, + { + "epoch": 1.7291884433666431, + "grad_norm": 4.984943389892578, + "learning_rate": 2.2695572342781935e-05, + "loss": 2.5355, + "step": 2584 + }, + { + "epoch": 1.7298574235899151, + "grad_norm": 6.7600932121276855, + "learning_rate": 2.2676181662023387e-05, + "loss": 2.7291, + "step": 2585 + }, + { + "epoch": 1.7305264038131871, + "grad_norm": 4.853344440460205, + "learning_rate": 2.2656792391350375e-05, + "loss": 2.3747, + "step": 2586 + }, + { + "epoch": 1.7311953840364596, + "grad_norm": 3.686701774597168, + "learning_rate": 2.2637404542528217e-05, + "loss": 2.7271, + "step": 2587 + }, + { + "epoch": 1.7318643642597316, + "grad_norm": 6.155206203460693, + "learning_rate": 2.2618018127321427e-05, + "loss": 2.6733, + "step": 2588 + }, + { + "epoch": 1.7325333444830036, + "grad_norm": 7.384573936462402, + "learning_rate": 2.2598633157493604e-05, + "loss": 2.9799, + "step": 2589 + }, + { + "epoch": 1.733202324706276, + "grad_norm": 4.645562648773193, + "learning_rate": 2.257924964480749e-05, + "loss": 2.6954, + "step": 2590 + }, + { + "epoch": 1.733871304929548, + "grad_norm": 5.855587482452393, + "learning_rate": 2.2559867601024927e-05, + "loss": 2.7054, + "step": 2591 + }, + { + "epoch": 1.7345402851528202, + "grad_norm": 5.897169589996338, + "learning_rate": 2.2540487037906906e-05, + "loss": 2.9156, + "step": 2592 + }, + { + "epoch": 1.7352092653760924, + "grad_norm": 6.18589973449707, + "learning_rate": 2.2521107967213454e-05, + "loss": 2.5325, + "step": 2593 + }, + { + "epoch": 1.7358782455993644, + "grad_norm": 7.244022369384766, + "learning_rate": 2.250173040070376e-05, + "loss": 3.0386, + "step": 2594 + }, + { + "epoch": 1.7365472258226367, + "grad_norm": 5.8611955642700195, + "learning_rate": 2.2482354350136045e-05, + "loss": 3.0563, + "step": 2595 + }, + { + "epoch": 1.7372162060459089, + "grad_norm": 7.672198295593262, + "learning_rate": 2.246297982726765e-05, + "loss": 2.8646, + "step": 2596 + }, + { + "epoch": 1.7378851862691809, + "grad_norm": 4.697488307952881, + "learning_rate": 2.2443606843854964e-05, + "loss": 2.7142, + "step": 2597 + }, + { + "epoch": 1.738554166492453, + "grad_norm": 4.74916410446167, + "learning_rate": 2.2424235411653466e-05, + "loss": 2.5616, + "step": 2598 + }, + { + "epoch": 1.7392231467157253, + "grad_norm": 7.083244323730469, + "learning_rate": 2.240486554241766e-05, + "loss": 2.6612, + "step": 2599 + }, + { + "epoch": 1.7398921269389973, + "grad_norm": 7.194074630737305, + "learning_rate": 2.238549724790114e-05, + "loss": 2.8222, + "step": 2600 + }, + { + "epoch": 1.7405611071622695, + "grad_norm": 5.098743915557861, + "learning_rate": 2.23661305398565e-05, + "loss": 2.7619, + "step": 2601 + }, + { + "epoch": 1.7412300873855417, + "grad_norm": 6.6131134033203125, + "learning_rate": 2.234676543003542e-05, + "loss": 3.2162, + "step": 2602 + }, + { + "epoch": 1.7418990676088137, + "grad_norm": 6.013036727905273, + "learning_rate": 2.232740193018858e-05, + "loss": 2.8715, + "step": 2603 + }, + { + "epoch": 1.742568047832086, + "grad_norm": 4.700963973999023, + "learning_rate": 2.230804005206569e-05, + "loss": 2.4515, + "step": 2604 + }, + { + "epoch": 1.7432370280553582, + "grad_norm": 6.152329444885254, + "learning_rate": 2.228867980741546e-05, + "loss": 2.8436, + "step": 2605 + }, + { + "epoch": 1.7439060082786302, + "grad_norm": 5.965137004852295, + "learning_rate": 2.2269321207985645e-05, + "loss": 2.9214, + "step": 2606 + }, + { + "epoch": 1.7445749885019024, + "grad_norm": 3.820052146911621, + "learning_rate": 2.2249964265522972e-05, + "loss": 2.6333, + "step": 2607 + }, + { + "epoch": 1.7452439687251746, + "grad_norm": 5.213476181030273, + "learning_rate": 2.223060899177318e-05, + "loss": 2.8786, + "step": 2608 + }, + { + "epoch": 1.7459129489484466, + "grad_norm": 5.23811149597168, + "learning_rate": 2.2211255398480967e-05, + "loss": 2.7324, + "step": 2609 + }, + { + "epoch": 1.7465819291717188, + "grad_norm": 4.518553256988525, + "learning_rate": 2.2191903497390057e-05, + "loss": 2.5349, + "step": 2610 + }, + { + "epoch": 1.747250909394991, + "grad_norm": 5.865686893463135, + "learning_rate": 2.2172553300243103e-05, + "loss": 2.7363, + "step": 2611 + }, + { + "epoch": 1.747919889618263, + "grad_norm": 4.556504249572754, + "learning_rate": 2.2153204818781757e-05, + "loss": 2.9129, + "step": 2612 + }, + { + "epoch": 1.7485888698415353, + "grad_norm": 6.721836566925049, + "learning_rate": 2.2133858064746592e-05, + "loss": 2.8609, + "step": 2613 + }, + { + "epoch": 1.7492578500648075, + "grad_norm": 7.764565944671631, + "learning_rate": 2.2114513049877195e-05, + "loss": 3.3474, + "step": 2614 + }, + { + "epoch": 1.7499268302880795, + "grad_norm": 6.830298900604248, + "learning_rate": 2.2095169785912015e-05, + "loss": 2.5932, + "step": 2615 + }, + { + "epoch": 1.7505958105113517, + "grad_norm": 4.948487281799316, + "learning_rate": 2.2075828284588514e-05, + "loss": 2.5113, + "step": 2616 + }, + { + "epoch": 1.751264790734624, + "grad_norm": 4.6457839012146, + "learning_rate": 2.205648855764304e-05, + "loss": 2.807, + "step": 2617 + }, + { + "epoch": 1.751933770957896, + "grad_norm": 5.526391506195068, + "learning_rate": 2.2037150616810883e-05, + "loss": 2.76, + "step": 2618 + }, + { + "epoch": 1.7526027511811684, + "grad_norm": 10.367713928222656, + "learning_rate": 2.2017814473826232e-05, + "loss": 2.5667, + "step": 2619 + }, + { + "epoch": 1.7532717314044404, + "grad_norm": 4.087628364562988, + "learning_rate": 2.1998480140422214e-05, + "loss": 2.9489, + "step": 2620 + }, + { + "epoch": 1.7539407116277124, + "grad_norm": 6.0001959800720215, + "learning_rate": 2.197914762833083e-05, + "loss": 2.5951, + "step": 2621 + }, + { + "epoch": 1.7546096918509848, + "grad_norm": 3.751884698867798, + "learning_rate": 2.195981694928299e-05, + "loss": 2.2341, + "step": 2622 + }, + { + "epoch": 1.7552786720742568, + "grad_norm": 7.510499477386475, + "learning_rate": 2.1940488115008475e-05, + "loss": 2.8456, + "step": 2623 + }, + { + "epoch": 1.7559476522975288, + "grad_norm": 5.829528331756592, + "learning_rate": 2.1921161137235986e-05, + "loss": 2.8387, + "step": 2624 + }, + { + "epoch": 1.7566166325208012, + "grad_norm": 4.678679466247559, + "learning_rate": 2.1901836027693047e-05, + "loss": 2.6812, + "step": 2625 + }, + { + "epoch": 1.7572856127440732, + "grad_norm": 5.86160135269165, + "learning_rate": 2.188251279810609e-05, + "loss": 2.946, + "step": 2626 + }, + { + "epoch": 1.7579545929673452, + "grad_norm": 7.377922058105469, + "learning_rate": 2.1863191460200366e-05, + "loss": 2.8606, + "step": 2627 + }, + { + "epoch": 1.7586235731906177, + "grad_norm": 5.642475605010986, + "learning_rate": 2.184387202570003e-05, + "loss": 2.6162, + "step": 2628 + }, + { + "epoch": 1.7592925534138897, + "grad_norm": 5.498693943023682, + "learning_rate": 2.182455450632803e-05, + "loss": 2.7145, + "step": 2629 + }, + { + "epoch": 1.759961533637162, + "grad_norm": 5.875025749206543, + "learning_rate": 2.180523891380619e-05, + "loss": 2.6928, + "step": 2630 + }, + { + "epoch": 1.7606305138604341, + "grad_norm": 5.917715072631836, + "learning_rate": 2.1785925259855135e-05, + "loss": 2.7247, + "step": 2631 + }, + { + "epoch": 1.7612994940837061, + "grad_norm": 6.21785831451416, + "learning_rate": 2.1766613556194347e-05, + "loss": 2.7966, + "step": 2632 + }, + { + "epoch": 1.7619684743069783, + "grad_norm": 4.751443386077881, + "learning_rate": 2.1747303814542087e-05, + "loss": 2.6551, + "step": 2633 + }, + { + "epoch": 1.7626374545302506, + "grad_norm": 6.275938510894775, + "learning_rate": 2.172799604661546e-05, + "loss": 2.9517, + "step": 2634 + }, + { + "epoch": 1.7633064347535226, + "grad_norm": 5.280247688293457, + "learning_rate": 2.1708690264130342e-05, + "loss": 2.7783, + "step": 2635 + }, + { + "epoch": 1.7639754149767948, + "grad_norm": 6.113962173461914, + "learning_rate": 2.1689386478801438e-05, + "loss": 2.9751, + "step": 2636 + }, + { + "epoch": 1.764644395200067, + "grad_norm": 6.94852352142334, + "learning_rate": 2.1670084702342204e-05, + "loss": 2.8749, + "step": 2637 + }, + { + "epoch": 1.765313375423339, + "grad_norm": 6.6532392501831055, + "learning_rate": 2.165078494646491e-05, + "loss": 2.6929, + "step": 2638 + }, + { + "epoch": 1.7659823556466112, + "grad_norm": 5.977123737335205, + "learning_rate": 2.163148722288058e-05, + "loss": 2.8827, + "step": 2639 + }, + { + "epoch": 1.7666513358698834, + "grad_norm": 5.5445427894592285, + "learning_rate": 2.161219154329902e-05, + "loss": 2.7309, + "step": 2640 + }, + { + "epoch": 1.7673203160931554, + "grad_norm": 5.397950649261475, + "learning_rate": 2.1592897919428765e-05, + "loss": 2.9227, + "step": 2641 + }, + { + "epoch": 1.7679892963164276, + "grad_norm": 5.681791305541992, + "learning_rate": 2.157360636297715e-05, + "loss": 2.6949, + "step": 2642 + }, + { + "epoch": 1.7686582765396999, + "grad_norm": 5.597262382507324, + "learning_rate": 2.155431688565021e-05, + "loss": 2.784, + "step": 2643 + }, + { + "epoch": 1.7693272567629719, + "grad_norm": 7.133121967315674, + "learning_rate": 2.1535029499152757e-05, + "loss": 2.8139, + "step": 2644 + }, + { + "epoch": 1.769996236986244, + "grad_norm": 4.787456035614014, + "learning_rate": 2.151574421518829e-05, + "loss": 2.7211, + "step": 2645 + }, + { + "epoch": 1.7706652172095163, + "grad_norm": 9.780879974365234, + "learning_rate": 2.1496461045459083e-05, + "loss": 2.9031, + "step": 2646 + }, + { + "epoch": 1.7713341974327883, + "grad_norm": 6.123249530792236, + "learning_rate": 2.1477180001666084e-05, + "loss": 2.6404, + "step": 2647 + }, + { + "epoch": 1.7720031776560605, + "grad_norm": 5.120070934295654, + "learning_rate": 2.145790109550898e-05, + "loss": 2.6474, + "step": 2648 + }, + { + "epoch": 1.7726721578793327, + "grad_norm": 6.845491886138916, + "learning_rate": 2.1438624338686135e-05, + "loss": 2.9112, + "step": 2649 + }, + { + "epoch": 1.7733411381026047, + "grad_norm": 7.510354042053223, + "learning_rate": 2.1419349742894645e-05, + "loss": 2.7838, + "step": 2650 + }, + { + "epoch": 1.774010118325877, + "grad_norm": 4.251450061798096, + "learning_rate": 2.1400077319830255e-05, + "loss": 2.5678, + "step": 2651 + }, + { + "epoch": 1.7746790985491492, + "grad_norm": 7.768141269683838, + "learning_rate": 2.138080708118742e-05, + "loss": 2.8773, + "step": 2652 + }, + { + "epoch": 1.7753480787724212, + "grad_norm": 4.0545549392700195, + "learning_rate": 2.1361539038659246e-05, + "loss": 2.7939, + "step": 2653 + }, + { + "epoch": 1.7760170589956934, + "grad_norm": 4.221911430358887, + "learning_rate": 2.134227320393754e-05, + "loss": 2.6346, + "step": 2654 + }, + { + "epoch": 1.7766860392189656, + "grad_norm": 5.212001800537109, + "learning_rate": 2.1323009588712723e-05, + "loss": 2.7204, + "step": 2655 + }, + { + "epoch": 1.7773550194422376, + "grad_norm": 5.01778507232666, + "learning_rate": 2.130374820467392e-05, + "loss": 2.8009, + "step": 2656 + }, + { + "epoch": 1.7780239996655098, + "grad_norm": 5.883147716522217, + "learning_rate": 2.1284489063508863e-05, + "loss": 2.7216, + "step": 2657 + }, + { + "epoch": 1.778692979888782, + "grad_norm": 7.345731735229492, + "learning_rate": 2.126523217690394e-05, + "loss": 2.928, + "step": 2658 + }, + { + "epoch": 1.779361960112054, + "grad_norm": 6.369029521942139, + "learning_rate": 2.1245977556544157e-05, + "loss": 2.7213, + "step": 2659 + }, + { + "epoch": 1.7800309403353265, + "grad_norm": 8.463759422302246, + "learning_rate": 2.122672521411318e-05, + "loss": 2.7376, + "step": 2660 + }, + { + "epoch": 1.7806999205585985, + "grad_norm": 7.220532417297363, + "learning_rate": 2.1207475161293255e-05, + "loss": 2.9469, + "step": 2661 + }, + { + "epoch": 1.7813689007818705, + "grad_norm": 7.100991249084473, + "learning_rate": 2.1188227409765255e-05, + "loss": 2.9285, + "step": 2662 + }, + { + "epoch": 1.782037881005143, + "grad_norm": 6.796350002288818, + "learning_rate": 2.116898197120864e-05, + "loss": 2.7046, + "step": 2663 + }, + { + "epoch": 1.782706861228415, + "grad_norm": 5.8627753257751465, + "learning_rate": 2.1149738857301503e-05, + "loss": 2.6686, + "step": 2664 + }, + { + "epoch": 1.783375841451687, + "grad_norm": 4.756777763366699, + "learning_rate": 2.1130498079720493e-05, + "loss": 2.7317, + "step": 2665 + }, + { + "epoch": 1.7840448216749594, + "grad_norm": 4.591255187988281, + "learning_rate": 2.1111259650140854e-05, + "loss": 2.4963, + "step": 2666 + }, + { + "epoch": 1.7847138018982314, + "grad_norm": 5.408332824707031, + "learning_rate": 2.1092023580236394e-05, + "loss": 2.5654, + "step": 2667 + }, + { + "epoch": 1.7853827821215036, + "grad_norm": 4.69365930557251, + "learning_rate": 2.1072789881679514e-05, + "loss": 2.4306, + "step": 2668 + }, + { + "epoch": 1.7860517623447758, + "grad_norm": 5.50496244430542, + "learning_rate": 2.105355856614115e-05, + "loss": 2.7352, + "step": 2669 + }, + { + "epoch": 1.7867207425680478, + "grad_norm": 5.13930082321167, + "learning_rate": 2.1034329645290813e-05, + "loss": 2.6872, + "step": 2670 + }, + { + "epoch": 1.78738972279132, + "grad_norm": 6.122743129730225, + "learning_rate": 2.101510313079653e-05, + "loss": 2.9768, + "step": 2671 + }, + { + "epoch": 1.7880587030145922, + "grad_norm": 5.980995178222656, + "learning_rate": 2.0995879034324915e-05, + "loss": 2.9266, + "step": 2672 + }, + { + "epoch": 1.7887276832378642, + "grad_norm": 6.114793300628662, + "learning_rate": 2.0976657367541068e-05, + "loss": 2.8527, + "step": 2673 + }, + { + "epoch": 1.7893966634611365, + "grad_norm": 4.264657497406006, + "learning_rate": 2.095743814210865e-05, + "loss": 2.8714, + "step": 2674 + }, + { + "epoch": 1.7900656436844087, + "grad_norm": 4.827832221984863, + "learning_rate": 2.0938221369689806e-05, + "loss": 2.8182, + "step": 2675 + }, + { + "epoch": 1.7907346239076807, + "grad_norm": 5.008242607116699, + "learning_rate": 2.091900706194524e-05, + "loss": 2.5002, + "step": 2676 + }, + { + "epoch": 1.791403604130953, + "grad_norm": 5.181432247161865, + "learning_rate": 2.0899795230534097e-05, + "loss": 2.616, + "step": 2677 + }, + { + "epoch": 1.7920725843542251, + "grad_norm": 7.064171314239502, + "learning_rate": 2.0880585887114086e-05, + "loss": 2.7894, + "step": 2678 + }, + { + "epoch": 1.7927415645774971, + "grad_norm": 5.147838115692139, + "learning_rate": 2.0861379043341357e-05, + "loss": 2.6627, + "step": 2679 + }, + { + "epoch": 1.7934105448007693, + "grad_norm": 7.886783599853516, + "learning_rate": 2.0842174710870575e-05, + "loss": 2.7651, + "step": 2680 + }, + { + "epoch": 1.7940795250240416, + "grad_norm": 4.25205135345459, + "learning_rate": 2.0822972901354844e-05, + "loss": 2.7712, + "step": 2681 + }, + { + "epoch": 1.7947485052473136, + "grad_norm": 5.357091426849365, + "learning_rate": 2.080377362644579e-05, + "loss": 2.6271, + "step": 2682 + }, + { + "epoch": 1.7954174854705858, + "grad_norm": 3.9298439025878906, + "learning_rate": 2.0784576897793452e-05, + "loss": 2.7159, + "step": 2683 + }, + { + "epoch": 1.796086465693858, + "grad_norm": 6.184408664703369, + "learning_rate": 2.0765382727046353e-05, + "loss": 2.6444, + "step": 2684 + }, + { + "epoch": 1.79675544591713, + "grad_norm": 4.316830635070801, + "learning_rate": 2.074619112585144e-05, + "loss": 2.7326, + "step": 2685 + }, + { + "epoch": 1.7974244261404022, + "grad_norm": 5.157546043395996, + "learning_rate": 2.0727002105854136e-05, + "loss": 2.558, + "step": 2686 + }, + { + "epoch": 1.7980934063636744, + "grad_norm": 5.2561421394348145, + "learning_rate": 2.070781567869826e-05, + "loss": 2.6098, + "step": 2687 + }, + { + "epoch": 1.7987623865869464, + "grad_norm": 6.838529586791992, + "learning_rate": 2.0688631856026088e-05, + "loss": 2.958, + "step": 2688 + }, + { + "epoch": 1.7994313668102186, + "grad_norm": 6.147638320922852, + "learning_rate": 2.0669450649478283e-05, + "loss": 2.7633, + "step": 2689 + }, + { + "epoch": 1.8001003470334909, + "grad_norm": 5.286609172821045, + "learning_rate": 2.065027207069396e-05, + "loss": 3.0781, + "step": 2690 + }, + { + "epoch": 1.8007693272567629, + "grad_norm": 4.2134108543396, + "learning_rate": 2.063109613131061e-05, + "loss": 2.7279, + "step": 2691 + }, + { + "epoch": 1.801438307480035, + "grad_norm": 5.509737014770508, + "learning_rate": 2.0611922842964135e-05, + "loss": 2.8086, + "step": 2692 + }, + { + "epoch": 1.8021072877033073, + "grad_norm": 3.9975788593292236, + "learning_rate": 2.059275221728881e-05, + "loss": 2.8087, + "step": 2693 + }, + { + "epoch": 1.8027762679265793, + "grad_norm": 5.958218097686768, + "learning_rate": 2.0573584265917332e-05, + "loss": 3.0113, + "step": 2694 + }, + { + "epoch": 1.8034452481498515, + "grad_norm": 4.695807456970215, + "learning_rate": 2.055441900048074e-05, + "loss": 2.832, + "step": 2695 + }, + { + "epoch": 1.8041142283731237, + "grad_norm": 6.1919121742248535, + "learning_rate": 2.0535256432608464e-05, + "loss": 2.9007, + "step": 2696 + }, + { + "epoch": 1.8047832085963957, + "grad_norm": 6.809114456176758, + "learning_rate": 2.051609657392827e-05, + "loss": 2.8798, + "step": 2697 + }, + { + "epoch": 1.8054521888196682, + "grad_norm": 4.31395959854126, + "learning_rate": 2.0496939436066324e-05, + "loss": 2.6148, + "step": 2698 + }, + { + "epoch": 1.8061211690429402, + "grad_norm": 4.20595121383667, + "learning_rate": 2.047778503064709e-05, + "loss": 2.6138, + "step": 2699 + }, + { + "epoch": 1.8067901492662122, + "grad_norm": 4.048036098480225, + "learning_rate": 2.0458633369293424e-05, + "loss": 2.6183, + "step": 2700 + }, + { + "epoch": 1.8074591294894846, + "grad_norm": 6.035735607147217, + "learning_rate": 2.0439484463626475e-05, + "loss": 2.5516, + "step": 2701 + }, + { + "epoch": 1.8081281097127566, + "grad_norm": 6.940889835357666, + "learning_rate": 2.042033832526575e-05, + "loss": 2.6495, + "step": 2702 + }, + { + "epoch": 1.8087970899360286, + "grad_norm": 5.333974361419678, + "learning_rate": 2.0401194965829048e-05, + "loss": 2.5791, + "step": 2703 + }, + { + "epoch": 1.809466070159301, + "grad_norm": 4.075145244598389, + "learning_rate": 2.038205439693252e-05, + "loss": 2.6834, + "step": 2704 + }, + { + "epoch": 1.810135050382573, + "grad_norm": 5.112520217895508, + "learning_rate": 2.0362916630190587e-05, + "loss": 2.7141, + "step": 2705 + }, + { + "epoch": 1.810804030605845, + "grad_norm": 6.721662521362305, + "learning_rate": 2.0343781677215992e-05, + "loss": 2.7941, + "step": 2706 + }, + { + "epoch": 1.8114730108291175, + "grad_norm": 6.038949489593506, + "learning_rate": 2.032464954961975e-05, + "loss": 2.5827, + "step": 2707 + }, + { + "epoch": 1.8121419910523895, + "grad_norm": 5.384756565093994, + "learning_rate": 2.0305520259011195e-05, + "loss": 2.9197, + "step": 2708 + }, + { + "epoch": 1.8128109712756617, + "grad_norm": 4.2328081130981445, + "learning_rate": 2.0286393816997905e-05, + "loss": 2.7072, + "step": 2709 + }, + { + "epoch": 1.813479951498934, + "grad_norm": 5.852884292602539, + "learning_rate": 2.0267270235185748e-05, + "loss": 2.8412, + "step": 2710 + }, + { + "epoch": 1.814148931722206, + "grad_norm": 5.757684707641602, + "learning_rate": 2.0248149525178846e-05, + "loss": 2.8457, + "step": 2711 + }, + { + "epoch": 1.8148179119454781, + "grad_norm": 3.801441192626953, + "learning_rate": 2.02290316985796e-05, + "loss": 2.6505, + "step": 2712 + }, + { + "epoch": 1.8154868921687504, + "grad_norm": 6.909331321716309, + "learning_rate": 2.0209916766988627e-05, + "loss": 2.8613, + "step": 2713 + }, + { + "epoch": 1.8161558723920224, + "grad_norm": 4.29110050201416, + "learning_rate": 2.0190804742004823e-05, + "loss": 2.7479, + "step": 2714 + }, + { + "epoch": 1.8168248526152946, + "grad_norm": 5.398097038269043, + "learning_rate": 2.0171695635225286e-05, + "loss": 2.7035, + "step": 2715 + }, + { + "epoch": 1.8174938328385668, + "grad_norm": 4.887238502502441, + "learning_rate": 2.015258945824538e-05, + "loss": 2.666, + "step": 2716 + }, + { + "epoch": 1.8181628130618388, + "grad_norm": 4.847476959228516, + "learning_rate": 2.013348622265866e-05, + "loss": 2.6205, + "step": 2717 + }, + { + "epoch": 1.818831793285111, + "grad_norm": 6.064548492431641, + "learning_rate": 2.011438594005691e-05, + "loss": 2.5477, + "step": 2718 + }, + { + "epoch": 1.8195007735083832, + "grad_norm": 6.697597503662109, + "learning_rate": 2.009528862203012e-05, + "loss": 2.7419, + "step": 2719 + }, + { + "epoch": 1.8201697537316552, + "grad_norm": 5.916878700256348, + "learning_rate": 2.007619428016649e-05, + "loss": 2.6181, + "step": 2720 + }, + { + "epoch": 1.8208387339549275, + "grad_norm": 6.983180046081543, + "learning_rate": 2.0057102926052384e-05, + "loss": 2.7966, + "step": 2721 + }, + { + "epoch": 1.8215077141781997, + "grad_norm": 5.324311256408691, + "learning_rate": 2.00380145712724e-05, + "loss": 2.5784, + "step": 2722 + }, + { + "epoch": 1.8221766944014717, + "grad_norm": 4.834647178649902, + "learning_rate": 2.0018929227409276e-05, + "loss": 2.6259, + "step": 2723 + }, + { + "epoch": 1.822845674624744, + "grad_norm": 8.242607116699219, + "learning_rate": 1.9999846906043944e-05, + "loss": 2.9843, + "step": 2724 + }, + { + "epoch": 1.8235146548480161, + "grad_norm": 7.778139591217041, + "learning_rate": 1.998076761875548e-05, + "loss": 2.7287, + "step": 2725 + }, + { + "epoch": 1.8241836350712881, + "grad_norm": 5.7373151779174805, + "learning_rate": 1.996169137712116e-05, + "loss": 2.6754, + "step": 2726 + }, + { + "epoch": 1.8248526152945603, + "grad_norm": 6.458043098449707, + "learning_rate": 1.9942618192716367e-05, + "loss": 3.1508, + "step": 2727 + }, + { + "epoch": 1.8255215955178326, + "grad_norm": 5.5279154777526855, + "learning_rate": 1.9923548077114657e-05, + "loss": 2.6276, + "step": 2728 + }, + { + "epoch": 1.8261905757411045, + "grad_norm": 5.591763496398926, + "learning_rate": 1.99044810418877e-05, + "loss": 2.6509, + "step": 2729 + }, + { + "epoch": 1.8268595559643768, + "grad_norm": 7.234933376312256, + "learning_rate": 1.9885417098605342e-05, + "loss": 2.9541, + "step": 2730 + }, + { + "epoch": 1.827528536187649, + "grad_norm": 6.901897430419922, + "learning_rate": 1.986635625883549e-05, + "loss": 2.717, + "step": 2731 + }, + { + "epoch": 1.828197516410921, + "grad_norm": 6.3411030769348145, + "learning_rate": 1.9847298534144225e-05, + "loss": 2.6243, + "step": 2732 + }, + { + "epoch": 1.8288664966341932, + "grad_norm": 3.945781707763672, + "learning_rate": 1.9828243936095696e-05, + "loss": 2.7439, + "step": 2733 + }, + { + "epoch": 1.8295354768574654, + "grad_norm": 4.212995529174805, + "learning_rate": 1.9809192476252187e-05, + "loss": 2.5473, + "step": 2734 + }, + { + "epoch": 1.8302044570807374, + "grad_norm": 6.0992584228515625, + "learning_rate": 1.979014416617405e-05, + "loss": 2.7305, + "step": 2735 + }, + { + "epoch": 1.8308734373040099, + "grad_norm": 7.614537715911865, + "learning_rate": 1.9771099017419746e-05, + "loss": 3.1265, + "step": 2736 + }, + { + "epoch": 1.8315424175272819, + "grad_norm": 5.30592155456543, + "learning_rate": 1.9752057041545803e-05, + "loss": 2.6206, + "step": 2737 + }, + { + "epoch": 1.8322113977505539, + "grad_norm": 8.460625648498535, + "learning_rate": 1.973301825010685e-05, + "loss": 2.8481, + "step": 2738 + }, + { + "epoch": 1.8328803779738263, + "grad_norm": 5.965261459350586, + "learning_rate": 1.9713982654655534e-05, + "loss": 2.6954, + "step": 2739 + }, + { + "epoch": 1.8335493581970983, + "grad_norm": 7.425351142883301, + "learning_rate": 1.9694950266742622e-05, + "loss": 2.7809, + "step": 2740 + }, + { + "epoch": 1.8342183384203703, + "grad_norm": 6.250692844390869, + "learning_rate": 1.9675921097916887e-05, + "loss": 2.7152, + "step": 2741 + }, + { + "epoch": 1.8348873186436427, + "grad_norm": 6.669025421142578, + "learning_rate": 1.965689515972518e-05, + "loss": 2.9273, + "step": 2742 + }, + { + "epoch": 1.8355562988669147, + "grad_norm": 4.485771179199219, + "learning_rate": 1.9637872463712365e-05, + "loss": 2.562, + "step": 2743 + }, + { + "epoch": 1.8362252790901867, + "grad_norm": 7.555203437805176, + "learning_rate": 1.9618853021421373e-05, + "loss": 2.7249, + "step": 2744 + }, + { + "epoch": 1.8368942593134592, + "grad_norm": 5.925047397613525, + "learning_rate": 1.9599836844393122e-05, + "loss": 2.7757, + "step": 2745 + }, + { + "epoch": 1.8375632395367312, + "grad_norm": 7.533424377441406, + "learning_rate": 1.9580823944166583e-05, + "loss": 2.8247, + "step": 2746 + }, + { + "epoch": 1.8382322197600034, + "grad_norm": 7.545873641967773, + "learning_rate": 1.9561814332278704e-05, + "loss": 2.8188, + "step": 2747 + }, + { + "epoch": 1.8389011999832756, + "grad_norm": 5.071416854858398, + "learning_rate": 1.9542808020264474e-05, + "loss": 2.7274, + "step": 2748 + }, + { + "epoch": 1.8395701802065476, + "grad_norm": 5.147225379943848, + "learning_rate": 1.9523805019656854e-05, + "loss": 2.8018, + "step": 2749 + }, + { + "epoch": 1.8402391604298198, + "grad_norm": 6.20609188079834, + "learning_rate": 1.9504805341986812e-05, + "loss": 2.7468, + "step": 2750 + }, + { + "epoch": 1.840908140653092, + "grad_norm": 5.602138996124268, + "learning_rate": 1.9485808998783275e-05, + "loss": 2.7135, + "step": 2751 + }, + { + "epoch": 1.841577120876364, + "grad_norm": 5.072507381439209, + "learning_rate": 1.9466816001573183e-05, + "loss": 2.6772, + "step": 2752 + }, + { + "epoch": 1.8422461010996363, + "grad_norm": 5.482741832733154, + "learning_rate": 1.944782636188141e-05, + "loss": 2.7309, + "step": 2753 + }, + { + "epoch": 1.8429150813229085, + "grad_norm": 5.543910980224609, + "learning_rate": 1.942884009123082e-05, + "loss": 2.9191, + "step": 2754 + }, + { + "epoch": 1.8435840615461805, + "grad_norm": 6.336831569671631, + "learning_rate": 1.9409857201142208e-05, + "loss": 2.7211, + "step": 2755 + }, + { + "epoch": 1.8442530417694527, + "grad_norm": 4.116778373718262, + "learning_rate": 1.939087770313435e-05, + "loss": 2.8932, + "step": 2756 + }, + { + "epoch": 1.844922021992725, + "grad_norm": 5.453319072723389, + "learning_rate": 1.9371901608723923e-05, + "loss": 2.7212, + "step": 2757 + }, + { + "epoch": 1.845591002215997, + "grad_norm": 5.0413289070129395, + "learning_rate": 1.9352928929425586e-05, + "loss": 2.515, + "step": 2758 + }, + { + "epoch": 1.8462599824392691, + "grad_norm": 3.929534912109375, + "learning_rate": 1.9333959676751863e-05, + "loss": 2.5231, + "step": 2759 + }, + { + "epoch": 1.8469289626625414, + "grad_norm": 5.582021713256836, + "learning_rate": 1.9314993862213283e-05, + "loss": 2.6033, + "step": 2760 + }, + { + "epoch": 1.8475979428858134, + "grad_norm": 5.542810916900635, + "learning_rate": 1.9296031497318194e-05, + "loss": 2.8874, + "step": 2761 + }, + { + "epoch": 1.8482669231090856, + "grad_norm": 5.410712242126465, + "learning_rate": 1.9277072593572933e-05, + "loss": 2.8732, + "step": 2762 + }, + { + "epoch": 1.8489359033323578, + "grad_norm": 5.130572319030762, + "learning_rate": 1.9258117162481686e-05, + "loss": 2.5383, + "step": 2763 + }, + { + "epoch": 1.8496048835556298, + "grad_norm": 4.82888650894165, + "learning_rate": 1.9239165215546556e-05, + "loss": 2.6045, + "step": 2764 + }, + { + "epoch": 1.850273863778902, + "grad_norm": 6.0040106773376465, + "learning_rate": 1.9220216764267508e-05, + "loss": 2.8346, + "step": 2765 + }, + { + "epoch": 1.8509428440021742, + "grad_norm": 3.895348072052002, + "learning_rate": 1.9201271820142422e-05, + "loss": 2.5414, + "step": 2766 + }, + { + "epoch": 1.8516118242254462, + "grad_norm": 4.430002689361572, + "learning_rate": 1.9182330394667016e-05, + "loss": 2.7993, + "step": 2767 + }, + { + "epoch": 1.8522808044487185, + "grad_norm": 6.686192512512207, + "learning_rate": 1.9163392499334896e-05, + "loss": 2.8709, + "step": 2768 + }, + { + "epoch": 1.8529497846719907, + "grad_norm": 5.229307174682617, + "learning_rate": 1.9144458145637498e-05, + "loss": 2.8088, + "step": 2769 + }, + { + "epoch": 1.8536187648952627, + "grad_norm": 5.271556854248047, + "learning_rate": 1.9125527345064152e-05, + "loss": 2.9849, + "step": 2770 + }, + { + "epoch": 1.854287745118535, + "grad_norm": 3.884953737258911, + "learning_rate": 1.9106600109101988e-05, + "loss": 2.742, + "step": 2771 + }, + { + "epoch": 1.8549567253418071, + "grad_norm": 6.681604385375977, + "learning_rate": 1.9087676449236007e-05, + "loss": 2.8005, + "step": 2772 + }, + { + "epoch": 1.855625705565079, + "grad_norm": 6.033226013183594, + "learning_rate": 1.9068756376949003e-05, + "loss": 2.7764, + "step": 2773 + }, + { + "epoch": 1.8562946857883513, + "grad_norm": 4.049818992614746, + "learning_rate": 1.9049839903721646e-05, + "loss": 2.5804, + "step": 2774 + }, + { + "epoch": 1.8569636660116235, + "grad_norm": 5.595391273498535, + "learning_rate": 1.903092704103236e-05, + "loss": 2.7069, + "step": 2775 + }, + { + "epoch": 1.8576326462348955, + "grad_norm": 6.733685493469238, + "learning_rate": 1.9012017800357437e-05, + "loss": 2.8359, + "step": 2776 + }, + { + "epoch": 1.858301626458168, + "grad_norm": 5.3344526290893555, + "learning_rate": 1.899311219317092e-05, + "loss": 2.6181, + "step": 2777 + }, + { + "epoch": 1.85897060668144, + "grad_norm": 6.081495761871338, + "learning_rate": 1.897421023094469e-05, + "loss": 2.9845, + "step": 2778 + }, + { + "epoch": 1.859639586904712, + "grad_norm": 5.906316757202148, + "learning_rate": 1.8955311925148387e-05, + "loss": 2.5286, + "step": 2779 + }, + { + "epoch": 1.8603085671279844, + "grad_norm": 5.377201557159424, + "learning_rate": 1.893641728724945e-05, + "loss": 2.6125, + "step": 2780 + }, + { + "epoch": 1.8609775473512564, + "grad_norm": 5.031513690948486, + "learning_rate": 1.891752632871306e-05, + "loss": 2.6942, + "step": 2781 + }, + { + "epoch": 1.8616465275745284, + "grad_norm": 5.494041919708252, + "learning_rate": 1.8898639061002234e-05, + "loss": 2.7891, + "step": 2782 + }, + { + "epoch": 1.8623155077978009, + "grad_norm": 7.990489482879639, + "learning_rate": 1.887975549557766e-05, + "loss": 2.9002, + "step": 2783 + }, + { + "epoch": 1.8629844880210729, + "grad_norm": 6.093759059906006, + "learning_rate": 1.8860875643897864e-05, + "loss": 2.9595, + "step": 2784 + }, + { + "epoch": 1.863653468244345, + "grad_norm": 5.460567474365234, + "learning_rate": 1.884199951741905e-05, + "loss": 2.7114, + "step": 2785 + }, + { + "epoch": 1.8643224484676173, + "grad_norm": 6.7065749168396, + "learning_rate": 1.8823127127595207e-05, + "loss": 2.9208, + "step": 2786 + }, + { + "epoch": 1.8649914286908893, + "grad_norm": 6.396170139312744, + "learning_rate": 1.8804258485878025e-05, + "loss": 2.9138, + "step": 2787 + }, + { + "epoch": 1.8656604089141615, + "grad_norm": 6.916308879852295, + "learning_rate": 1.8785393603716962e-05, + "loss": 3.1748, + "step": 2788 + }, + { + "epoch": 1.8663293891374337, + "grad_norm": 4.9034223556518555, + "learning_rate": 1.8766532492559144e-05, + "loss": 2.8398, + "step": 2789 + }, + { + "epoch": 1.8669983693607057, + "grad_norm": 6.835448741912842, + "learning_rate": 1.8747675163849445e-05, + "loss": 2.8856, + "step": 2790 + }, + { + "epoch": 1.867667349583978, + "grad_norm": 5.442148685455322, + "learning_rate": 1.872882162903042e-05, + "loss": 2.7276, + "step": 2791 + }, + { + "epoch": 1.8683363298072502, + "grad_norm": 6.819629192352295, + "learning_rate": 1.8709971899542352e-05, + "loss": 2.6695, + "step": 2792 + }, + { + "epoch": 1.8690053100305222, + "grad_norm": 7.096249103546143, + "learning_rate": 1.8691125986823182e-05, + "loss": 3.0836, + "step": 2793 + }, + { + "epoch": 1.8696742902537944, + "grad_norm": 6.612250328063965, + "learning_rate": 1.8672283902308557e-05, + "loss": 2.8811, + "step": 2794 + }, + { + "epoch": 1.8703432704770666, + "grad_norm": 5.536661148071289, + "learning_rate": 1.8653445657431777e-05, + "loss": 2.7357, + "step": 2795 + }, + { + "epoch": 1.8710122507003386, + "grad_norm": 5.26278829574585, + "learning_rate": 1.863461126362386e-05, + "loss": 2.3215, + "step": 2796 + }, + { + "epoch": 1.8716812309236108, + "grad_norm": 4.44549036026001, + "learning_rate": 1.8615780732313425e-05, + "loss": 2.5687, + "step": 2797 + }, + { + "epoch": 1.872350211146883, + "grad_norm": 4.844627380371094, + "learning_rate": 1.85969540749268e-05, + "loss": 2.6822, + "step": 2798 + }, + { + "epoch": 1.873019191370155, + "grad_norm": 8.582950592041016, + "learning_rate": 1.8578131302887915e-05, + "loss": 2.5262, + "step": 2799 + }, + { + "epoch": 1.8736881715934273, + "grad_norm": 5.140320777893066, + "learning_rate": 1.8559312427618397e-05, + "loss": 2.6723, + "step": 2800 + }, + { + "epoch": 1.8743571518166995, + "grad_norm": 5.917047023773193, + "learning_rate": 1.8540497460537466e-05, + "loss": 2.6551, + "step": 2801 + }, + { + "epoch": 1.8750261320399715, + "grad_norm": 5.449909210205078, + "learning_rate": 1.852168641306198e-05, + "loss": 2.6342, + "step": 2802 + }, + { + "epoch": 1.8756951122632437, + "grad_norm": 6.341294288635254, + "learning_rate": 1.8502879296606426e-05, + "loss": 2.7602, + "step": 2803 + }, + { + "epoch": 1.876364092486516, + "grad_norm": 6.9685750007629395, + "learning_rate": 1.848407612258291e-05, + "loss": 2.7687, + "step": 2804 + }, + { + "epoch": 1.877033072709788, + "grad_norm": 4.044394493103027, + "learning_rate": 1.8465276902401114e-05, + "loss": 2.4774, + "step": 2805 + }, + { + "epoch": 1.8777020529330601, + "grad_norm": 7.276406288146973, + "learning_rate": 1.844648164746837e-05, + "loss": 2.711, + "step": 2806 + }, + { + "epoch": 1.8783710331563324, + "grad_norm": 5.422701358795166, + "learning_rate": 1.8427690369189572e-05, + "loss": 2.9431, + "step": 2807 + }, + { + "epoch": 1.8790400133796044, + "grad_norm": 4.879177570343018, + "learning_rate": 1.8408903078967202e-05, + "loss": 2.445, + "step": 2808 + }, + { + "epoch": 1.8797089936028766, + "grad_norm": 4.497944355010986, + "learning_rate": 1.8390119788201322e-05, + "loss": 2.5336, + "step": 2809 + }, + { + "epoch": 1.8803779738261488, + "grad_norm": 4.598652362823486, + "learning_rate": 1.8371340508289592e-05, + "loss": 2.5773, + "step": 2810 + }, + { + "epoch": 1.8810469540494208, + "grad_norm": 4.848037242889404, + "learning_rate": 1.83525652506272e-05, + "loss": 2.7507, + "step": 2811 + }, + { + "epoch": 1.881715934272693, + "grad_norm": 4.2268805503845215, + "learning_rate": 1.8333794026606925e-05, + "loss": 2.7592, + "step": 2812 + }, + { + "epoch": 1.8823849144959652, + "grad_norm": 3.9678916931152344, + "learning_rate": 1.831502684761907e-05, + "loss": 2.592, + "step": 2813 + }, + { + "epoch": 1.8830538947192372, + "grad_norm": 6.395400524139404, + "learning_rate": 1.829626372505152e-05, + "loss": 2.8128, + "step": 2814 + }, + { + "epoch": 1.8837228749425097, + "grad_norm": 5.863958358764648, + "learning_rate": 1.8277504670289663e-05, + "loss": 2.619, + "step": 2815 + }, + { + "epoch": 1.8843918551657817, + "grad_norm": 5.843434810638428, + "learning_rate": 1.8258749694716443e-05, + "loss": 2.7406, + "step": 2816 + }, + { + "epoch": 1.8850608353890537, + "grad_norm": 6.783684253692627, + "learning_rate": 1.8239998809712302e-05, + "loss": 2.4084, + "step": 2817 + }, + { + "epoch": 1.885729815612326, + "grad_norm": 5.074789047241211, + "learning_rate": 1.822125202665524e-05, + "loss": 2.6946, + "step": 2818 + }, + { + "epoch": 1.886398795835598, + "grad_norm": 5.649902820587158, + "learning_rate": 1.8202509356920726e-05, + "loss": 2.5995, + "step": 2819 + }, + { + "epoch": 1.88706777605887, + "grad_norm": 6.75178861618042, + "learning_rate": 1.8183770811881766e-05, + "loss": 2.523, + "step": 2820 + }, + { + "epoch": 1.8877367562821425, + "grad_norm": 5.733412742614746, + "learning_rate": 1.816503640290883e-05, + "loss": 2.6101, + "step": 2821 + }, + { + "epoch": 1.8884057365054145, + "grad_norm": 5.465363025665283, + "learning_rate": 1.814630614136993e-05, + "loss": 2.7557, + "step": 2822 + }, + { + "epoch": 1.8890747167286865, + "grad_norm": 5.500278472900391, + "learning_rate": 1.8127580038630487e-05, + "loss": 2.8996, + "step": 2823 + }, + { + "epoch": 1.889743696951959, + "grad_norm": 4.765371799468994, + "learning_rate": 1.810885810605348e-05, + "loss": 2.6213, + "step": 2824 + }, + { + "epoch": 1.890412677175231, + "grad_norm": 4.7066264152526855, + "learning_rate": 1.8090140354999285e-05, + "loss": 2.8025, + "step": 2825 + }, + { + "epoch": 1.8910816573985032, + "grad_norm": 4.957799911499023, + "learning_rate": 1.8071426796825797e-05, + "loss": 2.6934, + "step": 2826 + }, + { + "epoch": 1.8917506376217754, + "grad_norm": 4.18173360824585, + "learning_rate": 1.8052717442888324e-05, + "loss": 2.453, + "step": 2827 + }, + { + "epoch": 1.8924196178450474, + "grad_norm": 4.871091842651367, + "learning_rate": 1.8034012304539664e-05, + "loss": 2.7906, + "step": 2828 + }, + { + "epoch": 1.8930885980683196, + "grad_norm": 5.089080810546875, + "learning_rate": 1.8015311393130014e-05, + "loss": 2.5184, + "step": 2829 + }, + { + "epoch": 1.8937575782915919, + "grad_norm": 5.239760875701904, + "learning_rate": 1.7996614720007043e-05, + "loss": 2.72, + "step": 2830 + }, + { + "epoch": 1.8944265585148639, + "grad_norm": 5.64327335357666, + "learning_rate": 1.7977922296515816e-05, + "loss": 2.7463, + "step": 2831 + }, + { + "epoch": 1.895095538738136, + "grad_norm": 5.543586254119873, + "learning_rate": 1.7959234133998853e-05, + "loss": 2.8013, + "step": 2832 + }, + { + "epoch": 1.8957645189614083, + "grad_norm": 4.5046820640563965, + "learning_rate": 1.794055024379606e-05, + "loss": 2.7221, + "step": 2833 + }, + { + "epoch": 1.8964334991846803, + "grad_norm": 4.879184722900391, + "learning_rate": 1.792187063724477e-05, + "loss": 2.7548, + "step": 2834 + }, + { + "epoch": 1.8971024794079525, + "grad_norm": 5.551717281341553, + "learning_rate": 1.790319532567969e-05, + "loss": 2.9741, + "step": 2835 + }, + { + "epoch": 1.8977714596312247, + "grad_norm": 6.884735584259033, + "learning_rate": 1.7884524320432967e-05, + "loss": 2.783, + "step": 2836 + }, + { + "epoch": 1.8984404398544967, + "grad_norm": 5.557075023651123, + "learning_rate": 1.7865857632834087e-05, + "loss": 2.7484, + "step": 2837 + }, + { + "epoch": 1.899109420077769, + "grad_norm": 4.404065132141113, + "learning_rate": 1.7847195274209946e-05, + "loss": 2.8067, + "step": 2838 + }, + { + "epoch": 1.8997784003010412, + "grad_norm": 5.520792484283447, + "learning_rate": 1.7828537255884793e-05, + "loss": 2.8406, + "step": 2839 + }, + { + "epoch": 1.9004473805243132, + "grad_norm": 7.45339822769165, + "learning_rate": 1.7809883589180266e-05, + "loss": 2.6373, + "step": 2840 + }, + { + "epoch": 1.9011163607475854, + "grad_norm": 5.119688987731934, + "learning_rate": 1.779123428541534e-05, + "loss": 2.7223, + "step": 2841 + }, + { + "epoch": 1.9017853409708576, + "grad_norm": 4.976303577423096, + "learning_rate": 1.777258935590636e-05, + "loss": 2.7959, + "step": 2842 + }, + { + "epoch": 1.9024543211941296, + "grad_norm": 6.238039493560791, + "learning_rate": 1.7753948811967004e-05, + "loss": 2.799, + "step": 2843 + }, + { + "epoch": 1.9031233014174018, + "grad_norm": 6.58221960067749, + "learning_rate": 1.7735312664908306e-05, + "loss": 2.6526, + "step": 2844 + }, + { + "epoch": 1.903792281640674, + "grad_norm": 5.450275897979736, + "learning_rate": 1.7716680926038598e-05, + "loss": 2.8581, + "step": 2845 + }, + { + "epoch": 1.904461261863946, + "grad_norm": 5.318499565124512, + "learning_rate": 1.7698053606663585e-05, + "loss": 2.5682, + "step": 2846 + }, + { + "epoch": 1.9051302420872183, + "grad_norm": 5.3904194831848145, + "learning_rate": 1.7679430718086243e-05, + "loss": 2.5502, + "step": 2847 + }, + { + "epoch": 1.9057992223104905, + "grad_norm": 6.697218418121338, + "learning_rate": 1.7660812271606896e-05, + "loss": 2.8885, + "step": 2848 + }, + { + "epoch": 1.9064682025337625, + "grad_norm": 3.4873507022857666, + "learning_rate": 1.764219827852315e-05, + "loss": 2.3108, + "step": 2849 + }, + { + "epoch": 1.9071371827570347, + "grad_norm": 6.173823833465576, + "learning_rate": 1.762358875012992e-05, + "loss": 2.8236, + "step": 2850 + }, + { + "epoch": 1.907806162980307, + "grad_norm": 6.896760940551758, + "learning_rate": 1.7604983697719408e-05, + "loss": 2.75, + "step": 2851 + }, + { + "epoch": 1.908475143203579, + "grad_norm": 4.93982458114624, + "learning_rate": 1.75863831325811e-05, + "loss": 2.6241, + "step": 2852 + }, + { + "epoch": 1.9091441234268514, + "grad_norm": 2.976346492767334, + "learning_rate": 1.7567787066001752e-05, + "loss": 3.0217, + "step": 2853 + }, + { + "epoch": 1.9098131036501234, + "grad_norm": 6.784980297088623, + "learning_rate": 1.7549195509265408e-05, + "loss": 2.7611, + "step": 2854 + }, + { + "epoch": 1.9104820838733954, + "grad_norm": 4.974246025085449, + "learning_rate": 1.7530608473653367e-05, + "loss": 2.6508, + "step": 2855 + }, + { + "epoch": 1.9111510640966678, + "grad_norm": 5.458065986633301, + "learning_rate": 1.7512025970444173e-05, + "loss": 2.7313, + "step": 2856 + }, + { + "epoch": 1.9118200443199398, + "grad_norm": 5.316771507263184, + "learning_rate": 1.7493448010913625e-05, + "loss": 2.5354, + "step": 2857 + }, + { + "epoch": 1.9124890245432118, + "grad_norm": 4.226136684417725, + "learning_rate": 1.747487460633479e-05, + "loss": 2.5317, + "step": 2858 + }, + { + "epoch": 1.9131580047664842, + "grad_norm": 3.920102834701538, + "learning_rate": 1.745630576797793e-05, + "loss": 2.621, + "step": 2859 + }, + { + "epoch": 1.9138269849897562, + "grad_norm": 4.785923480987549, + "learning_rate": 1.743774150711057e-05, + "loss": 2.5633, + "step": 2860 + }, + { + "epoch": 1.9144959652130282, + "grad_norm": 4.950382709503174, + "learning_rate": 1.7419181834997435e-05, + "loss": 2.4897, + "step": 2861 + }, + { + "epoch": 1.9151649454363007, + "grad_norm": 5.032035827636719, + "learning_rate": 1.740062676290048e-05, + "loss": 2.8504, + "step": 2862 + }, + { + "epoch": 1.9158339256595727, + "grad_norm": 5.08629035949707, + "learning_rate": 1.738207630207886e-05, + "loss": 2.7167, + "step": 2863 + }, + { + "epoch": 1.9165029058828449, + "grad_norm": 6.729982852935791, + "learning_rate": 1.736353046378894e-05, + "loss": 3.1568, + "step": 2864 + }, + { + "epoch": 1.917171886106117, + "grad_norm": 4.999177932739258, + "learning_rate": 1.7344989259284267e-05, + "loss": 2.5559, + "step": 2865 + }, + { + "epoch": 1.917840866329389, + "grad_norm": 5.187335014343262, + "learning_rate": 1.7326452699815602e-05, + "loss": 2.8355, + "step": 2866 + }, + { + "epoch": 1.9185098465526613, + "grad_norm": 6.359440803527832, + "learning_rate": 1.730792079663084e-05, + "loss": 2.5979, + "step": 2867 + }, + { + "epoch": 1.9191788267759335, + "grad_norm": 6.776169300079346, + "learning_rate": 1.7289393560975113e-05, + "loss": 2.8677, + "step": 2868 + }, + { + "epoch": 1.9198478069992055, + "grad_norm": 8.18869400024414, + "learning_rate": 1.7270871004090663e-05, + "loss": 2.8048, + "step": 2869 + }, + { + "epoch": 1.9205167872224778, + "grad_norm": 5.368067741394043, + "learning_rate": 1.7252353137216938e-05, + "loss": 2.7922, + "step": 2870 + }, + { + "epoch": 1.92118576744575, + "grad_norm": 4.105527877807617, + "learning_rate": 1.72338399715905e-05, + "loss": 2.6717, + "step": 2871 + }, + { + "epoch": 1.921854747669022, + "grad_norm": 5.149494647979736, + "learning_rate": 1.7215331518445095e-05, + "loss": 2.5812, + "step": 2872 + }, + { + "epoch": 1.9225237278922942, + "grad_norm": 5.233162879943848, + "learning_rate": 1.7196827789011585e-05, + "loss": 2.6188, + "step": 2873 + }, + { + "epoch": 1.9231927081155664, + "grad_norm": 5.364161014556885, + "learning_rate": 1.7178328794517983e-05, + "loss": 2.7196, + "step": 2874 + }, + { + "epoch": 1.9238616883388384, + "grad_norm": 6.9434003829956055, + "learning_rate": 1.71598345461894e-05, + "loss": 2.8359, + "step": 2875 + }, + { + "epoch": 1.9245306685621106, + "grad_norm": 5.348224639892578, + "learning_rate": 1.7141345055248108e-05, + "loss": 2.5852, + "step": 2876 + }, + { + "epoch": 1.9251996487853829, + "grad_norm": 5.631481647491455, + "learning_rate": 1.712286033291346e-05, + "loss": 3.0714, + "step": 2877 + }, + { + "epoch": 1.9258686290086549, + "grad_norm": 5.012246131896973, + "learning_rate": 1.7104380390401938e-05, + "loss": 2.6516, + "step": 2878 + }, + { + "epoch": 1.926537609231927, + "grad_norm": 4.643505573272705, + "learning_rate": 1.7085905238927085e-05, + "loss": 2.8997, + "step": 2879 + }, + { + "epoch": 1.9272065894551993, + "grad_norm": 5.0478057861328125, + "learning_rate": 1.7067434889699598e-05, + "loss": 2.5784, + "step": 2880 + }, + { + "epoch": 1.9278755696784713, + "grad_norm": 4.843696117401123, + "learning_rate": 1.7048969353927195e-05, + "loss": 2.5817, + "step": 2881 + }, + { + "epoch": 1.9285445499017435, + "grad_norm": 4.8132710456848145, + "learning_rate": 1.703050864281473e-05, + "loss": 2.6692, + "step": 2882 + }, + { + "epoch": 1.9292135301250157, + "grad_norm": 5.918645858764648, + "learning_rate": 1.701205276756408e-05, + "loss": 2.5419, + "step": 2883 + }, + { + "epoch": 1.9298825103482877, + "grad_norm": 5.2134833335876465, + "learning_rate": 1.699360173937423e-05, + "loss": 2.7145, + "step": 2884 + }, + { + "epoch": 1.93055149057156, + "grad_norm": 5.388915538787842, + "learning_rate": 1.6975155569441192e-05, + "loss": 2.7019, + "step": 2885 + }, + { + "epoch": 1.9312204707948322, + "grad_norm": 5.895727634429932, + "learning_rate": 1.6956714268958058e-05, + "loss": 2.503, + "step": 2886 + }, + { + "epoch": 1.9318894510181042, + "grad_norm": 4.27496862411499, + "learning_rate": 1.6938277849114928e-05, + "loss": 2.6144, + "step": 2887 + }, + { + "epoch": 1.9325584312413764, + "grad_norm": 6.133200645446777, + "learning_rate": 1.6919846321098982e-05, + "loss": 2.8895, + "step": 2888 + }, + { + "epoch": 1.9332274114646486, + "grad_norm": 5.87167501449585, + "learning_rate": 1.6901419696094396e-05, + "loss": 2.9797, + "step": 2889 + }, + { + "epoch": 1.9338963916879206, + "grad_norm": 7.774203777313232, + "learning_rate": 1.6882997985282405e-05, + "loss": 2.548, + "step": 2890 + }, + { + "epoch": 1.9345653719111928, + "grad_norm": 4.060307025909424, + "learning_rate": 1.6864581199841228e-05, + "loss": 2.7134, + "step": 2891 + }, + { + "epoch": 1.935234352134465, + "grad_norm": 5.368431568145752, + "learning_rate": 1.684616935094613e-05, + "loss": 2.4984, + "step": 2892 + }, + { + "epoch": 1.935903332357737, + "grad_norm": 4.65740442276001, + "learning_rate": 1.682776244976933e-05, + "loss": 2.7124, + "step": 2893 + }, + { + "epoch": 1.9365723125810095, + "grad_norm": 4.4884843826293945, + "learning_rate": 1.680936050748011e-05, + "loss": 2.649, + "step": 2894 + }, + { + "epoch": 1.9372412928042815, + "grad_norm": 7.604023456573486, + "learning_rate": 1.6790963535244698e-05, + "loss": 2.4289, + "step": 2895 + }, + { + "epoch": 1.9379102730275535, + "grad_norm": 5.809898376464844, + "learning_rate": 1.6772571544226312e-05, + "loss": 2.8822, + "step": 2896 + }, + { + "epoch": 1.938579253250826, + "grad_norm": 5.104942321777344, + "learning_rate": 1.6754184545585152e-05, + "loss": 2.7296, + "step": 2897 + }, + { + "epoch": 1.939248233474098, + "grad_norm": 3.730175495147705, + "learning_rate": 1.6735802550478407e-05, + "loss": 2.4977, + "step": 2898 + }, + { + "epoch": 1.93991721369737, + "grad_norm": 5.256652355194092, + "learning_rate": 1.6717425570060192e-05, + "loss": 2.6609, + "step": 2899 + }, + { + "epoch": 1.9405861939206424, + "grad_norm": 7.092987060546875, + "learning_rate": 1.6699053615481616e-05, + "loss": 2.7562, + "step": 2900 + }, + { + "epoch": 1.9412551741439144, + "grad_norm": 6.14080810546875, + "learning_rate": 1.6680686697890706e-05, + "loss": 2.9019, + "step": 2901 + }, + { + "epoch": 1.9419241543671866, + "grad_norm": 3.7073490619659424, + "learning_rate": 1.6662324828432467e-05, + "loss": 2.3301, + "step": 2902 + }, + { + "epoch": 1.9425931345904588, + "grad_norm": 4.069120407104492, + "learning_rate": 1.6643968018248808e-05, + "loss": 2.5039, + "step": 2903 + }, + { + "epoch": 1.9432621148137308, + "grad_norm": 5.932475566864014, + "learning_rate": 1.662561627847859e-05, + "loss": 2.5813, + "step": 2904 + }, + { + "epoch": 1.943931095037003, + "grad_norm": 4.354176044464111, + "learning_rate": 1.6607269620257583e-05, + "loss": 2.5891, + "step": 2905 + }, + { + "epoch": 1.9446000752602752, + "grad_norm": 5.194872856140137, + "learning_rate": 1.6588928054718494e-05, + "loss": 3.0809, + "step": 2906 + }, + { + "epoch": 1.9452690554835472, + "grad_norm": 6.054746627807617, + "learning_rate": 1.6570591592990913e-05, + "loss": 2.9886, + "step": 2907 + }, + { + "epoch": 1.9459380357068194, + "grad_norm": 3.42716383934021, + "learning_rate": 1.6552260246201352e-05, + "loss": 2.6991, + "step": 2908 + }, + { + "epoch": 1.9466070159300917, + "grad_norm": 4.277167320251465, + "learning_rate": 1.6533934025473212e-05, + "loss": 2.5809, + "step": 2909 + }, + { + "epoch": 1.9472759961533637, + "grad_norm": 6.264553546905518, + "learning_rate": 1.6515612941926788e-05, + "loss": 2.94, + "step": 2910 + }, + { + "epoch": 1.9479449763766359, + "grad_norm": 6.1438517570495605, + "learning_rate": 1.649729700667924e-05, + "loss": 2.3862, + "step": 2911 + }, + { + "epoch": 1.948613956599908, + "grad_norm": 5.2817301750183105, + "learning_rate": 1.6478986230844645e-05, + "loss": 2.5003, + "step": 2912 + }, + { + "epoch": 1.94928293682318, + "grad_norm": 4.638598442077637, + "learning_rate": 1.6460680625533904e-05, + "loss": 2.4736, + "step": 2913 + }, + { + "epoch": 1.9499519170464523, + "grad_norm": 3.7549145221710205, + "learning_rate": 1.644238020185481e-05, + "loss": 2.7409, + "step": 2914 + }, + { + "epoch": 1.9506208972697245, + "grad_norm": 4.2682952880859375, + "learning_rate": 1.6424084970911984e-05, + "loss": 2.5337, + "step": 2915 + }, + { + "epoch": 1.9512898774929965, + "grad_norm": 4.557698726654053, + "learning_rate": 1.6405794943806934e-05, + "loss": 2.5818, + "step": 2916 + }, + { + "epoch": 1.9519588577162688, + "grad_norm": 6.678733825683594, + "learning_rate": 1.6387510131637982e-05, + "loss": 2.7797, + "step": 2917 + }, + { + "epoch": 1.952627837939541, + "grad_norm": 5.692575931549072, + "learning_rate": 1.63692305455003e-05, + "loss": 2.7032, + "step": 2918 + }, + { + "epoch": 1.953296818162813, + "grad_norm": 5.498258590698242, + "learning_rate": 1.6350956196485856e-05, + "loss": 2.7007, + "step": 2919 + }, + { + "epoch": 1.9539657983860852, + "grad_norm": 4.76276159286499, + "learning_rate": 1.6332687095683503e-05, + "loss": 2.7379, + "step": 2920 + }, + { + "epoch": 1.9546347786093574, + "grad_norm": 6.434892654418945, + "learning_rate": 1.6314423254178847e-05, + "loss": 2.6362, + "step": 2921 + }, + { + "epoch": 1.9553037588326294, + "grad_norm": 9.089195251464844, + "learning_rate": 1.6296164683054345e-05, + "loss": 2.9048, + "step": 2922 + }, + { + "epoch": 1.9559727390559016, + "grad_norm": 6.396719932556152, + "learning_rate": 1.6277911393389218e-05, + "loss": 2.7807, + "step": 2923 + }, + { + "epoch": 1.9566417192791739, + "grad_norm": 4.648181915283203, + "learning_rate": 1.6259663396259528e-05, + "loss": 3.1269, + "step": 2924 + }, + { + "epoch": 1.9573106995024458, + "grad_norm": 5.44894552230835, + "learning_rate": 1.6241420702738088e-05, + "loss": 2.6917, + "step": 2925 + }, + { + "epoch": 1.957979679725718, + "grad_norm": 8.427546501159668, + "learning_rate": 1.622318332389451e-05, + "loss": 3.0648, + "step": 2926 + }, + { + "epoch": 1.9586486599489903, + "grad_norm": 5.089845657348633, + "learning_rate": 1.620495127079517e-05, + "loss": 2.6871, + "step": 2927 + }, + { + "epoch": 1.9593176401722623, + "grad_norm": 5.062190532684326, + "learning_rate": 1.618672455450324e-05, + "loss": 2.7424, + "step": 2928 + }, + { + "epoch": 1.9599866203955345, + "grad_norm": 4.900814533233643, + "learning_rate": 1.6168503186078598e-05, + "loss": 2.5565, + "step": 2929 + }, + { + "epoch": 1.9606556006188067, + "grad_norm": 5.936448097229004, + "learning_rate": 1.6150287176577948e-05, + "loss": 2.6673, + "step": 2930 + }, + { + "epoch": 1.9613245808420787, + "grad_norm": 7.32464599609375, + "learning_rate": 1.613207653705468e-05, + "loss": 3.0643, + "step": 2931 + }, + { + "epoch": 1.9619935610653512, + "grad_norm": 5.269471645355225, + "learning_rate": 1.6113871278558974e-05, + "loss": 2.6852, + "step": 2932 + }, + { + "epoch": 1.9626625412886232, + "grad_norm": 4.94673490524292, + "learning_rate": 1.6095671412137694e-05, + "loss": 2.6746, + "step": 2933 + }, + { + "epoch": 1.9633315215118952, + "grad_norm": 4.557250022888184, + "learning_rate": 1.607747694883449e-05, + "loss": 2.6577, + "step": 2934 + }, + { + "epoch": 1.9640005017351676, + "grad_norm": 3.836230993270874, + "learning_rate": 1.6059287899689684e-05, + "loss": 2.6077, + "step": 2935 + }, + { + "epoch": 1.9646694819584396, + "grad_norm": 5.7456278800964355, + "learning_rate": 1.604110427574035e-05, + "loss": 2.7519, + "step": 2936 + }, + { + "epoch": 1.9653384621817116, + "grad_norm": 4.406337261199951, + "learning_rate": 1.6022926088020228e-05, + "loss": 2.7272, + "step": 2937 + }, + { + "epoch": 1.966007442404984, + "grad_norm": 4.271851539611816, + "learning_rate": 1.6004753347559808e-05, + "loss": 2.588, + "step": 2938 + }, + { + "epoch": 1.966676422628256, + "grad_norm": 4.202340602874756, + "learning_rate": 1.5986586065386243e-05, + "loss": 2.5318, + "step": 2939 + }, + { + "epoch": 1.967345402851528, + "grad_norm": 4.201272010803223, + "learning_rate": 1.5968424252523378e-05, + "loss": 2.6647, + "step": 2940 + }, + { + "epoch": 1.9680143830748005, + "grad_norm": 4.757193088531494, + "learning_rate": 1.595026791999174e-05, + "loss": 2.6625, + "step": 2941 + }, + { + "epoch": 1.9686833632980725, + "grad_norm": 4.6923112869262695, + "learning_rate": 1.5932117078808544e-05, + "loss": 2.7788, + "step": 2942 + }, + { + "epoch": 1.9693523435213447, + "grad_norm": 5.724450588226318, + "learning_rate": 1.5913971739987655e-05, + "loss": 2.7372, + "step": 2943 + }, + { + "epoch": 1.970021323744617, + "grad_norm": 6.370497703552246, + "learning_rate": 1.5895831914539616e-05, + "loss": 2.6868, + "step": 2944 + }, + { + "epoch": 1.970690303967889, + "grad_norm": 5.791628360748291, + "learning_rate": 1.587769761347159e-05, + "loss": 2.8062, + "step": 2945 + }, + { + "epoch": 1.9713592841911611, + "grad_norm": 5.352672576904297, + "learning_rate": 1.585956884778745e-05, + "loss": 2.7399, + "step": 2946 + }, + { + "epoch": 1.9720282644144334, + "grad_norm": 8.139093399047852, + "learning_rate": 1.584144562848764e-05, + "loss": 3.0253, + "step": 2947 + }, + { + "epoch": 1.9726972446377053, + "grad_norm": 6.4604411125183105, + "learning_rate": 1.582332796656929e-05, + "loss": 2.879, + "step": 2948 + }, + { + "epoch": 1.9733662248609776, + "grad_norm": 4.963821887969971, + "learning_rate": 1.5805215873026125e-05, + "loss": 2.6953, + "step": 2949 + }, + { + "epoch": 1.9740352050842498, + "grad_norm": 4.356338024139404, + "learning_rate": 1.5787109358848528e-05, + "loss": 2.6261, + "step": 2950 + }, + { + "epoch": 1.9747041853075218, + "grad_norm": 4.943756103515625, + "learning_rate": 1.5769008435023446e-05, + "loss": 2.6896, + "step": 2951 + }, + { + "epoch": 1.975373165530794, + "grad_norm": 7.068074703216553, + "learning_rate": 1.575091311253448e-05, + "loss": 2.886, + "step": 2952 + }, + { + "epoch": 1.9760421457540662, + "grad_norm": 4.81316614151001, + "learning_rate": 1.573282340236181e-05, + "loss": 2.5914, + "step": 2953 + }, + { + "epoch": 1.9767111259773382, + "grad_norm": 5.508342742919922, + "learning_rate": 1.571473931548222e-05, + "loss": 2.5439, + "step": 2954 + }, + { + "epoch": 1.9773801062006104, + "grad_norm": 7.063830375671387, + "learning_rate": 1.5696660862869057e-05, + "loss": 2.738, + "step": 2955 + }, + { + "epoch": 1.9780490864238827, + "grad_norm": 7.030398845672607, + "learning_rate": 1.567858805549229e-05, + "loss": 2.8256, + "step": 2956 + }, + { + "epoch": 1.9787180666471547, + "grad_norm": 5.586159706115723, + "learning_rate": 1.5660520904318424e-05, + "loss": 2.6369, + "step": 2957 + }, + { + "epoch": 1.9793870468704269, + "grad_norm": 5.897188663482666, + "learning_rate": 1.564245942031056e-05, + "loss": 2.6623, + "step": 2958 + }, + { + "epoch": 1.980056027093699, + "grad_norm": 5.793785095214844, + "learning_rate": 1.5624403614428336e-05, + "loss": 2.8337, + "step": 2959 + }, + { + "epoch": 1.980725007316971, + "grad_norm": 6.084945201873779, + "learning_rate": 1.5606353497627972e-05, + "loss": 2.88, + "step": 2960 + }, + { + "epoch": 1.9813939875402433, + "grad_norm": 7.513318061828613, + "learning_rate": 1.5588309080862216e-05, + "loss": 2.883, + "step": 2961 + }, + { + "epoch": 1.9820629677635155, + "grad_norm": 5.256368637084961, + "learning_rate": 1.5570270375080362e-05, + "loss": 2.8528, + "step": 2962 + }, + { + "epoch": 1.9827319479867875, + "grad_norm": 5.060674667358398, + "learning_rate": 1.5552237391228226e-05, + "loss": 2.6451, + "step": 2963 + }, + { + "epoch": 1.9834009282100598, + "grad_norm": 4.594638347625732, + "learning_rate": 1.553421014024819e-05, + "loss": 2.7623, + "step": 2964 + }, + { + "epoch": 1.984069908433332, + "grad_norm": 4.919723033905029, + "learning_rate": 1.551618863307911e-05, + "loss": 2.6493, + "step": 2965 + }, + { + "epoch": 1.984738888656604, + "grad_norm": 5.47368049621582, + "learning_rate": 1.549817288065639e-05, + "loss": 2.8822, + "step": 2966 + }, + { + "epoch": 1.9854078688798762, + "grad_norm": 5.195687770843506, + "learning_rate": 1.5480162893911922e-05, + "loss": 2.6127, + "step": 2967 + }, + { + "epoch": 1.9860768491031484, + "grad_norm": 4.345044136047363, + "learning_rate": 1.5462158683774124e-05, + "loss": 2.6003, + "step": 2968 + }, + { + "epoch": 1.9867458293264204, + "grad_norm": 6.155688285827637, + "learning_rate": 1.544416026116788e-05, + "loss": 2.831, + "step": 2969 + }, + { + "epoch": 1.9874148095496929, + "grad_norm": 5.565713405609131, + "learning_rate": 1.5426167637014582e-05, + "loss": 2.5097, + "step": 2970 + }, + { + "epoch": 1.9880837897729648, + "grad_norm": 6.924679279327393, + "learning_rate": 1.5408180822232088e-05, + "loss": 3.001, + "step": 2971 + }, + { + "epoch": 1.9887527699962368, + "grad_norm": 5.707123279571533, + "learning_rate": 1.5390199827734746e-05, + "loss": 2.6601, + "step": 2972 + }, + { + "epoch": 1.9894217502195093, + "grad_norm": 3.676952838897705, + "learning_rate": 1.537222466443336e-05, + "loss": 2.5597, + "step": 2973 + }, + { + "epoch": 1.9900907304427813, + "grad_norm": 7.1246867179870605, + "learning_rate": 1.5354255343235216e-05, + "loss": 2.8476, + "step": 2974 + }, + { + "epoch": 1.9907597106660533, + "grad_norm": 5.087657451629639, + "learning_rate": 1.5336291875044025e-05, + "loss": 2.7412, + "step": 2975 + }, + { + "epoch": 1.9914286908893257, + "grad_norm": 6.350460529327393, + "learning_rate": 1.5318334270759972e-05, + "loss": 2.6847, + "step": 2976 + }, + { + "epoch": 1.9920976711125977, + "grad_norm": 5.982015132904053, + "learning_rate": 1.5300382541279658e-05, + "loss": 2.8279, + "step": 2977 + }, + { + "epoch": 1.9927666513358697, + "grad_norm": 4.596728801727295, + "learning_rate": 1.528243669749616e-05, + "loss": 2.6684, + "step": 2978 + }, + { + "epoch": 1.9934356315591422, + "grad_norm": 4.809084415435791, + "learning_rate": 1.526449675029894e-05, + "loss": 2.8206, + "step": 2979 + }, + { + "epoch": 1.9941046117824142, + "grad_norm": 5.641161918640137, + "learning_rate": 1.5246562710573908e-05, + "loss": 2.9301, + "step": 2980 + }, + { + "epoch": 1.9947735920056864, + "grad_norm": 9.615150451660156, + "learning_rate": 1.5228634589203367e-05, + "loss": 3.0272, + "step": 2981 + }, + { + "epoch": 1.9954425722289586, + "grad_norm": 5.482935905456543, + "learning_rate": 1.521071239706607e-05, + "loss": 2.9605, + "step": 2982 + }, + { + "epoch": 1.9961115524522306, + "grad_norm": 6.808692932128906, + "learning_rate": 1.5192796145037125e-05, + "loss": 2.6895, + "step": 2983 + }, + { + "epoch": 1.9967805326755028, + "grad_norm": 5.924152851104736, + "learning_rate": 1.5174885843988069e-05, + "loss": 2.6848, + "step": 2984 + }, + { + "epoch": 1.997449512898775, + "grad_norm": 5.037985324859619, + "learning_rate": 1.5156981504786798e-05, + "loss": 2.6335, + "step": 2985 + }, + { + "epoch": 1.998118493122047, + "grad_norm": 7.344898700714111, + "learning_rate": 1.5139083138297633e-05, + "loss": 3.1455, + "step": 2986 + }, + { + "epoch": 1.9987874733453193, + "grad_norm": 5.326211929321289, + "learning_rate": 1.512119075538122e-05, + "loss": 2.7672, + "step": 2987 + }, + { + "epoch": 1.9994564535685915, + "grad_norm": 4.72623348236084, + "learning_rate": 1.5103304366894622e-05, + "loss": 2.7863, + "step": 2988 + }, + { + "epoch": 2.000668980223272, + "grad_norm": 12.768083572387695, + "learning_rate": 1.508542398369122e-05, + "loss": 5.1041, + "step": 2989 + }, + { + "epoch": 2.0013379604465444, + "grad_norm": 6.635680675506592, + "learning_rate": 1.506754961662079e-05, + "loss": 2.663, + "step": 2990 + }, + { + "epoch": 2.0020069406698164, + "grad_norm": 6.694709300994873, + "learning_rate": 1.5049681276529437e-05, + "loss": 2.4626, + "step": 2991 + }, + { + "epoch": 2.0026759208930884, + "grad_norm": 4.822940349578857, + "learning_rate": 1.503181897425961e-05, + "loss": 2.4768, + "step": 2992 + }, + { + "epoch": 2.003344901116361, + "grad_norm": 5.400036334991455, + "learning_rate": 1.5013962720650095e-05, + "loss": 2.3476, + "step": 2993 + }, + { + "epoch": 2.004013881339633, + "grad_norm": 6.009514331817627, + "learning_rate": 1.4996112526536019e-05, + "loss": 2.4499, + "step": 2994 + }, + { + "epoch": 2.004682861562905, + "grad_norm": 5.883105754852295, + "learning_rate": 1.4978268402748802e-05, + "loss": 2.5064, + "step": 2995 + }, + { + "epoch": 2.0053518417861773, + "grad_norm": 5.894524574279785, + "learning_rate": 1.4960430360116229e-05, + "loss": 2.4421, + "step": 2996 + }, + { + "epoch": 2.0060208220094493, + "grad_norm": 5.011678218841553, + "learning_rate": 1.4942598409462343e-05, + "loss": 2.2602, + "step": 2997 + }, + { + "epoch": 2.0066898022327213, + "grad_norm": 3.9589545726776123, + "learning_rate": 1.4924772561607537e-05, + "loss": 2.3687, + "step": 2998 + }, + { + "epoch": 2.0073587824559938, + "grad_norm": 5.826911449432373, + "learning_rate": 1.4906952827368447e-05, + "loss": 2.766, + "step": 2999 + }, + { + "epoch": 2.0080277626792657, + "grad_norm": 6.351194858551025, + "learning_rate": 1.4889139217558066e-05, + "loss": 2.0096, + "step": 3000 + }, + { + "epoch": 2.0086967429025377, + "grad_norm": 6.570158004760742, + "learning_rate": 1.4871331742985611e-05, + "loss": 2.5124, + "step": 3001 + }, + { + "epoch": 2.00936572312581, + "grad_norm": 5.056060314178467, + "learning_rate": 1.4853530414456612e-05, + "loss": 2.5826, + "step": 3002 + }, + { + "epoch": 2.010034703349082, + "grad_norm": 4.386926174163818, + "learning_rate": 1.4835735242772846e-05, + "loss": 2.1595, + "step": 3003 + }, + { + "epoch": 2.010703683572354, + "grad_norm": 6.453482627868652, + "learning_rate": 1.4817946238732389e-05, + "loss": 2.5977, + "step": 3004 + }, + { + "epoch": 2.0113726637956266, + "grad_norm": 5.471908092498779, + "learning_rate": 1.4800163413129525e-05, + "loss": 2.4666, + "step": 3005 + }, + { + "epoch": 2.0120416440188986, + "grad_norm": 6.3005194664001465, + "learning_rate": 1.478238677675484e-05, + "loss": 2.4631, + "step": 3006 + }, + { + "epoch": 2.0127106242421706, + "grad_norm": 4.162787437438965, + "learning_rate": 1.4764616340395116e-05, + "loss": 2.3307, + "step": 3007 + }, + { + "epoch": 2.013379604465443, + "grad_norm": 5.53495454788208, + "learning_rate": 1.4746852114833415e-05, + "loss": 2.4042, + "step": 3008 + }, + { + "epoch": 2.014048584688715, + "grad_norm": 6.692617416381836, + "learning_rate": 1.4729094110849004e-05, + "loss": 2.4228, + "step": 3009 + }, + { + "epoch": 2.0147175649119875, + "grad_norm": 6.387679576873779, + "learning_rate": 1.4711342339217387e-05, + "loss": 2.4697, + "step": 3010 + }, + { + "epoch": 2.0153865451352595, + "grad_norm": 7.4439826011657715, + "learning_rate": 1.4693596810710276e-05, + "loss": 2.494, + "step": 3011 + }, + { + "epoch": 2.0160555253585315, + "grad_norm": 6.417651176452637, + "learning_rate": 1.4675857536095616e-05, + "loss": 2.6192, + "step": 3012 + }, + { + "epoch": 2.016724505581804, + "grad_norm": 6.095968246459961, + "learning_rate": 1.4658124526137517e-05, + "loss": 2.5938, + "step": 3013 + }, + { + "epoch": 2.017393485805076, + "grad_norm": 6.1476335525512695, + "learning_rate": 1.4640397791596336e-05, + "loss": 2.3391, + "step": 3014 + }, + { + "epoch": 2.018062466028348, + "grad_norm": 4.672789573669434, + "learning_rate": 1.4622677343228586e-05, + "loss": 2.55, + "step": 3015 + }, + { + "epoch": 2.0187314462516204, + "grad_norm": 6.0801591873168945, + "learning_rate": 1.4604963191786986e-05, + "loss": 2.5872, + "step": 3016 + }, + { + "epoch": 2.0194004264748924, + "grad_norm": 4.310150146484375, + "learning_rate": 1.4587255348020415e-05, + "loss": 2.0883, + "step": 3017 + }, + { + "epoch": 2.0200694066981644, + "grad_norm": 5.962727069854736, + "learning_rate": 1.4569553822673948e-05, + "loss": 2.4211, + "step": 3018 + }, + { + "epoch": 2.020738386921437, + "grad_norm": 5.8133978843688965, + "learning_rate": 1.45518586264888e-05, + "loss": 2.3678, + "step": 3019 + }, + { + "epoch": 2.021407367144709, + "grad_norm": 6.652718544006348, + "learning_rate": 1.4534169770202377e-05, + "loss": 2.1991, + "step": 3020 + }, + { + "epoch": 2.022076347367981, + "grad_norm": 4.784534454345703, + "learning_rate": 1.4516487264548207e-05, + "loss": 2.2542, + "step": 3021 + }, + { + "epoch": 2.0227453275912533, + "grad_norm": 5.265605449676514, + "learning_rate": 1.4498811120255984e-05, + "loss": 2.3364, + "step": 3022 + }, + { + "epoch": 2.0234143078145252, + "grad_norm": 5.091157913208008, + "learning_rate": 1.448114134805153e-05, + "loss": 2.2092, + "step": 3023 + }, + { + "epoch": 2.0240832880377972, + "grad_norm": 4.364589214324951, + "learning_rate": 1.4463477958656818e-05, + "loss": 2.1874, + "step": 3024 + }, + { + "epoch": 2.0247522682610697, + "grad_norm": 6.272817611694336, + "learning_rate": 1.444582096278993e-05, + "loss": 2.5776, + "step": 3025 + }, + { + "epoch": 2.0254212484843417, + "grad_norm": 5.4384870529174805, + "learning_rate": 1.4428170371165067e-05, + "loss": 2.529, + "step": 3026 + }, + { + "epoch": 2.0260902287076137, + "grad_norm": 6.760064601898193, + "learning_rate": 1.4410526194492552e-05, + "loss": 2.5601, + "step": 3027 + }, + { + "epoch": 2.026759208930886, + "grad_norm": 5.941396713256836, + "learning_rate": 1.439288844347883e-05, + "loss": 2.3452, + "step": 3028 + }, + { + "epoch": 2.027428189154158, + "grad_norm": 4.927445411682129, + "learning_rate": 1.4375257128826413e-05, + "loss": 2.3153, + "step": 3029 + }, + { + "epoch": 2.02809716937743, + "grad_norm": 7.796842098236084, + "learning_rate": 1.4357632261233945e-05, + "loss": 2.2947, + "step": 3030 + }, + { + "epoch": 2.0287661496007026, + "grad_norm": 4.760647773742676, + "learning_rate": 1.4340013851396116e-05, + "loss": 2.4223, + "step": 3031 + }, + { + "epoch": 2.0294351298239746, + "grad_norm": 5.804800033569336, + "learning_rate": 1.4322401910003746e-05, + "loss": 2.298, + "step": 3032 + }, + { + "epoch": 2.0301041100472466, + "grad_norm": 6.583136558532715, + "learning_rate": 1.4304796447743693e-05, + "loss": 2.6659, + "step": 3033 + }, + { + "epoch": 2.030773090270519, + "grad_norm": 5.0297627449035645, + "learning_rate": 1.4287197475298894e-05, + "loss": 2.4211, + "step": 3034 + }, + { + "epoch": 2.031442070493791, + "grad_norm": 4.208191394805908, + "learning_rate": 1.4269605003348342e-05, + "loss": 2.1407, + "step": 3035 + }, + { + "epoch": 2.032111050717063, + "grad_norm": 4.9699554443359375, + "learning_rate": 1.4252019042567106e-05, + "loss": 2.5686, + "step": 3036 + }, + { + "epoch": 2.0327800309403354, + "grad_norm": 4.596959590911865, + "learning_rate": 1.4234439603626279e-05, + "loss": 2.231, + "step": 3037 + }, + { + "epoch": 2.0334490111636074, + "grad_norm": 6.562058925628662, + "learning_rate": 1.4216866697193024e-05, + "loss": 2.407, + "step": 3038 + }, + { + "epoch": 2.0341179913868794, + "grad_norm": 7.146629810333252, + "learning_rate": 1.4199300333930515e-05, + "loss": 2.3036, + "step": 3039 + }, + { + "epoch": 2.034786971610152, + "grad_norm": 6.620941638946533, + "learning_rate": 1.418174052449796e-05, + "loss": 2.2592, + "step": 3040 + }, + { + "epoch": 2.035455951833424, + "grad_norm": 4.3330559730529785, + "learning_rate": 1.4164187279550595e-05, + "loss": 2.2991, + "step": 3041 + }, + { + "epoch": 2.036124932056696, + "grad_norm": 5.615035533905029, + "learning_rate": 1.4146640609739686e-05, + "loss": 2.5124, + "step": 3042 + }, + { + "epoch": 2.0367939122799683, + "grad_norm": 5.588582515716553, + "learning_rate": 1.412910052571248e-05, + "loss": 2.5165, + "step": 3043 + }, + { + "epoch": 2.0374628925032403, + "grad_norm": 7.676965236663818, + "learning_rate": 1.4111567038112258e-05, + "loss": 2.484, + "step": 3044 + }, + { + "epoch": 2.0381318727265123, + "grad_norm": 6.182879447937012, + "learning_rate": 1.409404015757827e-05, + "loss": 2.2323, + "step": 3045 + }, + { + "epoch": 2.0388008529497847, + "grad_norm": 5.271181106567383, + "learning_rate": 1.4076519894745788e-05, + "loss": 2.3512, + "step": 3046 + }, + { + "epoch": 2.0394698331730567, + "grad_norm": 6.310977935791016, + "learning_rate": 1.4059006260246038e-05, + "loss": 2.5909, + "step": 3047 + }, + { + "epoch": 2.040138813396329, + "grad_norm": 7.053647041320801, + "learning_rate": 1.4041499264706246e-05, + "loss": 2.5453, + "step": 3048 + }, + { + "epoch": 2.040807793619601, + "grad_norm": 4.582780361175537, + "learning_rate": 1.4023998918749581e-05, + "loss": 2.1957, + "step": 3049 + }, + { + "epoch": 2.041476773842873, + "grad_norm": 4.66085147857666, + "learning_rate": 1.4006505232995226e-05, + "loss": 2.173, + "step": 3050 + }, + { + "epoch": 2.0421457540661456, + "grad_norm": 6.369279384613037, + "learning_rate": 1.398901821805827e-05, + "loss": 2.5652, + "step": 3051 + }, + { + "epoch": 2.0428147342894176, + "grad_norm": 6.272078514099121, + "learning_rate": 1.3971537884549796e-05, + "loss": 2.5331, + "step": 3052 + }, + { + "epoch": 2.0434837145126896, + "grad_norm": 5.896235466003418, + "learning_rate": 1.3954064243076793e-05, + "loss": 2.3149, + "step": 3053 + }, + { + "epoch": 2.044152694735962, + "grad_norm": 7.491322040557861, + "learning_rate": 1.3936597304242244e-05, + "loss": 2.5707, + "step": 3054 + }, + { + "epoch": 2.044821674959234, + "grad_norm": 5.85264778137207, + "learning_rate": 1.3919137078644995e-05, + "loss": 2.3324, + "step": 3055 + }, + { + "epoch": 2.045490655182506, + "grad_norm": 5.888369083404541, + "learning_rate": 1.3901683576879876e-05, + "loss": 2.3899, + "step": 3056 + }, + { + "epoch": 2.0461596354057785, + "grad_norm": 5.316038608551025, + "learning_rate": 1.3884236809537599e-05, + "loss": 2.5756, + "step": 3057 + }, + { + "epoch": 2.0468286156290505, + "grad_norm": 7.828998565673828, + "learning_rate": 1.3866796787204833e-05, + "loss": 2.4634, + "step": 3058 + }, + { + "epoch": 2.0474975958523225, + "grad_norm": 4.430447578430176, + "learning_rate": 1.3849363520464098e-05, + "loss": 2.4278, + "step": 3059 + }, + { + "epoch": 2.048166576075595, + "grad_norm": 4.44407844543457, + "learning_rate": 1.383193701989387e-05, + "loss": 2.4459, + "step": 3060 + }, + { + "epoch": 2.048835556298867, + "grad_norm": 4.023651123046875, + "learning_rate": 1.3814517296068486e-05, + "loss": 2.3309, + "step": 3061 + }, + { + "epoch": 2.049504536522139, + "grad_norm": 4.457033634185791, + "learning_rate": 1.3797104359558173e-05, + "loss": 2.2204, + "step": 3062 + }, + { + "epoch": 2.0501735167454114, + "grad_norm": 4.808585166931152, + "learning_rate": 1.3779698220929038e-05, + "loss": 2.5533, + "step": 3063 + }, + { + "epoch": 2.0508424969686834, + "grad_norm": 5.920096397399902, + "learning_rate": 1.3762298890743091e-05, + "loss": 2.3159, + "step": 3064 + }, + { + "epoch": 2.0515114771919554, + "grad_norm": 7.951557159423828, + "learning_rate": 1.3744906379558165e-05, + "loss": 2.7188, + "step": 3065 + }, + { + "epoch": 2.052180457415228, + "grad_norm": 5.221137046813965, + "learning_rate": 1.372752069792801e-05, + "loss": 2.442, + "step": 3066 + }, + { + "epoch": 2.0528494376385, + "grad_norm": 4.949860572814941, + "learning_rate": 1.371014185640217e-05, + "loss": 2.3144, + "step": 3067 + }, + { + "epoch": 2.053518417861772, + "grad_norm": 3.728174924850464, + "learning_rate": 1.3692769865526095e-05, + "loss": 2.135, + "step": 3068 + }, + { + "epoch": 2.0541873980850442, + "grad_norm": 5.952876567840576, + "learning_rate": 1.3675404735841046e-05, + "loss": 2.5221, + "step": 3069 + }, + { + "epoch": 2.0548563783083162, + "grad_norm": 7.346704006195068, + "learning_rate": 1.3658046477884128e-05, + "loss": 2.5614, + "step": 3070 + }, + { + "epoch": 2.0555253585315882, + "grad_norm": 6.7896857261657715, + "learning_rate": 1.3640695102188263e-05, + "loss": 2.6371, + "step": 3071 + }, + { + "epoch": 2.0561943387548607, + "grad_norm": 6.650427341461182, + "learning_rate": 1.3623350619282233e-05, + "loss": 2.4181, + "step": 3072 + }, + { + "epoch": 2.0568633189781327, + "grad_norm": 6.215967655181885, + "learning_rate": 1.3606013039690594e-05, + "loss": 2.3523, + "step": 3073 + }, + { + "epoch": 2.0575322992014047, + "grad_norm": 5.426647186279297, + "learning_rate": 1.3588682373933753e-05, + "loss": 2.3854, + "step": 3074 + }, + { + "epoch": 2.058201279424677, + "grad_norm": 6.204399585723877, + "learning_rate": 1.3571358632527881e-05, + "loss": 2.459, + "step": 3075 + }, + { + "epoch": 2.058870259647949, + "grad_norm": 8.206076622009277, + "learning_rate": 1.3554041825985e-05, + "loss": 2.497, + "step": 3076 + }, + { + "epoch": 2.059539239871221, + "grad_norm": 4.415890693664551, + "learning_rate": 1.3536731964812855e-05, + "loss": 2.3511, + "step": 3077 + }, + { + "epoch": 2.0602082200944936, + "grad_norm": 5.170623779296875, + "learning_rate": 1.3519429059515043e-05, + "loss": 2.3172, + "step": 3078 + }, + { + "epoch": 2.0608772003177656, + "grad_norm": 6.494916915893555, + "learning_rate": 1.3502133120590892e-05, + "loss": 2.3789, + "step": 3079 + }, + { + "epoch": 2.0615461805410376, + "grad_norm": 4.845317840576172, + "learning_rate": 1.348484415853554e-05, + "loss": 2.292, + "step": 3080 + }, + { + "epoch": 2.06221516076431, + "grad_norm": 6.121149063110352, + "learning_rate": 1.3467562183839855e-05, + "loss": 2.3501, + "step": 3081 + }, + { + "epoch": 2.062884140987582, + "grad_norm": 5.145164966583252, + "learning_rate": 1.3450287206990504e-05, + "loss": 2.4633, + "step": 3082 + }, + { + "epoch": 2.063553121210854, + "grad_norm": 5.962216854095459, + "learning_rate": 1.343301923846988e-05, + "loss": 2.404, + "step": 3083 + }, + { + "epoch": 2.0642221014341264, + "grad_norm": 6.056277751922607, + "learning_rate": 1.3415758288756125e-05, + "loss": 2.4574, + "step": 3084 + }, + { + "epoch": 2.0648910816573984, + "grad_norm": 3.393918514251709, + "learning_rate": 1.3398504368323122e-05, + "loss": 2.0317, + "step": 3085 + }, + { + "epoch": 2.065560061880671, + "grad_norm": 6.001176834106445, + "learning_rate": 1.338125748764052e-05, + "loss": 2.1996, + "step": 3086 + }, + { + "epoch": 2.066229042103943, + "grad_norm": 7.369204521179199, + "learning_rate": 1.3364017657173639e-05, + "loss": 2.286, + "step": 3087 + }, + { + "epoch": 2.066898022327215, + "grad_norm": 6.244478225708008, + "learning_rate": 1.3346784887383581e-05, + "loss": 2.3475, + "step": 3088 + }, + { + "epoch": 2.0675670025504873, + "grad_norm": 4.677946090698242, + "learning_rate": 1.3329559188727114e-05, + "loss": 2.4927, + "step": 3089 + }, + { + "epoch": 2.0682359827737593, + "grad_norm": 4.921236991882324, + "learning_rate": 1.3312340571656757e-05, + "loss": 2.1817, + "step": 3090 + }, + { + "epoch": 2.0689049629970313, + "grad_norm": 6.422043800354004, + "learning_rate": 1.3295129046620702e-05, + "loss": 2.3439, + "step": 3091 + }, + { + "epoch": 2.0695739432203037, + "grad_norm": 6.241523265838623, + "learning_rate": 1.3277924624062843e-05, + "loss": 2.5028, + "step": 3092 + }, + { + "epoch": 2.0702429234435757, + "grad_norm": 5.6386027336120605, + "learning_rate": 1.3260727314422766e-05, + "loss": 2.3629, + "step": 3093 + }, + { + "epoch": 2.0709119036668477, + "grad_norm": 8.21275806427002, + "learning_rate": 1.3243537128135759e-05, + "loss": 2.7467, + "step": 3094 + }, + { + "epoch": 2.07158088389012, + "grad_norm": 6.249711513519287, + "learning_rate": 1.322635407563275e-05, + "loss": 2.3824, + "step": 3095 + }, + { + "epoch": 2.072249864113392, + "grad_norm": 5.151498317718506, + "learning_rate": 1.320917816734038e-05, + "loss": 2.4113, + "step": 3096 + }, + { + "epoch": 2.072918844336664, + "grad_norm": 6.461350440979004, + "learning_rate": 1.3192009413680933e-05, + "loss": 2.5538, + "step": 3097 + }, + { + "epoch": 2.0735878245599366, + "grad_norm": 4.3360795974731445, + "learning_rate": 1.3174847825072346e-05, + "loss": 2.45, + "step": 3098 + }, + { + "epoch": 2.0742568047832086, + "grad_norm": 5.090626239776611, + "learning_rate": 1.315769341192821e-05, + "loss": 2.3227, + "step": 3099 + }, + { + "epoch": 2.0749257850064806, + "grad_norm": 5.059175491333008, + "learning_rate": 1.3140546184657785e-05, + "loss": 2.5223, + "step": 3100 + }, + { + "epoch": 2.075594765229753, + "grad_norm": 5.985151290893555, + "learning_rate": 1.3123406153665934e-05, + "loss": 2.2628, + "step": 3101 + }, + { + "epoch": 2.076263745453025, + "grad_norm": 5.502687454223633, + "learning_rate": 1.3106273329353194e-05, + "loss": 2.4181, + "step": 3102 + }, + { + "epoch": 2.076932725676297, + "grad_norm": 6.538254261016846, + "learning_rate": 1.3089147722115688e-05, + "loss": 2.5057, + "step": 3103 + }, + { + "epoch": 2.0776017058995695, + "grad_norm": 5.117298603057861, + "learning_rate": 1.3072029342345198e-05, + "loss": 2.469, + "step": 3104 + }, + { + "epoch": 2.0782706861228415, + "grad_norm": 4.5379252433776855, + "learning_rate": 1.3054918200429095e-05, + "loss": 2.4227, + "step": 3105 + }, + { + "epoch": 2.0789396663461135, + "grad_norm": 10.137548446655273, + "learning_rate": 1.3037814306750365e-05, + "loss": 2.6375, + "step": 3106 + }, + { + "epoch": 2.079608646569386, + "grad_norm": 5.899766445159912, + "learning_rate": 1.3020717671687581e-05, + "loss": 2.6206, + "step": 3107 + }, + { + "epoch": 2.080277626792658, + "grad_norm": 5.52589750289917, + "learning_rate": 1.3003628305614951e-05, + "loss": 2.5018, + "step": 3108 + }, + { + "epoch": 2.08094660701593, + "grad_norm": 5.906355381011963, + "learning_rate": 1.2986546218902229e-05, + "loss": 2.5298, + "step": 3109 + }, + { + "epoch": 2.0816155872392024, + "grad_norm": 4.823853015899658, + "learning_rate": 1.2969471421914786e-05, + "loss": 2.4524, + "step": 3110 + }, + { + "epoch": 2.0822845674624744, + "grad_norm": 6.769951820373535, + "learning_rate": 1.2952403925013534e-05, + "loss": 2.528, + "step": 3111 + }, + { + "epoch": 2.0829535476857464, + "grad_norm": 5.901849269866943, + "learning_rate": 1.2935343738555e-05, + "loss": 2.6521, + "step": 3112 + }, + { + "epoch": 2.083622527909019, + "grad_norm": 6.245223522186279, + "learning_rate": 1.2918290872891237e-05, + "loss": 2.5152, + "step": 3113 + }, + { + "epoch": 2.084291508132291, + "grad_norm": 5.069662094116211, + "learning_rate": 1.2901245338369866e-05, + "loss": 2.6033, + "step": 3114 + }, + { + "epoch": 2.084960488355563, + "grad_norm": 8.72216796875, + "learning_rate": 1.2884207145334059e-05, + "loss": 2.5902, + "step": 3115 + }, + { + "epoch": 2.0856294685788352, + "grad_norm": 5.8375444412231445, + "learning_rate": 1.2867176304122559e-05, + "loss": 2.4529, + "step": 3116 + }, + { + "epoch": 2.0862984488021072, + "grad_norm": 5.243298530578613, + "learning_rate": 1.2850152825069598e-05, + "loss": 2.5756, + "step": 3117 + }, + { + "epoch": 2.0869674290253792, + "grad_norm": 8.686291694641113, + "learning_rate": 1.2833136718504992e-05, + "loss": 2.5503, + "step": 3118 + }, + { + "epoch": 2.0876364092486517, + "grad_norm": 4.761375904083252, + "learning_rate": 1.2816127994754051e-05, + "loss": 2.4156, + "step": 3119 + }, + { + "epoch": 2.0883053894719237, + "grad_norm": 5.23760986328125, + "learning_rate": 1.2799126664137614e-05, + "loss": 2.5224, + "step": 3120 + }, + { + "epoch": 2.0889743696951957, + "grad_norm": 5.315793514251709, + "learning_rate": 1.2782132736972024e-05, + "loss": 2.2855, + "step": 3121 + }, + { + "epoch": 2.089643349918468, + "grad_norm": 7.616042613983154, + "learning_rate": 1.2765146223569157e-05, + "loss": 2.5564, + "step": 3122 + }, + { + "epoch": 2.09031233014174, + "grad_norm": 7.258033752441406, + "learning_rate": 1.2748167134236361e-05, + "loss": 2.3319, + "step": 3123 + }, + { + "epoch": 2.090981310365012, + "grad_norm": 6.943090438842773, + "learning_rate": 1.2731195479276511e-05, + "loss": 2.5598, + "step": 3124 + }, + { + "epoch": 2.0916502905882846, + "grad_norm": 5.5149245262146, + "learning_rate": 1.2714231268987934e-05, + "loss": 2.2787, + "step": 3125 + }, + { + "epoch": 2.0923192708115566, + "grad_norm": 6.838531494140625, + "learning_rate": 1.2697274513664476e-05, + "loss": 2.4476, + "step": 3126 + }, + { + "epoch": 2.092988251034829, + "grad_norm": 9.055147171020508, + "learning_rate": 1.268032522359543e-05, + "loss": 2.5538, + "step": 3127 + }, + { + "epoch": 2.093657231258101, + "grad_norm": 6.226515769958496, + "learning_rate": 1.2663383409065576e-05, + "loss": 2.513, + "step": 3128 + }, + { + "epoch": 2.094326211481373, + "grad_norm": 5.018380641937256, + "learning_rate": 1.2646449080355143e-05, + "loss": 2.3689, + "step": 3129 + }, + { + "epoch": 2.0949951917046454, + "grad_norm": 5.866727352142334, + "learning_rate": 1.2629522247739842e-05, + "loss": 2.6151, + "step": 3130 + }, + { + "epoch": 2.0956641719279174, + "grad_norm": 7.239480972290039, + "learning_rate": 1.2612602921490805e-05, + "loss": 2.4222, + "step": 3131 + }, + { + "epoch": 2.0963331521511894, + "grad_norm": 6.264415264129639, + "learning_rate": 1.2595691111874642e-05, + "loss": 2.496, + "step": 3132 + }, + { + "epoch": 2.097002132374462, + "grad_norm": 6.290718078613281, + "learning_rate": 1.2578786829153368e-05, + "loss": 2.4581, + "step": 3133 + }, + { + "epoch": 2.097671112597734, + "grad_norm": 5.889428615570068, + "learning_rate": 1.2561890083584466e-05, + "loss": 2.4592, + "step": 3134 + }, + { + "epoch": 2.098340092821006, + "grad_norm": 4.982312202453613, + "learning_rate": 1.2545000885420819e-05, + "loss": 2.3416, + "step": 3135 + }, + { + "epoch": 2.0990090730442783, + "grad_norm": 5.230997085571289, + "learning_rate": 1.2528119244910735e-05, + "loss": 2.5417, + "step": 3136 + }, + { + "epoch": 2.0996780532675503, + "grad_norm": 5.300906658172607, + "learning_rate": 1.251124517229793e-05, + "loss": 2.4974, + "step": 3137 + }, + { + "epoch": 2.1003470334908223, + "grad_norm": 4.464624404907227, + "learning_rate": 1.2494378677821562e-05, + "loss": 2.1956, + "step": 3138 + }, + { + "epoch": 2.1010160137140947, + "grad_norm": 7.037275314331055, + "learning_rate": 1.2477519771716137e-05, + "loss": 2.6677, + "step": 3139 + }, + { + "epoch": 2.1016849939373667, + "grad_norm": 5.845183849334717, + "learning_rate": 1.2460668464211614e-05, + "loss": 2.2375, + "step": 3140 + }, + { + "epoch": 2.1023539741606387, + "grad_norm": 5.264319896697998, + "learning_rate": 1.2443824765533295e-05, + "loss": 2.5408, + "step": 3141 + }, + { + "epoch": 2.103022954383911, + "grad_norm": 4.8145036697387695, + "learning_rate": 1.2426988685901892e-05, + "loss": 2.4151, + "step": 3142 + }, + { + "epoch": 2.103691934607183, + "grad_norm": 5.5874433517456055, + "learning_rate": 1.2410160235533463e-05, + "loss": 2.4036, + "step": 3143 + }, + { + "epoch": 2.104360914830455, + "grad_norm": 5.06381893157959, + "learning_rate": 1.2393339424639486e-05, + "loss": 2.253, + "step": 3144 + }, + { + "epoch": 2.1050298950537276, + "grad_norm": 6.6179070472717285, + "learning_rate": 1.2376526263426752e-05, + "loss": 2.3501, + "step": 3145 + }, + { + "epoch": 2.1056988752769996, + "grad_norm": 5.264029026031494, + "learning_rate": 1.2359720762097455e-05, + "loss": 2.5692, + "step": 3146 + }, + { + "epoch": 2.1063678555002716, + "grad_norm": 6.184389114379883, + "learning_rate": 1.2342922930849094e-05, + "loss": 2.6051, + "step": 3147 + }, + { + "epoch": 2.107036835723544, + "grad_norm": 5.724100112915039, + "learning_rate": 1.2326132779874566e-05, + "loss": 2.2546, + "step": 3148 + }, + { + "epoch": 2.107705815946816, + "grad_norm": 6.0407490730285645, + "learning_rate": 1.2309350319362068e-05, + "loss": 2.513, + "step": 3149 + }, + { + "epoch": 2.108374796170088, + "grad_norm": 4.576620578765869, + "learning_rate": 1.2292575559495143e-05, + "loss": 2.103, + "step": 3150 + }, + { + "epoch": 2.1090437763933605, + "grad_norm": 5.4054741859436035, + "learning_rate": 1.227580851045265e-05, + "loss": 2.426, + "step": 3151 + }, + { + "epoch": 2.1097127566166325, + "grad_norm": 5.270204067230225, + "learning_rate": 1.2259049182408804e-05, + "loss": 2.363, + "step": 3152 + }, + { + "epoch": 2.1103817368399045, + "grad_norm": 5.856900691986084, + "learning_rate": 1.2242297585533092e-05, + "loss": 2.5332, + "step": 3153 + }, + { + "epoch": 2.111050717063177, + "grad_norm": 5.15697717666626, + "learning_rate": 1.2225553729990345e-05, + "loss": 2.5178, + "step": 3154 + }, + { + "epoch": 2.111719697286449, + "grad_norm": 6.116770267486572, + "learning_rate": 1.2208817625940664e-05, + "loss": 2.2877, + "step": 3155 + }, + { + "epoch": 2.112388677509721, + "grad_norm": 5.580972671508789, + "learning_rate": 1.2192089283539481e-05, + "loss": 2.4115, + "step": 3156 + }, + { + "epoch": 2.1130576577329934, + "grad_norm": 6.879759788513184, + "learning_rate": 1.2175368712937494e-05, + "loss": 2.4383, + "step": 3157 + }, + { + "epoch": 2.1137266379562654, + "grad_norm": 4.9797749519348145, + "learning_rate": 1.2158655924280688e-05, + "loss": 2.3378, + "step": 3158 + }, + { + "epoch": 2.1143956181795374, + "grad_norm": 5.423282146453857, + "learning_rate": 1.2141950927710325e-05, + "loss": 2.3167, + "step": 3159 + }, + { + "epoch": 2.11506459840281, + "grad_norm": 3.600644826889038, + "learning_rate": 1.2125253733362957e-05, + "loss": 2.0791, + "step": 3160 + }, + { + "epoch": 2.115733578626082, + "grad_norm": 5.923058032989502, + "learning_rate": 1.2108564351370372e-05, + "loss": 2.4907, + "step": 3161 + }, + { + "epoch": 2.1164025588493542, + "grad_norm": 6.911498069763184, + "learning_rate": 1.2091882791859651e-05, + "loss": 2.3344, + "step": 3162 + }, + { + "epoch": 2.1170715390726262, + "grad_norm": 6.525911331176758, + "learning_rate": 1.2075209064953106e-05, + "loss": 2.6822, + "step": 3163 + }, + { + "epoch": 2.1177405192958982, + "grad_norm": 7.9432244300842285, + "learning_rate": 1.2058543180768295e-05, + "loss": 2.2904, + "step": 3164 + }, + { + "epoch": 2.1184094995191707, + "grad_norm": 5.426655292510986, + "learning_rate": 1.2041885149418014e-05, + "loss": 2.4089, + "step": 3165 + }, + { + "epoch": 2.1190784797424427, + "grad_norm": 4.516605854034424, + "learning_rate": 1.2025234981010328e-05, + "loss": 2.2793, + "step": 3166 + }, + { + "epoch": 2.1197474599657147, + "grad_norm": 5.841516494750977, + "learning_rate": 1.2008592685648478e-05, + "loss": 2.6497, + "step": 3167 + }, + { + "epoch": 2.120416440188987, + "grad_norm": 3.303682804107666, + "learning_rate": 1.1991958273430983e-05, + "loss": 2.2554, + "step": 3168 + }, + { + "epoch": 2.121085420412259, + "grad_norm": 5.285421848297119, + "learning_rate": 1.197533175445153e-05, + "loss": 2.3379, + "step": 3169 + }, + { + "epoch": 2.121754400635531, + "grad_norm": 4.775778770446777, + "learning_rate": 1.1958713138799052e-05, + "loss": 2.3428, + "step": 3170 + }, + { + "epoch": 2.1224233808588036, + "grad_norm": 7.574385643005371, + "learning_rate": 1.1942102436557667e-05, + "loss": 2.4585, + "step": 3171 + }, + { + "epoch": 2.1230923610820756, + "grad_norm": 5.816531658172607, + "learning_rate": 1.1925499657806693e-05, + "loss": 2.3998, + "step": 3172 + }, + { + "epoch": 2.1237613413053475, + "grad_norm": 6.713841438293457, + "learning_rate": 1.1908904812620636e-05, + "loss": 2.6598, + "step": 3173 + }, + { + "epoch": 2.12443032152862, + "grad_norm": 5.4964680671691895, + "learning_rate": 1.1892317911069212e-05, + "loss": 2.3309, + "step": 3174 + }, + { + "epoch": 2.125099301751892, + "grad_norm": 6.4591827392578125, + "learning_rate": 1.1875738963217281e-05, + "loss": 2.6421, + "step": 3175 + }, + { + "epoch": 2.125768281975164, + "grad_norm": 6.990157127380371, + "learning_rate": 1.1859167979124913e-05, + "loss": 2.2652, + "step": 3176 + }, + { + "epoch": 2.1264372621984364, + "grad_norm": 6.608819007873535, + "learning_rate": 1.1842604968847315e-05, + "loss": 2.4845, + "step": 3177 + }, + { + "epoch": 2.1271062424217084, + "grad_norm": 7.371438026428223, + "learning_rate": 1.182604994243488e-05, + "loss": 2.4403, + "step": 3178 + }, + { + "epoch": 2.1277752226449804, + "grad_norm": 3.5651650428771973, + "learning_rate": 1.180950290993314e-05, + "loss": 2.2112, + "step": 3179 + }, + { + "epoch": 2.128444202868253, + "grad_norm": 6.502554893493652, + "learning_rate": 1.1792963881382779e-05, + "loss": 2.4163, + "step": 3180 + }, + { + "epoch": 2.129113183091525, + "grad_norm": 4.840087890625, + "learning_rate": 1.177643286681962e-05, + "loss": 2.4764, + "step": 3181 + }, + { + "epoch": 2.129782163314797, + "grad_norm": 5.538188457489014, + "learning_rate": 1.1759909876274644e-05, + "loss": 2.5455, + "step": 3182 + }, + { + "epoch": 2.1304511435380693, + "grad_norm": 7.337990760803223, + "learning_rate": 1.1743394919773935e-05, + "loss": 2.2932, + "step": 3183 + }, + { + "epoch": 2.1311201237613413, + "grad_norm": 5.970000743865967, + "learning_rate": 1.1726888007338732e-05, + "loss": 2.2465, + "step": 3184 + }, + { + "epoch": 2.1317891039846133, + "grad_norm": 4.6120381355285645, + "learning_rate": 1.1710389148985368e-05, + "loss": 2.2366, + "step": 3185 + }, + { + "epoch": 2.1324580842078857, + "grad_norm": 6.094398021697998, + "learning_rate": 1.1693898354725294e-05, + "loss": 2.4836, + "step": 3186 + }, + { + "epoch": 2.1331270644311577, + "grad_norm": 5.002927303314209, + "learning_rate": 1.1677415634565067e-05, + "loss": 2.3964, + "step": 3187 + }, + { + "epoch": 2.1337960446544297, + "grad_norm": 5.617417812347412, + "learning_rate": 1.1660940998506365e-05, + "loss": 2.1987, + "step": 3188 + }, + { + "epoch": 2.134465024877702, + "grad_norm": 5.369129180908203, + "learning_rate": 1.1644474456545923e-05, + "loss": 2.3323, + "step": 3189 + }, + { + "epoch": 2.135134005100974, + "grad_norm": 6.841965675354004, + "learning_rate": 1.1628016018675611e-05, + "loss": 2.4832, + "step": 3190 + }, + { + "epoch": 2.135802985324246, + "grad_norm": 7.286813259124756, + "learning_rate": 1.1611565694882332e-05, + "loss": 2.3715, + "step": 3191 + }, + { + "epoch": 2.1364719655475186, + "grad_norm": 5.17401123046875, + "learning_rate": 1.159512349514811e-05, + "loss": 2.2877, + "step": 3192 + }, + { + "epoch": 2.1371409457707906, + "grad_norm": 4.89274787902832, + "learning_rate": 1.1578689429450012e-05, + "loss": 2.3143, + "step": 3193 + }, + { + "epoch": 2.1378099259940626, + "grad_norm": 6.023360729217529, + "learning_rate": 1.1562263507760173e-05, + "loss": 2.3968, + "step": 3194 + }, + { + "epoch": 2.138478906217335, + "grad_norm": 6.579507827758789, + "learning_rate": 1.1545845740045785e-05, + "loss": 2.6162, + "step": 3195 + }, + { + "epoch": 2.139147886440607, + "grad_norm": 7.763417720794678, + "learning_rate": 1.1529436136269112e-05, + "loss": 2.5656, + "step": 3196 + }, + { + "epoch": 2.139816866663879, + "grad_norm": 4.605822563171387, + "learning_rate": 1.1513034706387429e-05, + "loss": 2.5438, + "step": 3197 + }, + { + "epoch": 2.1404858468871515, + "grad_norm": 6.099738121032715, + "learning_rate": 1.1496641460353091e-05, + "loss": 2.5324, + "step": 3198 + }, + { + "epoch": 2.1411548271104235, + "grad_norm": 5.454736709594727, + "learning_rate": 1.1480256408113451e-05, + "loss": 2.5969, + "step": 3199 + }, + { + "epoch": 2.1418238073336955, + "grad_norm": 4.486236095428467, + "learning_rate": 1.1463879559610924e-05, + "loss": 2.2563, + "step": 3200 + }, + { + "epoch": 2.142492787556968, + "grad_norm": 7.137877464294434, + "learning_rate": 1.1447510924782917e-05, + "loss": 2.8421, + "step": 3201 + }, + { + "epoch": 2.14316176778024, + "grad_norm": 5.9389777183532715, + "learning_rate": 1.1431150513561866e-05, + "loss": 2.3989, + "step": 3202 + }, + { + "epoch": 2.143830748003512, + "grad_norm": 5.879643440246582, + "learning_rate": 1.141479833587521e-05, + "loss": 2.3041, + "step": 3203 + }, + { + "epoch": 2.1444997282267844, + "grad_norm": 5.889922142028809, + "learning_rate": 1.1398454401645414e-05, + "loss": 2.2307, + "step": 3204 + }, + { + "epoch": 2.1451687084500564, + "grad_norm": 6.074721336364746, + "learning_rate": 1.1382118720789908e-05, + "loss": 2.5629, + "step": 3205 + }, + { + "epoch": 2.145837688673329, + "grad_norm": 7.225193500518799, + "learning_rate": 1.1365791303221147e-05, + "loss": 2.6484, + "step": 3206 + }, + { + "epoch": 2.146506668896601, + "grad_norm": 7.407486438751221, + "learning_rate": 1.134947215884655e-05, + "loss": 2.4902, + "step": 3207 + }, + { + "epoch": 2.147175649119873, + "grad_norm": 5.985146999359131, + "learning_rate": 1.1333161297568514e-05, + "loss": 2.3124, + "step": 3208 + }, + { + "epoch": 2.1478446293431452, + "grad_norm": 5.915247440338135, + "learning_rate": 1.1316858729284419e-05, + "loss": 2.5533, + "step": 3209 + }, + { + "epoch": 2.1485136095664172, + "grad_norm": 6.133087635040283, + "learning_rate": 1.1300564463886621e-05, + "loss": 2.4636, + "step": 3210 + }, + { + "epoch": 2.1491825897896892, + "grad_norm": 6.220125675201416, + "learning_rate": 1.1284278511262414e-05, + "loss": 2.3713, + "step": 3211 + }, + { + "epoch": 2.1498515700129617, + "grad_norm": 6.014132499694824, + "learning_rate": 1.1268000881294078e-05, + "loss": 2.6069, + "step": 3212 + }, + { + "epoch": 2.1505205502362337, + "grad_norm": 10.985984802246094, + "learning_rate": 1.125173158385881e-05, + "loss": 2.5186, + "step": 3213 + }, + { + "epoch": 2.1511895304595057, + "grad_norm": 6.082069396972656, + "learning_rate": 1.1235470628828784e-05, + "loss": 2.5479, + "step": 3214 + }, + { + "epoch": 2.151858510682778, + "grad_norm": 7.279585838317871, + "learning_rate": 1.121921802607109e-05, + "loss": 2.6448, + "step": 3215 + }, + { + "epoch": 2.15252749090605, + "grad_norm": 7.176996231079102, + "learning_rate": 1.1202973785447752e-05, + "loss": 2.371, + "step": 3216 + }, + { + "epoch": 2.153196471129322, + "grad_norm": 5.35184383392334, + "learning_rate": 1.1186737916815713e-05, + "loss": 2.3467, + "step": 3217 + }, + { + "epoch": 2.1538654513525946, + "grad_norm": 6.730971813201904, + "learning_rate": 1.1170510430026867e-05, + "loss": 2.3922, + "step": 3218 + }, + { + "epoch": 2.1545344315758665, + "grad_norm": 5.809053897857666, + "learning_rate": 1.1154291334927982e-05, + "loss": 2.5138, + "step": 3219 + }, + { + "epoch": 2.1552034117991385, + "grad_norm": 7.718690395355225, + "learning_rate": 1.113808064136077e-05, + "loss": 2.4376, + "step": 3220 + }, + { + "epoch": 2.155872392022411, + "grad_norm": 7.815573692321777, + "learning_rate": 1.1121878359161813e-05, + "loss": 2.5477, + "step": 3221 + }, + { + "epoch": 2.156541372245683, + "grad_norm": 6.004550933837891, + "learning_rate": 1.110568449816263e-05, + "loss": 2.3911, + "step": 3222 + }, + { + "epoch": 2.157210352468955, + "grad_norm": 5.915016174316406, + "learning_rate": 1.1089499068189562e-05, + "loss": 2.3959, + "step": 3223 + }, + { + "epoch": 2.1578793326922274, + "grad_norm": 5.888389587402344, + "learning_rate": 1.1073322079063913e-05, + "loss": 2.5889, + "step": 3224 + }, + { + "epoch": 2.1585483129154994, + "grad_norm": 8.737438201904297, + "learning_rate": 1.1057153540601804e-05, + "loss": 2.5892, + "step": 3225 + }, + { + "epoch": 2.1592172931387714, + "grad_norm": 6.077867031097412, + "learning_rate": 1.1040993462614268e-05, + "loss": 2.4067, + "step": 3226 + }, + { + "epoch": 2.159886273362044, + "grad_norm": 4.763731002807617, + "learning_rate": 1.1024841854907176e-05, + "loss": 2.2011, + "step": 3227 + }, + { + "epoch": 2.160555253585316, + "grad_norm": 4.016061305999756, + "learning_rate": 1.1008698727281289e-05, + "loss": 2.3669, + "step": 3228 + }, + { + "epoch": 2.161224233808588, + "grad_norm": 6.888826847076416, + "learning_rate": 1.0992564089532193e-05, + "loss": 2.4226, + "step": 3229 + }, + { + "epoch": 2.1618932140318603, + "grad_norm": 5.749783515930176, + "learning_rate": 1.0976437951450336e-05, + "loss": 2.4373, + "step": 3230 + }, + { + "epoch": 2.1625621942551323, + "grad_norm": 6.960591793060303, + "learning_rate": 1.0960320322820999e-05, + "loss": 2.3529, + "step": 3231 + }, + { + "epoch": 2.1632311744784043, + "grad_norm": 6.294894218444824, + "learning_rate": 1.0944211213424326e-05, + "loss": 2.622, + "step": 3232 + }, + { + "epoch": 2.1639001547016767, + "grad_norm": 5.610545635223389, + "learning_rate": 1.0928110633035252e-05, + "loss": 2.4594, + "step": 3233 + }, + { + "epoch": 2.1645691349249487, + "grad_norm": 5.755736351013184, + "learning_rate": 1.0912018591423578e-05, + "loss": 2.5355, + "step": 3234 + }, + { + "epoch": 2.1652381151482207, + "grad_norm": 6.174062252044678, + "learning_rate": 1.0895935098353885e-05, + "loss": 2.5725, + "step": 3235 + }, + { + "epoch": 2.165907095371493, + "grad_norm": 4.383173942565918, + "learning_rate": 1.0879860163585604e-05, + "loss": 2.3036, + "step": 3236 + }, + { + "epoch": 2.166576075594765, + "grad_norm": 5.8941779136657715, + "learning_rate": 1.0863793796872942e-05, + "loss": 2.068, + "step": 3237 + }, + { + "epoch": 2.1672450558180376, + "grad_norm": 6.619831562042236, + "learning_rate": 1.084773600796492e-05, + "loss": 2.5603, + "step": 3238 + }, + { + "epoch": 2.1679140360413096, + "grad_norm": 4.755913734436035, + "learning_rate": 1.0831686806605345e-05, + "loss": 2.4281, + "step": 3239 + }, + { + "epoch": 2.1685830162645816, + "grad_norm": 7.150200843811035, + "learning_rate": 1.081564620253284e-05, + "loss": 2.5785, + "step": 3240 + }, + { + "epoch": 2.169251996487854, + "grad_norm": 7.264523506164551, + "learning_rate": 1.0799614205480768e-05, + "loss": 2.3924, + "step": 3241 + }, + { + "epoch": 2.169920976711126, + "grad_norm": 7.59616231918335, + "learning_rate": 1.078359082517732e-05, + "loss": 2.5872, + "step": 3242 + }, + { + "epoch": 2.170589956934398, + "grad_norm": 5.729783058166504, + "learning_rate": 1.0767576071345408e-05, + "loss": 2.4241, + "step": 3243 + }, + { + "epoch": 2.1712589371576705, + "grad_norm": 5.2718424797058105, + "learning_rate": 1.0751569953702765e-05, + "loss": 2.4088, + "step": 3244 + }, + { + "epoch": 2.1719279173809425, + "grad_norm": 4.924938201904297, + "learning_rate": 1.0735572481961809e-05, + "loss": 2.2696, + "step": 3245 + }, + { + "epoch": 2.1725968976042145, + "grad_norm": 4.805285930633545, + "learning_rate": 1.0719583665829788e-05, + "loss": 2.3245, + "step": 3246 + }, + { + "epoch": 2.173265877827487, + "grad_norm": 4.2103753089904785, + "learning_rate": 1.0703603515008643e-05, + "loss": 2.1653, + "step": 3247 + }, + { + "epoch": 2.173934858050759, + "grad_norm": 5.636720180511475, + "learning_rate": 1.0687632039195098e-05, + "loss": 2.6404, + "step": 3248 + }, + { + "epoch": 2.174603838274031, + "grad_norm": 6.387702465057373, + "learning_rate": 1.0671669248080573e-05, + "loss": 2.5182, + "step": 3249 + }, + { + "epoch": 2.1752728184973034, + "grad_norm": 6.871338367462158, + "learning_rate": 1.0655715151351262e-05, + "loss": 2.4908, + "step": 3250 + }, + { + "epoch": 2.1759417987205754, + "grad_norm": 5.293996810913086, + "learning_rate": 1.0639769758688046e-05, + "loss": 2.4777, + "step": 3251 + }, + { + "epoch": 2.1766107789438474, + "grad_norm": 7.730209827423096, + "learning_rate": 1.0623833079766534e-05, + "loss": 2.4757, + "step": 3252 + }, + { + "epoch": 2.17727975916712, + "grad_norm": 6.068915843963623, + "learning_rate": 1.0607905124257053e-05, + "loss": 2.5664, + "step": 3253 + }, + { + "epoch": 2.177948739390392, + "grad_norm": 5.767038822174072, + "learning_rate": 1.0591985901824648e-05, + "loss": 2.4516, + "step": 3254 + }, + { + "epoch": 2.178617719613664, + "grad_norm": 6.213324546813965, + "learning_rate": 1.0576075422129034e-05, + "loss": 2.5957, + "step": 3255 + }, + { + "epoch": 2.1792866998369362, + "grad_norm": 5.1829142570495605, + "learning_rate": 1.0560173694824658e-05, + "loss": 2.5273, + "step": 3256 + }, + { + "epoch": 2.1799556800602082, + "grad_norm": 6.2277069091796875, + "learning_rate": 1.0544280729560618e-05, + "loss": 2.5238, + "step": 3257 + }, + { + "epoch": 2.1806246602834802, + "grad_norm": 6.939554691314697, + "learning_rate": 1.0528396535980734e-05, + "loss": 2.4433, + "step": 3258 + }, + { + "epoch": 2.1812936405067527, + "grad_norm": 6.321778297424316, + "learning_rate": 1.0512521123723474e-05, + "loss": 2.3881, + "step": 3259 + }, + { + "epoch": 2.1819626207300247, + "grad_norm": 5.648446083068848, + "learning_rate": 1.0496654502421985e-05, + "loss": 2.4009, + "step": 3260 + }, + { + "epoch": 2.1826316009532967, + "grad_norm": 6.702240467071533, + "learning_rate": 1.0480796681704078e-05, + "loss": 2.3272, + "step": 3261 + }, + { + "epoch": 2.183300581176569, + "grad_norm": 6.676383018493652, + "learning_rate": 1.046494767119224e-05, + "loss": 2.5501, + "step": 3262 + }, + { + "epoch": 2.183969561399841, + "grad_norm": 5.882723808288574, + "learning_rate": 1.0449107480503589e-05, + "loss": 2.3693, + "step": 3263 + }, + { + "epoch": 2.184638541623113, + "grad_norm": 6.2965521812438965, + "learning_rate": 1.0433276119249913e-05, + "loss": 2.5059, + "step": 3264 + }, + { + "epoch": 2.1853075218463855, + "grad_norm": 5.6915059089660645, + "learning_rate": 1.0417453597037627e-05, + "loss": 2.6741, + "step": 3265 + }, + { + "epoch": 2.1859765020696575, + "grad_norm": 5.622255325317383, + "learning_rate": 1.0401639923467781e-05, + "loss": 2.3963, + "step": 3266 + }, + { + "epoch": 2.1866454822929295, + "grad_norm": 6.154139041900635, + "learning_rate": 1.038583510813606e-05, + "loss": 2.4418, + "step": 3267 + }, + { + "epoch": 2.187314462516202, + "grad_norm": 8.000636100769043, + "learning_rate": 1.037003916063279e-05, + "loss": 2.7, + "step": 3268 + }, + { + "epoch": 2.187983442739474, + "grad_norm": 5.7405924797058105, + "learning_rate": 1.0354252090542883e-05, + "loss": 2.4079, + "step": 3269 + }, + { + "epoch": 2.188652422962746, + "grad_norm": 4.20126485824585, + "learning_rate": 1.03384739074459e-05, + "loss": 2.2794, + "step": 3270 + }, + { + "epoch": 2.1893214031860184, + "grad_norm": 6.437028408050537, + "learning_rate": 1.0322704620915973e-05, + "loss": 2.641, + "step": 3271 + }, + { + "epoch": 2.1899903834092904, + "grad_norm": 6.125021934509277, + "learning_rate": 1.0306944240521876e-05, + "loss": 2.3918, + "step": 3272 + }, + { + "epoch": 2.1906593636325624, + "grad_norm": 5.8502678871154785, + "learning_rate": 1.0291192775826947e-05, + "loss": 2.3711, + "step": 3273 + }, + { + "epoch": 2.191328343855835, + "grad_norm": 7.589147090911865, + "learning_rate": 1.0275450236389123e-05, + "loss": 2.3566, + "step": 3274 + }, + { + "epoch": 2.191997324079107, + "grad_norm": 6.148003101348877, + "learning_rate": 1.0259716631760918e-05, + "loss": 2.3216, + "step": 3275 + }, + { + "epoch": 2.192666304302379, + "grad_norm": 5.355863094329834, + "learning_rate": 1.024399197148945e-05, + "loss": 2.4026, + "step": 3276 + }, + { + "epoch": 2.1933352845256513, + "grad_norm": 3.794316530227661, + "learning_rate": 1.0228276265116377e-05, + "loss": 2.2804, + "step": 3277 + }, + { + "epoch": 2.1940042647489233, + "grad_norm": 5.927673816680908, + "learning_rate": 1.0212569522177956e-05, + "loss": 2.7783, + "step": 3278 + }, + { + "epoch": 2.1946732449721953, + "grad_norm": 4.8662309646606445, + "learning_rate": 1.019687175220497e-05, + "loss": 2.3687, + "step": 3279 + }, + { + "epoch": 2.1953422251954677, + "grad_norm": 8.175592422485352, + "learning_rate": 1.0181182964722794e-05, + "loss": 2.4035, + "step": 3280 + }, + { + "epoch": 2.1960112054187397, + "grad_norm": 5.3390793800354, + "learning_rate": 1.0165503169251326e-05, + "loss": 2.3852, + "step": 3281 + }, + { + "epoch": 2.1966801856420117, + "grad_norm": 4.161181926727295, + "learning_rate": 1.0149832375305013e-05, + "loss": 2.4324, + "step": 3282 + }, + { + "epoch": 2.197349165865284, + "grad_norm": 6.178553104400635, + "learning_rate": 1.0134170592392836e-05, + "loss": 2.61, + "step": 3283 + }, + { + "epoch": 2.198018146088556, + "grad_norm": 6.2336530685424805, + "learning_rate": 1.0118517830018328e-05, + "loss": 2.3137, + "step": 3284 + }, + { + "epoch": 2.1986871263118286, + "grad_norm": 5.780093193054199, + "learning_rate": 1.0102874097679526e-05, + "loss": 2.4979, + "step": 3285 + }, + { + "epoch": 2.1993561065351006, + "grad_norm": 5.162668704986572, + "learning_rate": 1.0087239404869007e-05, + "loss": 2.561, + "step": 3286 + }, + { + "epoch": 2.2000250867583726, + "grad_norm": 5.267562389373779, + "learning_rate": 1.0071613761073844e-05, + "loss": 2.4652, + "step": 3287 + }, + { + "epoch": 2.200694066981645, + "grad_norm": 7.34504508972168, + "learning_rate": 1.0055997175775634e-05, + "loss": 2.959, + "step": 3288 + }, + { + "epoch": 2.201363047204917, + "grad_norm": 5.962170124053955, + "learning_rate": 1.0040389658450455e-05, + "loss": 2.3388, + "step": 3289 + }, + { + "epoch": 2.202032027428189, + "grad_norm": 5.704697132110596, + "learning_rate": 1.0024791218568918e-05, + "loss": 2.2603, + "step": 3290 + }, + { + "epoch": 2.2027010076514615, + "grad_norm": 8.443648338317871, + "learning_rate": 1.0009201865596091e-05, + "loss": 2.3282, + "step": 3291 + }, + { + "epoch": 2.2033699878747335, + "grad_norm": 6.203602313995361, + "learning_rate": 9.99362160899156e-06, + "loss": 2.7166, + "step": 3292 + }, + { + "epoch": 2.2040389680980055, + "grad_norm": 5.6963887214660645, + "learning_rate": 9.978050458209356e-06, + "loss": 2.2055, + "step": 3293 + }, + { + "epoch": 2.204707948321278, + "grad_norm": 6.479327201843262, + "learning_rate": 9.962488422698022e-06, + "loss": 2.4797, + "step": 3294 + }, + { + "epoch": 2.20537692854455, + "grad_norm": 6.6697845458984375, + "learning_rate": 9.94693551190054e-06, + "loss": 2.317, + "step": 3295 + }, + { + "epoch": 2.206045908767822, + "grad_norm": 5.989675521850586, + "learning_rate": 9.931391735254373e-06, + "loss": 2.3203, + "step": 3296 + }, + { + "epoch": 2.2067148889910944, + "grad_norm": 7.710916996002197, + "learning_rate": 9.91585710219142e-06, + "loss": 2.4679, + "step": 3297 + }, + { + "epoch": 2.2073838692143664, + "grad_norm": 4.716619491577148, + "learning_rate": 9.900331622138065e-06, + "loss": 2.124, + "step": 3298 + }, + { + "epoch": 2.2080528494376384, + "grad_norm": 8.539642333984375, + "learning_rate": 9.884815304515106e-06, + "loss": 2.3051, + "step": 3299 + }, + { + "epoch": 2.208721829660911, + "grad_norm": 23.150936126708984, + "learning_rate": 9.869308158737805e-06, + "loss": 2.2734, + "step": 3300 + }, + { + "epoch": 2.209390809884183, + "grad_norm": 7.312356948852539, + "learning_rate": 9.853810194215837e-06, + "loss": 2.4778, + "step": 3301 + }, + { + "epoch": 2.210059790107455, + "grad_norm": 6.028582572937012, + "learning_rate": 9.838321420353334e-06, + "loss": 2.2238, + "step": 3302 + }, + { + "epoch": 2.2107287703307272, + "grad_norm": 5.365725517272949, + "learning_rate": 9.822841846548825e-06, + "loss": 2.3366, + "step": 3303 + }, + { + "epoch": 2.2113977505539992, + "grad_norm": 5.354928970336914, + "learning_rate": 9.807371482195263e-06, + "loss": 2.2311, + "step": 3304 + }, + { + "epoch": 2.2120667307772712, + "grad_norm": 6.880374908447266, + "learning_rate": 9.79191033668001e-06, + "loss": 2.2402, + "step": 3305 + }, + { + "epoch": 2.2127357110005437, + "grad_norm": 4.652713298797607, + "learning_rate": 9.776458419384857e-06, + "loss": 2.2769, + "step": 3306 + }, + { + "epoch": 2.2134046912238157, + "grad_norm": 5.592299938201904, + "learning_rate": 9.761015739685961e-06, + "loss": 2.2559, + "step": 3307 + }, + { + "epoch": 2.2140736714470877, + "grad_norm": 9.013760566711426, + "learning_rate": 9.745582306953904e-06, + "loss": 2.6562, + "step": 3308 + }, + { + "epoch": 2.21474265167036, + "grad_norm": 6.487226486206055, + "learning_rate": 9.730158130553638e-06, + "loss": 2.3575, + "step": 3309 + }, + { + "epoch": 2.215411631893632, + "grad_norm": 6.366955757141113, + "learning_rate": 9.714743219844504e-06, + "loss": 2.4937, + "step": 3310 + }, + { + "epoch": 2.216080612116904, + "grad_norm": 6.285698413848877, + "learning_rate": 9.699337584180213e-06, + "loss": 2.4974, + "step": 3311 + }, + { + "epoch": 2.2167495923401765, + "grad_norm": 5.436456203460693, + "learning_rate": 9.683941232908869e-06, + "loss": 2.2167, + "step": 3312 + }, + { + "epoch": 2.2174185725634485, + "grad_norm": 8.226724624633789, + "learning_rate": 9.668554175372912e-06, + "loss": 2.5655, + "step": 3313 + }, + { + "epoch": 2.2180875527867205, + "grad_norm": 7.0515336990356445, + "learning_rate": 9.65317642090918e-06, + "loss": 2.6929, + "step": 3314 + }, + { + "epoch": 2.218756533009993, + "grad_norm": 4.807023525238037, + "learning_rate": 9.637807978848823e-06, + "loss": 2.5244, + "step": 3315 + }, + { + "epoch": 2.219425513233265, + "grad_norm": 7.209635257720947, + "learning_rate": 9.622448858517383e-06, + "loss": 2.4508, + "step": 3316 + }, + { + "epoch": 2.2200944934565374, + "grad_norm": 5.014361381530762, + "learning_rate": 9.607099069234719e-06, + "loss": 2.4491, + "step": 3317 + }, + { + "epoch": 2.2207634736798094, + "grad_norm": 4.0903801918029785, + "learning_rate": 9.591758620315025e-06, + "loss": 2.3502, + "step": 3318 + }, + { + "epoch": 2.2214324539030814, + "grad_norm": 6.134200572967529, + "learning_rate": 9.576427521066837e-06, + "loss": 2.4158, + "step": 3319 + }, + { + "epoch": 2.222101434126354, + "grad_norm": 4.862534523010254, + "learning_rate": 9.561105780793029e-06, + "loss": 2.3157, + "step": 3320 + }, + { + "epoch": 2.222770414349626, + "grad_norm": 6.2961907386779785, + "learning_rate": 9.545793408790769e-06, + "loss": 2.305, + "step": 3321 + }, + { + "epoch": 2.223439394572898, + "grad_norm": 8.750323295593262, + "learning_rate": 9.53049041435157e-06, + "loss": 2.7768, + "step": 3322 + }, + { + "epoch": 2.2241083747961703, + "grad_norm": 4.3722357749938965, + "learning_rate": 9.515196806761222e-06, + "loss": 2.3192, + "step": 3323 + }, + { + "epoch": 2.2247773550194423, + "grad_norm": 4.427755355834961, + "learning_rate": 9.499912595299863e-06, + "loss": 2.1664, + "step": 3324 + }, + { + "epoch": 2.2254463352427143, + "grad_norm": 6.45989990234375, + "learning_rate": 9.484637789241885e-06, + "loss": 2.4249, + "step": 3325 + }, + { + "epoch": 2.2261153154659867, + "grad_norm": 5.717044830322266, + "learning_rate": 9.469372397855995e-06, + "loss": 2.4274, + "step": 3326 + }, + { + "epoch": 2.2267842956892587, + "grad_norm": 7.242005348205566, + "learning_rate": 9.454116430405174e-06, + "loss": 2.753, + "step": 3327 + }, + { + "epoch": 2.2274532759125307, + "grad_norm": 7.040726184844971, + "learning_rate": 9.438869896146715e-06, + "loss": 2.479, + "step": 3328 + }, + { + "epoch": 2.228122256135803, + "grad_norm": 5.693268775939941, + "learning_rate": 9.423632804332144e-06, + "loss": 2.2508, + "step": 3329 + }, + { + "epoch": 2.228791236359075, + "grad_norm": 8.740731239318848, + "learning_rate": 9.408405164207298e-06, + "loss": 2.5289, + "step": 3330 + }, + { + "epoch": 2.229460216582347, + "grad_norm": 5.486669063568115, + "learning_rate": 9.393186985012256e-06, + "loss": 2.2686, + "step": 3331 + }, + { + "epoch": 2.2301291968056196, + "grad_norm": 6.015158176422119, + "learning_rate": 9.377978275981356e-06, + "loss": 2.3974, + "step": 3332 + }, + { + "epoch": 2.2307981770288916, + "grad_norm": 5.799219131469727, + "learning_rate": 9.362779046343184e-06, + "loss": 2.432, + "step": 3333 + }, + { + "epoch": 2.2314671572521636, + "grad_norm": 5.339161396026611, + "learning_rate": 9.347589305320607e-06, + "loss": 2.3394, + "step": 3334 + }, + { + "epoch": 2.232136137475436, + "grad_norm": 5.30642032623291, + "learning_rate": 9.332409062130687e-06, + "loss": 2.5456, + "step": 3335 + }, + { + "epoch": 2.232805117698708, + "grad_norm": 6.872066497802734, + "learning_rate": 9.31723832598477e-06, + "loss": 2.2248, + "step": 3336 + }, + { + "epoch": 2.23347409792198, + "grad_norm": 5.748722553253174, + "learning_rate": 9.302077106088389e-06, + "loss": 2.4969, + "step": 3337 + }, + { + "epoch": 2.2341430781452525, + "grad_norm": 6.080403804779053, + "learning_rate": 9.286925411641342e-06, + "loss": 2.3525, + "step": 3338 + }, + { + "epoch": 2.2348120583685245, + "grad_norm": 4.169589042663574, + "learning_rate": 9.271783251837623e-06, + "loss": 2.3091, + "step": 3339 + }, + { + "epoch": 2.2354810385917965, + "grad_norm": 6.129306316375732, + "learning_rate": 9.25665063586544e-06, + "loss": 2.8266, + "step": 3340 + }, + { + "epoch": 2.236150018815069, + "grad_norm": 3.3481273651123047, + "learning_rate": 9.241527572907214e-06, + "loss": 2.6435, + "step": 3341 + }, + { + "epoch": 2.236818999038341, + "grad_norm": 6.210189342498779, + "learning_rate": 9.226414072139584e-06, + "loss": 2.2529, + "step": 3342 + }, + { + "epoch": 2.237487979261613, + "grad_norm": 6.088385581970215, + "learning_rate": 9.211310142733356e-06, + "loss": 2.4758, + "step": 3343 + }, + { + "epoch": 2.2381569594848854, + "grad_norm": 6.229015350341797, + "learning_rate": 9.196215793853565e-06, + "loss": 2.4619, + "step": 3344 + }, + { + "epoch": 2.2388259397081574, + "grad_norm": 5.055471420288086, + "learning_rate": 9.181131034659398e-06, + "loss": 2.6748, + "step": 3345 + }, + { + "epoch": 2.2394949199314294, + "grad_norm": 5.857059955596924, + "learning_rate": 9.166055874304253e-06, + "loss": 2.4911, + "step": 3346 + }, + { + "epoch": 2.240163900154702, + "grad_norm": 7.624831199645996, + "learning_rate": 9.150990321935682e-06, + "loss": 2.5507, + "step": 3347 + }, + { + "epoch": 2.240832880377974, + "grad_norm": 6.141499996185303, + "learning_rate": 9.135934386695414e-06, + "loss": 2.7019, + "step": 3348 + }, + { + "epoch": 2.241501860601246, + "grad_norm": 4.614566326141357, + "learning_rate": 9.12088807771933e-06, + "loss": 2.2603, + "step": 3349 + }, + { + "epoch": 2.2421708408245182, + "grad_norm": 4.843814373016357, + "learning_rate": 9.105851404137505e-06, + "loss": 2.3101, + "step": 3350 + }, + { + "epoch": 2.2428398210477902, + "grad_norm": 7.0168352127075195, + "learning_rate": 9.090824375074122e-06, + "loss": 2.2894, + "step": 3351 + }, + { + "epoch": 2.2435088012710622, + "grad_norm": 5.551777362823486, + "learning_rate": 9.075806999647558e-06, + "loss": 2.4892, + "step": 3352 + }, + { + "epoch": 2.2441777814943347, + "grad_norm": 6.743337154388428, + "learning_rate": 9.060799286970292e-06, + "loss": 2.6235, + "step": 3353 + }, + { + "epoch": 2.2448467617176067, + "grad_norm": 5.058289051055908, + "learning_rate": 9.04580124614896e-06, + "loss": 2.3419, + "step": 3354 + }, + { + "epoch": 2.2455157419408787, + "grad_norm": 5.71526575088501, + "learning_rate": 9.030812886284315e-06, + "loss": 2.3071, + "step": 3355 + }, + { + "epoch": 2.246184722164151, + "grad_norm": 5.9139604568481445, + "learning_rate": 9.015834216471266e-06, + "loss": 2.2466, + "step": 3356 + }, + { + "epoch": 2.246853702387423, + "grad_norm": 5.514877796173096, + "learning_rate": 9.0008652457988e-06, + "loss": 2.5223, + "step": 3357 + }, + { + "epoch": 2.247522682610695, + "grad_norm": 6.15758752822876, + "learning_rate": 8.985905983350063e-06, + "loss": 2.2858, + "step": 3358 + }, + { + "epoch": 2.2481916628339675, + "grad_norm": 6.159371376037598, + "learning_rate": 8.970956438202265e-06, + "loss": 2.307, + "step": 3359 + }, + { + "epoch": 2.2488606430572395, + "grad_norm": 6.802591800689697, + "learning_rate": 8.956016619426765e-06, + "loss": 2.6682, + "step": 3360 + }, + { + "epoch": 2.2495296232805115, + "grad_norm": 6.4636101722717285, + "learning_rate": 8.941086536088983e-06, + "loss": 2.2357, + "step": 3361 + }, + { + "epoch": 2.250198603503784, + "grad_norm": 5.349886417388916, + "learning_rate": 8.92616619724845e-06, + "loss": 2.2873, + "step": 3362 + }, + { + "epoch": 2.250867583727056, + "grad_norm": 5.073741436004639, + "learning_rate": 8.911255611958769e-06, + "loss": 2.3533, + "step": 3363 + }, + { + "epoch": 2.251536563950328, + "grad_norm": 6.157240867614746, + "learning_rate": 8.896354789267653e-06, + "loss": 2.4994, + "step": 3364 + }, + { + "epoch": 2.2522055441736004, + "grad_norm": 5.950747013092041, + "learning_rate": 8.881463738216856e-06, + "loss": 2.4047, + "step": 3365 + }, + { + "epoch": 2.2528745243968724, + "grad_norm": 6.31601095199585, + "learning_rate": 8.866582467842236e-06, + "loss": 2.4934, + "step": 3366 + }, + { + "epoch": 2.253543504620145, + "grad_norm": 4.539876937866211, + "learning_rate": 8.851710987173684e-06, + "loss": 2.4644, + "step": 3367 + }, + { + "epoch": 2.254212484843417, + "grad_norm": 5.7052788734436035, + "learning_rate": 8.836849305235187e-06, + "loss": 2.4245, + "step": 3368 + }, + { + "epoch": 2.254881465066689, + "grad_norm": 4.134397983551025, + "learning_rate": 8.821997431044755e-06, + "loss": 2.1249, + "step": 3369 + }, + { + "epoch": 2.2555504452899613, + "grad_norm": 4.353209972381592, + "learning_rate": 8.807155373614457e-06, + "loss": 2.5069, + "step": 3370 + }, + { + "epoch": 2.2562194255132333, + "grad_norm": 7.150655269622803, + "learning_rate": 8.792323141950396e-06, + "loss": 2.3589, + "step": 3371 + }, + { + "epoch": 2.2568884057365053, + "grad_norm": 6.316367149353027, + "learning_rate": 8.777500745052744e-06, + "loss": 2.282, + "step": 3372 + }, + { + "epoch": 2.2575573859597777, + "grad_norm": 4.0687994956970215, + "learning_rate": 8.762688191915664e-06, + "loss": 2.2527, + "step": 3373 + }, + { + "epoch": 2.2582263661830497, + "grad_norm": 6.3995771408081055, + "learning_rate": 8.747885491527384e-06, + "loss": 2.4396, + "step": 3374 + }, + { + "epoch": 2.2588953464063217, + "grad_norm": 4.6264472007751465, + "learning_rate": 8.733092652870125e-06, + "loss": 2.2277, + "step": 3375 + }, + { + "epoch": 2.259564326629594, + "grad_norm": 7.339973449707031, + "learning_rate": 8.718309684920137e-06, + "loss": 2.423, + "step": 3376 + }, + { + "epoch": 2.260233306852866, + "grad_norm": 6.601887226104736, + "learning_rate": 8.703536596647666e-06, + "loss": 2.5287, + "step": 3377 + }, + { + "epoch": 2.260902287076138, + "grad_norm": 4.581928253173828, + "learning_rate": 8.688773397016992e-06, + "loss": 2.4514, + "step": 3378 + }, + { + "epoch": 2.2615712672994106, + "grad_norm": 6.076478004455566, + "learning_rate": 8.674020094986363e-06, + "loss": 2.3532, + "step": 3379 + }, + { + "epoch": 2.2622402475226826, + "grad_norm": 6.251124382019043, + "learning_rate": 8.659276699508051e-06, + "loss": 2.7184, + "step": 3380 + }, + { + "epoch": 2.2629092277459546, + "grad_norm": 5.632853984832764, + "learning_rate": 8.644543219528281e-06, + "loss": 2.449, + "step": 3381 + }, + { + "epoch": 2.263578207969227, + "grad_norm": 4.580968856811523, + "learning_rate": 8.629819663987305e-06, + "loss": 2.5447, + "step": 3382 + }, + { + "epoch": 2.264247188192499, + "grad_norm": 5.7596354484558105, + "learning_rate": 8.615106041819316e-06, + "loss": 2.4769, + "step": 3383 + }, + { + "epoch": 2.264916168415771, + "grad_norm": 4.481386184692383, + "learning_rate": 8.600402361952491e-06, + "loss": 2.3351, + "step": 3384 + }, + { + "epoch": 2.2655851486390435, + "grad_norm": 5.507460594177246, + "learning_rate": 8.585708633308972e-06, + "loss": 2.4185, + "step": 3385 + }, + { + "epoch": 2.2662541288623155, + "grad_norm": 4.184190273284912, + "learning_rate": 8.571024864804883e-06, + "loss": 2.3453, + "step": 3386 + }, + { + "epoch": 2.2669231090855875, + "grad_norm": 6.234518527984619, + "learning_rate": 8.556351065350268e-06, + "loss": 2.3477, + "step": 3387 + }, + { + "epoch": 2.26759208930886, + "grad_norm": 5.380000591278076, + "learning_rate": 8.541687243849158e-06, + "loss": 2.2576, + "step": 3388 + }, + { + "epoch": 2.268261069532132, + "grad_norm": 4.824820518493652, + "learning_rate": 8.527033409199501e-06, + "loss": 2.3433, + "step": 3389 + }, + { + "epoch": 2.2689300497554044, + "grad_norm": 4.683255195617676, + "learning_rate": 8.51238957029322e-06, + "loss": 2.2502, + "step": 3390 + }, + { + "epoch": 2.2695990299786764, + "grad_norm": 4.337817668914795, + "learning_rate": 8.49775573601611e-06, + "loss": 2.299, + "step": 3391 + }, + { + "epoch": 2.2702680102019483, + "grad_norm": 8.936278343200684, + "learning_rate": 8.483131915247968e-06, + "loss": 2.4815, + "step": 3392 + }, + { + "epoch": 2.270936990425221, + "grad_norm": 7.972809791564941, + "learning_rate": 8.468518116862462e-06, + "loss": 2.7594, + "step": 3393 + }, + { + "epoch": 2.271605970648493, + "grad_norm": 7.379626274108887, + "learning_rate": 8.453914349727216e-06, + "loss": 2.5508, + "step": 3394 + }, + { + "epoch": 2.272274950871765, + "grad_norm": 5.350943088531494, + "learning_rate": 8.439320622703729e-06, + "loss": 2.4487, + "step": 3395 + }, + { + "epoch": 2.2729439310950372, + "grad_norm": 5.500737190246582, + "learning_rate": 8.424736944647443e-06, + "loss": 2.221, + "step": 3396 + }, + { + "epoch": 2.2736129113183092, + "grad_norm": 5.009112358093262, + "learning_rate": 8.410163324407688e-06, + "loss": 2.2639, + "step": 3397 + }, + { + "epoch": 2.2742818915415812, + "grad_norm": 5.645633697509766, + "learning_rate": 8.39559977082768e-06, + "loss": 2.3556, + "step": 3398 + }, + { + "epoch": 2.2749508717648537, + "grad_norm": 5.343113899230957, + "learning_rate": 8.381046292744535e-06, + "loss": 2.2589, + "step": 3399 + }, + { + "epoch": 2.2756198519881257, + "grad_norm": 5.092655181884766, + "learning_rate": 8.36650289898927e-06, + "loss": 2.19, + "step": 3400 + }, + { + "epoch": 2.2762888322113977, + "grad_norm": 4.954383850097656, + "learning_rate": 8.351969598386755e-06, + "loss": 2.2989, + "step": 3401 + }, + { + "epoch": 2.27695781243467, + "grad_norm": 4.772448539733887, + "learning_rate": 8.337446399755766e-06, + "loss": 2.2088, + "step": 3402 + }, + { + "epoch": 2.277626792657942, + "grad_norm": 4.711516857147217, + "learning_rate": 8.322933311908917e-06, + "loss": 2.3112, + "step": 3403 + }, + { + "epoch": 2.278295772881214, + "grad_norm": 5.803802013397217, + "learning_rate": 8.30843034365272e-06, + "loss": 2.7157, + "step": 3404 + }, + { + "epoch": 2.2789647531044865, + "grad_norm": 5.59735107421875, + "learning_rate": 8.293937503787521e-06, + "loss": 2.346, + "step": 3405 + }, + { + "epoch": 2.2796337333277585, + "grad_norm": 6.138025283813477, + "learning_rate": 8.279454801107527e-06, + "loss": 2.5017, + "step": 3406 + }, + { + "epoch": 2.2803027135510305, + "grad_norm": 4.801734924316406, + "learning_rate": 8.264982244400793e-06, + "loss": 2.2864, + "step": 3407 + }, + { + "epoch": 2.280971693774303, + "grad_norm": 6.769373893737793, + "learning_rate": 8.25051984244923e-06, + "loss": 2.2275, + "step": 3408 + }, + { + "epoch": 2.281640673997575, + "grad_norm": 5.756905555725098, + "learning_rate": 8.236067604028563e-06, + "loss": 2.1619, + "step": 3409 + }, + { + "epoch": 2.282309654220847, + "grad_norm": 8.491303443908691, + "learning_rate": 8.221625537908384e-06, + "loss": 2.3861, + "step": 3410 + }, + { + "epoch": 2.2829786344441194, + "grad_norm": 7.500456809997559, + "learning_rate": 8.207193652852071e-06, + "loss": 2.5702, + "step": 3411 + }, + { + "epoch": 2.2836476146673914, + "grad_norm": 5.677613258361816, + "learning_rate": 8.19277195761687e-06, + "loss": 2.5359, + "step": 3412 + }, + { + "epoch": 2.2843165948906634, + "grad_norm": 8.363469123840332, + "learning_rate": 8.178360460953793e-06, + "loss": 2.3012, + "step": 3413 + }, + { + "epoch": 2.284985575113936, + "grad_norm": 4.777279853820801, + "learning_rate": 8.163959171607708e-06, + "loss": 2.2403, + "step": 3414 + }, + { + "epoch": 2.285654555337208, + "grad_norm": 8.776752471923828, + "learning_rate": 8.149568098317259e-06, + "loss": 2.4639, + "step": 3415 + }, + { + "epoch": 2.28632353556048, + "grad_norm": 6.5980401039123535, + "learning_rate": 8.135187249814916e-06, + "loss": 2.4976, + "step": 3416 + }, + { + "epoch": 2.2869925157837523, + "grad_norm": 7.707547187805176, + "learning_rate": 8.120816634826919e-06, + "loss": 2.4736, + "step": 3417 + }, + { + "epoch": 2.2876614960070243, + "grad_norm": 6.573399066925049, + "learning_rate": 8.106456262073326e-06, + "loss": 2.3439, + "step": 3418 + }, + { + "epoch": 2.2883304762302963, + "grad_norm": 6.642777442932129, + "learning_rate": 8.092106140267954e-06, + "loss": 2.4196, + "step": 3419 + }, + { + "epoch": 2.2889994564535687, + "grad_norm": 4.598809242248535, + "learning_rate": 8.077766278118414e-06, + "loss": 2.3227, + "step": 3420 + }, + { + "epoch": 2.2896684366768407, + "grad_norm": 5.574320316314697, + "learning_rate": 8.063436684326083e-06, + "loss": 2.2573, + "step": 3421 + }, + { + "epoch": 2.2903374169001127, + "grad_norm": 5.273810863494873, + "learning_rate": 8.049117367586126e-06, + "loss": 2.3305, + "step": 3422 + }, + { + "epoch": 2.291006397123385, + "grad_norm": 7.039480209350586, + "learning_rate": 8.03480833658744e-06, + "loss": 2.4374, + "step": 3423 + }, + { + "epoch": 2.291675377346657, + "grad_norm": 5.8367743492126465, + "learning_rate": 8.020509600012719e-06, + "loss": 2.19, + "step": 3424 + }, + { + "epoch": 2.292344357569929, + "grad_norm": 6.036994457244873, + "learning_rate": 8.006221166538372e-06, + "loss": 2.235, + "step": 3425 + }, + { + "epoch": 2.2930133377932016, + "grad_norm": 6.723577499389648, + "learning_rate": 7.99194304483459e-06, + "loss": 2.139, + "step": 3426 + }, + { + "epoch": 2.2936823180164736, + "grad_norm": 6.327938079833984, + "learning_rate": 7.977675243565288e-06, + "loss": 2.6306, + "step": 3427 + }, + { + "epoch": 2.2943512982397456, + "grad_norm": 6.290478229522705, + "learning_rate": 7.963417771388118e-06, + "loss": 2.4762, + "step": 3428 + }, + { + "epoch": 2.295020278463018, + "grad_norm": 5.47693395614624, + "learning_rate": 7.949170636954461e-06, + "loss": 2.4425, + "step": 3429 + }, + { + "epoch": 2.29568925868629, + "grad_norm": 7.121459484100342, + "learning_rate": 7.934933848909452e-06, + "loss": 2.5305, + "step": 3430 + }, + { + "epoch": 2.296358238909562, + "grad_norm": 5.080131530761719, + "learning_rate": 7.92070741589191e-06, + "loss": 2.015, + "step": 3431 + }, + { + "epoch": 2.2970272191328345, + "grad_norm": 7.147456645965576, + "learning_rate": 7.906491346534401e-06, + "loss": 2.3377, + "step": 3432 + }, + { + "epoch": 2.2976961993561065, + "grad_norm": 7.863618850708008, + "learning_rate": 7.892285649463188e-06, + "loss": 2.504, + "step": 3433 + }, + { + "epoch": 2.2983651795793785, + "grad_norm": 7.590881824493408, + "learning_rate": 7.878090333298242e-06, + "loss": 2.3579, + "step": 3434 + }, + { + "epoch": 2.299034159802651, + "grad_norm": 7.350370407104492, + "learning_rate": 7.863905406653221e-06, + "loss": 2.4175, + "step": 3435 + }, + { + "epoch": 2.299703140025923, + "grad_norm": 6.526215553283691, + "learning_rate": 7.84973087813552e-06, + "loss": 2.4259, + "step": 3436 + }, + { + "epoch": 2.300372120249195, + "grad_norm": 6.262446880340576, + "learning_rate": 7.835566756346171e-06, + "loss": 2.2656, + "step": 3437 + }, + { + "epoch": 2.3010411004724673, + "grad_norm": 5.917377471923828, + "learning_rate": 7.821413049879939e-06, + "loss": 2.5684, + "step": 3438 + }, + { + "epoch": 2.3017100806957393, + "grad_norm": 7.152966499328613, + "learning_rate": 7.807269767325232e-06, + "loss": 2.4683, + "step": 3439 + }, + { + "epoch": 2.3023790609190113, + "grad_norm": 4.026453018188477, + "learning_rate": 7.793136917264162e-06, + "loss": 2.2343, + "step": 3440 + }, + { + "epoch": 2.303048041142284, + "grad_norm": 6.001501083374023, + "learning_rate": 7.779014508272492e-06, + "loss": 2.2453, + "step": 3441 + }, + { + "epoch": 2.303717021365556, + "grad_norm": 5.8285980224609375, + "learning_rate": 7.764902548919654e-06, + "loss": 2.4493, + "step": 3442 + }, + { + "epoch": 2.304386001588828, + "grad_norm": 7.7580485343933105, + "learning_rate": 7.75080104776873e-06, + "loss": 2.5855, + "step": 3443 + }, + { + "epoch": 2.3050549818121002, + "grad_norm": 6.922948360443115, + "learning_rate": 7.73671001337648e-06, + "loss": 2.5874, + "step": 3444 + }, + { + "epoch": 2.305723962035372, + "grad_norm": 5.331365585327148, + "learning_rate": 7.722629454293288e-06, + "loss": 2.5301, + "step": 3445 + }, + { + "epoch": 2.3063929422586447, + "grad_norm": 6.9080634117126465, + "learning_rate": 7.708559379063204e-06, + "loss": 2.4968, + "step": 3446 + }, + { + "epoch": 2.3070619224819167, + "grad_norm": 4.506058692932129, + "learning_rate": 7.694499796223889e-06, + "loss": 2.241, + "step": 3447 + }, + { + "epoch": 2.3077309027051887, + "grad_norm": 5.730078220367432, + "learning_rate": 7.680450714306673e-06, + "loss": 2.4362, + "step": 3448 + }, + { + "epoch": 2.308399882928461, + "grad_norm": 5.697117805480957, + "learning_rate": 7.66641214183648e-06, + "loss": 2.295, + "step": 3449 + }, + { + "epoch": 2.309068863151733, + "grad_norm": 6.412027835845947, + "learning_rate": 7.652384087331873e-06, + "loss": 2.4617, + "step": 3450 + }, + { + "epoch": 2.309737843375005, + "grad_norm": 7.091875076293945, + "learning_rate": 7.638366559305023e-06, + "loss": 2.4985, + "step": 3451 + }, + { + "epoch": 2.3104068235982775, + "grad_norm": 7.024311542510986, + "learning_rate": 7.624359566261738e-06, + "loss": 2.396, + "step": 3452 + }, + { + "epoch": 2.3110758038215495, + "grad_norm": 6.934288501739502, + "learning_rate": 7.6103631167014e-06, + "loss": 2.418, + "step": 3453 + }, + { + "epoch": 2.3117447840448215, + "grad_norm": 6.523407459259033, + "learning_rate": 7.596377219117024e-06, + "loss": 2.2184, + "step": 3454 + }, + { + "epoch": 2.312413764268094, + "grad_norm": 6.243614196777344, + "learning_rate": 7.582401881995202e-06, + "loss": 2.5644, + "step": 3455 + }, + { + "epoch": 2.313082744491366, + "grad_norm": 6.896909236907959, + "learning_rate": 7.568437113816121e-06, + "loss": 2.4253, + "step": 3456 + }, + { + "epoch": 2.313751724714638, + "grad_norm": 6.0961761474609375, + "learning_rate": 7.554482923053552e-06, + "loss": 2.3516, + "step": 3457 + }, + { + "epoch": 2.3144207049379104, + "grad_norm": 5.688260555267334, + "learning_rate": 7.540539318174871e-06, + "loss": 2.4215, + "step": 3458 + }, + { + "epoch": 2.3150896851611824, + "grad_norm": 5.351925373077393, + "learning_rate": 7.526606307640993e-06, + "loss": 2.5199, + "step": 3459 + }, + { + "epoch": 2.3157586653844544, + "grad_norm": 5.184074878692627, + "learning_rate": 7.5126838999064415e-06, + "loss": 2.2493, + "step": 3460 + }, + { + "epoch": 2.316427645607727, + "grad_norm": 4.676231384277344, + "learning_rate": 7.498772103419274e-06, + "loss": 2.4986, + "step": 3461 + }, + { + "epoch": 2.317096625830999, + "grad_norm": 4.004918575286865, + "learning_rate": 7.484870926621138e-06, + "loss": 2.3022, + "step": 3462 + }, + { + "epoch": 2.317765606054271, + "grad_norm": 5.172874927520752, + "learning_rate": 7.470980377947218e-06, + "loss": 2.3933, + "step": 3463 + }, + { + "epoch": 2.3184345862775433, + "grad_norm": 5.958775997161865, + "learning_rate": 7.45710046582625e-06, + "loss": 2.5162, + "step": 3464 + }, + { + "epoch": 2.3191035665008153, + "grad_norm": 5.45428466796875, + "learning_rate": 7.443231198680517e-06, + "loss": 2.3389, + "step": 3465 + }, + { + "epoch": 2.3197725467240873, + "grad_norm": 5.295843124389648, + "learning_rate": 7.429372584925859e-06, + "loss": 2.317, + "step": 3466 + }, + { + "epoch": 2.3204415269473597, + "grad_norm": 5.812140941619873, + "learning_rate": 7.415524632971621e-06, + "loss": 2.4952, + "step": 3467 + }, + { + "epoch": 2.3211105071706317, + "grad_norm": 4.75517463684082, + "learning_rate": 7.401687351220718e-06, + "loss": 2.4702, + "step": 3468 + }, + { + "epoch": 2.321779487393904, + "grad_norm": 5.705507278442383, + "learning_rate": 7.387860748069547e-06, + "loss": 2.514, + "step": 3469 + }, + { + "epoch": 2.322448467617176, + "grad_norm": 4.852996826171875, + "learning_rate": 7.374044831908061e-06, + "loss": 2.3971, + "step": 3470 + }, + { + "epoch": 2.323117447840448, + "grad_norm": 8.440768241882324, + "learning_rate": 7.360239611119712e-06, + "loss": 2.3489, + "step": 3471 + }, + { + "epoch": 2.3237864280637206, + "grad_norm": 6.745054244995117, + "learning_rate": 7.346445094081458e-06, + "loss": 2.4147, + "step": 3472 + }, + { + "epoch": 2.3244554082869926, + "grad_norm": 5.936471939086914, + "learning_rate": 7.332661289163764e-06, + "loss": 2.2891, + "step": 3473 + }, + { + "epoch": 2.3251243885102646, + "grad_norm": 4.530374050140381, + "learning_rate": 7.318888204730612e-06, + "loss": 2.336, + "step": 3474 + }, + { + "epoch": 2.325793368733537, + "grad_norm": 5.543879985809326, + "learning_rate": 7.305125849139449e-06, + "loss": 2.3728, + "step": 3475 + }, + { + "epoch": 2.326462348956809, + "grad_norm": 5.113669395446777, + "learning_rate": 7.291374230741246e-06, + "loss": 2.5431, + "step": 3476 + }, + { + "epoch": 2.327131329180081, + "grad_norm": 5.893754005432129, + "learning_rate": 7.277633357880431e-06, + "loss": 2.2659, + "step": 3477 + }, + { + "epoch": 2.3278003094033535, + "grad_norm": 5.8386921882629395, + "learning_rate": 7.263903238894926e-06, + "loss": 2.3999, + "step": 3478 + }, + { + "epoch": 2.3284692896266255, + "grad_norm": 6.166362285614014, + "learning_rate": 7.25018388211611e-06, + "loss": 2.6455, + "step": 3479 + }, + { + "epoch": 2.3291382698498975, + "grad_norm": 6.047561168670654, + "learning_rate": 7.2364752958688635e-06, + "loss": 2.3912, + "step": 3480 + }, + { + "epoch": 2.32980725007317, + "grad_norm": 5.01530122756958, + "learning_rate": 7.222777488471497e-06, + "loss": 2.3128, + "step": 3481 + }, + { + "epoch": 2.330476230296442, + "grad_norm": 6.121397018432617, + "learning_rate": 7.2090904682358105e-06, + "loss": 2.4968, + "step": 3482 + }, + { + "epoch": 2.331145210519714, + "grad_norm": 4.667051792144775, + "learning_rate": 7.19541424346703e-06, + "loss": 2.2822, + "step": 3483 + }, + { + "epoch": 2.3318141907429863, + "grad_norm": 7.0047831535339355, + "learning_rate": 7.181748822463858e-06, + "loss": 2.4463, + "step": 3484 + }, + { + "epoch": 2.3324831709662583, + "grad_norm": 5.3871660232543945, + "learning_rate": 7.168094213518422e-06, + "loss": 2.4241, + "step": 3485 + }, + { + "epoch": 2.3331521511895303, + "grad_norm": 4.392592906951904, + "learning_rate": 7.154450424916298e-06, + "loss": 2.1489, + "step": 3486 + }, + { + "epoch": 2.333821131412803, + "grad_norm": 3.7943453788757324, + "learning_rate": 7.140817464936481e-06, + "loss": 2.2351, + "step": 3487 + }, + { + "epoch": 2.334490111636075, + "grad_norm": 5.024256229400635, + "learning_rate": 7.127195341851422e-06, + "loss": 2.4965, + "step": 3488 + }, + { + "epoch": 2.335159091859347, + "grad_norm": 5.761058807373047, + "learning_rate": 7.11358406392697e-06, + "loss": 2.2851, + "step": 3489 + }, + { + "epoch": 2.3358280720826192, + "grad_norm": 5.3968071937561035, + "learning_rate": 7.09998363942242e-06, + "loss": 2.4727, + "step": 3490 + }, + { + "epoch": 2.336497052305891, + "grad_norm": 6.065816402435303, + "learning_rate": 7.0863940765904494e-06, + "loss": 2.2239, + "step": 3491 + }, + { + "epoch": 2.337166032529163, + "grad_norm": 6.924764156341553, + "learning_rate": 7.072815383677176e-06, + "loss": 2.3033, + "step": 3492 + }, + { + "epoch": 2.3378350127524357, + "grad_norm": 6.719110488891602, + "learning_rate": 7.0592475689221e-06, + "loss": 2.679, + "step": 3493 + }, + { + "epoch": 2.3385039929757077, + "grad_norm": 6.690103530883789, + "learning_rate": 7.045690640558128e-06, + "loss": 2.7404, + "step": 3494 + }, + { + "epoch": 2.3391729731989797, + "grad_norm": 3.0337777137756348, + "learning_rate": 7.032144606811553e-06, + "loss": 2.0648, + "step": 3495 + }, + { + "epoch": 2.339841953422252, + "grad_norm": 5.208431720733643, + "learning_rate": 7.018609475902082e-06, + "loss": 2.2762, + "step": 3496 + }, + { + "epoch": 2.340510933645524, + "grad_norm": 6.43563985824585, + "learning_rate": 7.00508525604277e-06, + "loss": 2.3428, + "step": 3497 + }, + { + "epoch": 2.341179913868796, + "grad_norm": 4.2920756340026855, + "learning_rate": 6.9915719554400905e-06, + "loss": 2.1602, + "step": 3498 + }, + { + "epoch": 2.3418488940920685, + "grad_norm": 9.787894248962402, + "learning_rate": 6.978069582293859e-06, + "loss": 2.9342, + "step": 3499 + }, + { + "epoch": 2.3425178743153405, + "grad_norm": 6.664539337158203, + "learning_rate": 6.964578144797274e-06, + "loss": 2.4026, + "step": 3500 + }, + { + "epoch": 2.3431868545386125, + "grad_norm": 4.575187683105469, + "learning_rate": 6.951097651136889e-06, + "loss": 2.305, + "step": 3501 + }, + { + "epoch": 2.343855834761885, + "grad_norm": 5.636954307556152, + "learning_rate": 6.937628109492642e-06, + "loss": 2.5249, + "step": 3502 + }, + { + "epoch": 2.344524814985157, + "grad_norm": 5.097446918487549, + "learning_rate": 6.924169528037785e-06, + "loss": 2.4038, + "step": 3503 + }, + { + "epoch": 2.345193795208429, + "grad_norm": 6.356112003326416, + "learning_rate": 6.910721914938967e-06, + "loss": 2.3825, + "step": 3504 + }, + { + "epoch": 2.3458627754317014, + "grad_norm": 6.767902851104736, + "learning_rate": 6.8972852783561335e-06, + "loss": 2.2508, + "step": 3505 + }, + { + "epoch": 2.3465317556549734, + "grad_norm": 8.132856369018555, + "learning_rate": 6.883859626442612e-06, + "loss": 2.5005, + "step": 3506 + }, + { + "epoch": 2.3472007358782454, + "grad_norm": 6.069066524505615, + "learning_rate": 6.8704449673450385e-06, + "loss": 2.3835, + "step": 3507 + }, + { + "epoch": 2.347869716101518, + "grad_norm": 3.809129238128662, + "learning_rate": 6.85704130920338e-06, + "loss": 2.1535, + "step": 3508 + }, + { + "epoch": 2.34853869632479, + "grad_norm": 6.508683681488037, + "learning_rate": 6.843648660150931e-06, + "loss": 2.2884, + "step": 3509 + }, + { + "epoch": 2.349207676548062, + "grad_norm": 6.483622074127197, + "learning_rate": 6.8302670283143186e-06, + "loss": 2.4074, + "step": 3510 + }, + { + "epoch": 2.3498766567713343, + "grad_norm": 4.865424633026123, + "learning_rate": 6.816896421813462e-06, + "loss": 2.3473, + "step": 3511 + }, + { + "epoch": 2.3505456369946063, + "grad_norm": 5.602460861206055, + "learning_rate": 6.803536848761618e-06, + "loss": 2.5812, + "step": 3512 + }, + { + "epoch": 2.3512146172178783, + "grad_norm": 6.538539886474609, + "learning_rate": 6.790188317265317e-06, + "loss": 2.5944, + "step": 3513 + }, + { + "epoch": 2.3518835974411507, + "grad_norm": 7.687674045562744, + "learning_rate": 6.776850835424417e-06, + "loss": 2.4731, + "step": 3514 + }, + { + "epoch": 2.3525525776644227, + "grad_norm": 6.80280876159668, + "learning_rate": 6.763524411332056e-06, + "loss": 2.6157, + "step": 3515 + }, + { + "epoch": 2.3532215578876947, + "grad_norm": 5.034938335418701, + "learning_rate": 6.750209053074666e-06, + "loss": 2.3397, + "step": 3516 + }, + { + "epoch": 2.353890538110967, + "grad_norm": 6.163715839385986, + "learning_rate": 6.736904768731953e-06, + "loss": 2.5476, + "step": 3517 + }, + { + "epoch": 2.354559518334239, + "grad_norm": 6.720538139343262, + "learning_rate": 6.723611566376928e-06, + "loss": 2.735, + "step": 3518 + }, + { + "epoch": 2.355228498557511, + "grad_norm": 6.908609390258789, + "learning_rate": 6.710329454075853e-06, + "loss": 2.6974, + "step": 3519 + }, + { + "epoch": 2.3558974787807836, + "grad_norm": 5.462390422821045, + "learning_rate": 6.697058439888285e-06, + "loss": 2.5185, + "step": 3520 + }, + { + "epoch": 2.3565664590040556, + "grad_norm": 4.990724563598633, + "learning_rate": 6.683798531867022e-06, + "loss": 2.3428, + "step": 3521 + }, + { + "epoch": 2.357235439227328, + "grad_norm": 6.1307172775268555, + "learning_rate": 6.6705497380581375e-06, + "loss": 2.4776, + "step": 3522 + }, + { + "epoch": 2.3579044194506, + "grad_norm": 6.500049114227295, + "learning_rate": 6.657312066500948e-06, + "loss": 2.4623, + "step": 3523 + }, + { + "epoch": 2.358573399673872, + "grad_norm": 6.154669761657715, + "learning_rate": 6.644085525228047e-06, + "loss": 2.8649, + "step": 3524 + }, + { + "epoch": 2.3592423798971445, + "grad_norm": 5.445047378540039, + "learning_rate": 6.63087012226524e-06, + "loss": 2.6323, + "step": 3525 + }, + { + "epoch": 2.3599113601204165, + "grad_norm": 4.449127197265625, + "learning_rate": 6.617665865631606e-06, + "loss": 2.2545, + "step": 3526 + }, + { + "epoch": 2.3605803403436885, + "grad_norm": 9.050385475158691, + "learning_rate": 6.604472763339431e-06, + "loss": 2.5667, + "step": 3527 + }, + { + "epoch": 2.361249320566961, + "grad_norm": 9.080531120300293, + "learning_rate": 6.5912908233942635e-06, + "loss": 2.5164, + "step": 3528 + }, + { + "epoch": 2.361918300790233, + "grad_norm": 6.334784507751465, + "learning_rate": 6.578120053794856e-06, + "loss": 2.4306, + "step": 3529 + }, + { + "epoch": 2.362587281013505, + "grad_norm": 5.895359992980957, + "learning_rate": 6.5649604625331875e-06, + "loss": 2.2352, + "step": 3530 + }, + { + "epoch": 2.3632562612367773, + "grad_norm": 5.131560802459717, + "learning_rate": 6.551812057594447e-06, + "loss": 2.6061, + "step": 3531 + }, + { + "epoch": 2.3639252414600493, + "grad_norm": 4.937697410583496, + "learning_rate": 6.538674846957063e-06, + "loss": 2.2767, + "step": 3532 + }, + { + "epoch": 2.3645942216833213, + "grad_norm": 5.159997463226318, + "learning_rate": 6.525548838592635e-06, + "loss": 2.4506, + "step": 3533 + }, + { + "epoch": 2.365263201906594, + "grad_norm": 7.795278549194336, + "learning_rate": 6.512434040465998e-06, + "loss": 2.4397, + "step": 3534 + }, + { + "epoch": 2.365932182129866, + "grad_norm": 8.078782081604004, + "learning_rate": 6.499330460535155e-06, + "loss": 2.3376, + "step": 3535 + }, + { + "epoch": 2.3666011623531378, + "grad_norm": 7.0011138916015625, + "learning_rate": 6.486238106751332e-06, + "loss": 2.4338, + "step": 3536 + }, + { + "epoch": 2.36727014257641, + "grad_norm": 4.44496488571167, + "learning_rate": 6.4731569870589205e-06, + "loss": 2.1346, + "step": 3537 + }, + { + "epoch": 2.367939122799682, + "grad_norm": 5.2702460289001465, + "learning_rate": 6.460087109395499e-06, + "loss": 2.3183, + "step": 3538 + }, + { + "epoch": 2.368608103022954, + "grad_norm": 6.086611747741699, + "learning_rate": 6.447028481691822e-06, + "loss": 2.6677, + "step": 3539 + }, + { + "epoch": 2.3692770832462267, + "grad_norm": 6.076868534088135, + "learning_rate": 6.4339811118718395e-06, + "loss": 2.3818, + "step": 3540 + }, + { + "epoch": 2.3699460634694987, + "grad_norm": 5.417122840881348, + "learning_rate": 6.420945007852635e-06, + "loss": 2.3912, + "step": 3541 + }, + { + "epoch": 2.3706150436927707, + "grad_norm": 5.631089210510254, + "learning_rate": 6.4079201775444905e-06, + "loss": 2.5108, + "step": 3542 + }, + { + "epoch": 2.371284023916043, + "grad_norm": 6.386271953582764, + "learning_rate": 6.394906628850827e-06, + "loss": 2.3322, + "step": 3543 + }, + { + "epoch": 2.371953004139315, + "grad_norm": 5.839629650115967, + "learning_rate": 6.381904369668221e-06, + "loss": 2.3244, + "step": 3544 + }, + { + "epoch": 2.372621984362587, + "grad_norm": 6.385519981384277, + "learning_rate": 6.368913407886396e-06, + "loss": 2.5436, + "step": 3545 + }, + { + "epoch": 2.3732909645858595, + "grad_norm": 5.523813247680664, + "learning_rate": 6.355933751388241e-06, + "loss": 2.3619, + "step": 3546 + }, + { + "epoch": 2.3739599448091315, + "grad_norm": 7.015672206878662, + "learning_rate": 6.342965408049753e-06, + "loss": 2.5162, + "step": 3547 + }, + { + "epoch": 2.374628925032404, + "grad_norm": 5.8986921310424805, + "learning_rate": 6.3300083857400965e-06, + "loss": 2.2948, + "step": 3548 + }, + { + "epoch": 2.375297905255676, + "grad_norm": 6.848622798919678, + "learning_rate": 6.317062692321538e-06, + "loss": 2.5328, + "step": 3549 + }, + { + "epoch": 2.375966885478948, + "grad_norm": 5.186776638031006, + "learning_rate": 6.3041283356494976e-06, + "loss": 2.3067, + "step": 3550 + }, + { + "epoch": 2.3766358657022204, + "grad_norm": 6.804257869720459, + "learning_rate": 6.291205323572491e-06, + "loss": 2.4856, + "step": 3551 + }, + { + "epoch": 2.3773048459254924, + "grad_norm": 5.789464950561523, + "learning_rate": 6.278293663932164e-06, + "loss": 2.3167, + "step": 3552 + }, + { + "epoch": 2.3779738261487644, + "grad_norm": 6.576089382171631, + "learning_rate": 6.265393364563263e-06, + "loss": 2.4364, + "step": 3553 + }, + { + "epoch": 2.378642806372037, + "grad_norm": 8.175675392150879, + "learning_rate": 6.25250443329366e-06, + "loss": 2.418, + "step": 3554 + }, + { + "epoch": 2.379311786595309, + "grad_norm": 4.875731945037842, + "learning_rate": 6.239626877944305e-06, + "loss": 2.3146, + "step": 3555 + }, + { + "epoch": 2.379980766818581, + "grad_norm": 7.403187274932861, + "learning_rate": 6.226760706329271e-06, + "loss": 2.4863, + "step": 3556 + }, + { + "epoch": 2.3806497470418533, + "grad_norm": 3.971231698989868, + "learning_rate": 6.213905926255698e-06, + "loss": 2.3292, + "step": 3557 + }, + { + "epoch": 2.3813187272651253, + "grad_norm": 5.4843902587890625, + "learning_rate": 6.201062545523842e-06, + "loss": 2.1338, + "step": 3558 + }, + { + "epoch": 2.3819877074883973, + "grad_norm": 4.053069591522217, + "learning_rate": 6.1882305719269974e-06, + "loss": 2.339, + "step": 3559 + }, + { + "epoch": 2.3826566877116697, + "grad_norm": 6.378960132598877, + "learning_rate": 6.17541001325159e-06, + "loss": 2.5899, + "step": 3560 + }, + { + "epoch": 2.3833256679349417, + "grad_norm": 6.372867584228516, + "learning_rate": 6.162600877277078e-06, + "loss": 2.4973, + "step": 3561 + }, + { + "epoch": 2.3839946481582137, + "grad_norm": 4.501971244812012, + "learning_rate": 6.149803171776014e-06, + "loss": 2.2228, + "step": 3562 + }, + { + "epoch": 2.384663628381486, + "grad_norm": 5.696587085723877, + "learning_rate": 6.137016904513998e-06, + "loss": 2.3054, + "step": 3563 + }, + { + "epoch": 2.385332608604758, + "grad_norm": 7.66678524017334, + "learning_rate": 6.124242083249704e-06, + "loss": 2.4946, + "step": 3564 + }, + { + "epoch": 2.38600158882803, + "grad_norm": 5.758197784423828, + "learning_rate": 6.1114787157348475e-06, + "loss": 2.3556, + "step": 3565 + }, + { + "epoch": 2.3866705690513026, + "grad_norm": 5.065873146057129, + "learning_rate": 6.0987268097142e-06, + "loss": 2.4608, + "step": 3566 + }, + { + "epoch": 2.3873395492745746, + "grad_norm": 4.097575664520264, + "learning_rate": 6.08598637292557e-06, + "loss": 2.23, + "step": 3567 + }, + { + "epoch": 2.3880085294978466, + "grad_norm": 5.329041957855225, + "learning_rate": 6.073257413099826e-06, + "loss": 2.3445, + "step": 3568 + }, + { + "epoch": 2.388677509721119, + "grad_norm": 6.089375019073486, + "learning_rate": 6.060539937960846e-06, + "loss": 2.4183, + "step": 3569 + }, + { + "epoch": 2.389346489944391, + "grad_norm": 6.007900238037109, + "learning_rate": 6.047833955225571e-06, + "loss": 2.5968, + "step": 3570 + }, + { + "epoch": 2.390015470167663, + "grad_norm": 6.9910454750061035, + "learning_rate": 6.0351394726039346e-06, + "loss": 2.4901, + "step": 3571 + }, + { + "epoch": 2.3906844503909355, + "grad_norm": 9.373553276062012, + "learning_rate": 6.02245649779892e-06, + "loss": 2.2058, + "step": 3572 + }, + { + "epoch": 2.3913534306142075, + "grad_norm": 4.8613715171813965, + "learning_rate": 6.00978503850651e-06, + "loss": 2.4555, + "step": 3573 + }, + { + "epoch": 2.3920224108374795, + "grad_norm": 5.231014728546143, + "learning_rate": 5.997125102415707e-06, + "loss": 2.3205, + "step": 3574 + }, + { + "epoch": 2.392691391060752, + "grad_norm": 4.94096565246582, + "learning_rate": 5.984476697208513e-06, + "loss": 2.2283, + "step": 3575 + }, + { + "epoch": 2.393360371284024, + "grad_norm": 4.658742904663086, + "learning_rate": 5.97183983055995e-06, + "loss": 2.4525, + "step": 3576 + }, + { + "epoch": 2.394029351507296, + "grad_norm": 7.611774444580078, + "learning_rate": 5.959214510138017e-06, + "loss": 2.3272, + "step": 3577 + }, + { + "epoch": 2.3946983317305683, + "grad_norm": 6.290035724639893, + "learning_rate": 5.946600743603731e-06, + "loss": 2.3296, + "step": 3578 + }, + { + "epoch": 2.3953673119538403, + "grad_norm": 6.895848274230957, + "learning_rate": 5.9339985386110714e-06, + "loss": 2.4912, + "step": 3579 + }, + { + "epoch": 2.3960362921771123, + "grad_norm": 4.859911918640137, + "learning_rate": 5.921407902807038e-06, + "loss": 2.5304, + "step": 3580 + }, + { + "epoch": 2.396705272400385, + "grad_norm": 5.87416934967041, + "learning_rate": 5.90882884383156e-06, + "loss": 2.2073, + "step": 3581 + }, + { + "epoch": 2.3973742526236568, + "grad_norm": 4.920173645019531, + "learning_rate": 5.896261369317588e-06, + "loss": 2.4947, + "step": 3582 + }, + { + "epoch": 2.3980432328469288, + "grad_norm": 5.7263312339782715, + "learning_rate": 5.883705486891014e-06, + "loss": 2.5104, + "step": 3583 + }, + { + "epoch": 2.398712213070201, + "grad_norm": 5.924510478973389, + "learning_rate": 5.871161204170722e-06, + "loss": 2.2627, + "step": 3584 + }, + { + "epoch": 2.399381193293473, + "grad_norm": 7.603013038635254, + "learning_rate": 5.8586285287685285e-06, + "loss": 2.5297, + "step": 3585 + }, + { + "epoch": 2.400050173516745, + "grad_norm": 4.393563747406006, + "learning_rate": 5.846107468289233e-06, + "loss": 2.2998, + "step": 3586 + }, + { + "epoch": 2.4007191537400177, + "grad_norm": 5.311534881591797, + "learning_rate": 5.833598030330567e-06, + "loss": 2.3931, + "step": 3587 + }, + { + "epoch": 2.4013881339632897, + "grad_norm": 5.727334976196289, + "learning_rate": 5.821100222483222e-06, + "loss": 2.3992, + "step": 3588 + }, + { + "epoch": 2.4020571141865616, + "grad_norm": 6.384558200836182, + "learning_rate": 5.808614052330816e-06, + "loss": 2.3456, + "step": 3589 + }, + { + "epoch": 2.402726094409834, + "grad_norm": 4.6034836769104, + "learning_rate": 5.796139527449931e-06, + "loss": 2.3396, + "step": 3590 + }, + { + "epoch": 2.403395074633106, + "grad_norm": 4.576412200927734, + "learning_rate": 5.783676655410059e-06, + "loss": 2.2839, + "step": 3591 + }, + { + "epoch": 2.404064054856378, + "grad_norm": 8.068883895874023, + "learning_rate": 5.771225443773643e-06, + "loss": 2.4558, + "step": 3592 + }, + { + "epoch": 2.4047330350796505, + "grad_norm": 5.36494255065918, + "learning_rate": 5.758785900096023e-06, + "loss": 2.5018, + "step": 3593 + }, + { + "epoch": 2.4054020153029225, + "grad_norm": 5.629166126251221, + "learning_rate": 5.7463580319254865e-06, + "loss": 2.4106, + "step": 3594 + }, + { + "epoch": 2.4060709955261945, + "grad_norm": 6.478418350219727, + "learning_rate": 5.733941846803223e-06, + "loss": 2.4799, + "step": 3595 + }, + { + "epoch": 2.406739975749467, + "grad_norm": 5.750023365020752, + "learning_rate": 5.721537352263328e-06, + "loss": 2.4063, + "step": 3596 + }, + { + "epoch": 2.407408955972739, + "grad_norm": 6.4178643226623535, + "learning_rate": 5.709144555832807e-06, + "loss": 2.1961, + "step": 3597 + }, + { + "epoch": 2.408077936196011, + "grad_norm": 6.1644721031188965, + "learning_rate": 5.696763465031579e-06, + "loss": 2.4067, + "step": 3598 + }, + { + "epoch": 2.4087469164192834, + "grad_norm": 4.278310775756836, + "learning_rate": 5.684394087372438e-06, + "loss": 2.1201, + "step": 3599 + }, + { + "epoch": 2.4094158966425554, + "grad_norm": 7.321680545806885, + "learning_rate": 5.672036430361096e-06, + "loss": 2.3585, + "step": 3600 + }, + { + "epoch": 2.410084876865828, + "grad_norm": 5.392411231994629, + "learning_rate": 5.659690501496135e-06, + "loss": 2.4512, + "step": 3601 + }, + { + "epoch": 2.4107538570891, + "grad_norm": 7.5044403076171875, + "learning_rate": 5.6473563082690226e-06, + "loss": 2.3468, + "step": 3602 + }, + { + "epoch": 2.411422837312372, + "grad_norm": 5.9740753173828125, + "learning_rate": 5.635033858164102e-06, + "loss": 2.4891, + "step": 3603 + }, + { + "epoch": 2.4120918175356443, + "grad_norm": 4.555912971496582, + "learning_rate": 5.622723158658608e-06, + "loss": 2.2936, + "step": 3604 + }, + { + "epoch": 2.4127607977589163, + "grad_norm": 3.9958879947662354, + "learning_rate": 5.610424217222624e-06, + "loss": 2.3423, + "step": 3605 + }, + { + "epoch": 2.4134297779821883, + "grad_norm": 7.472552299499512, + "learning_rate": 5.598137041319118e-06, + "loss": 2.4645, + "step": 3606 + }, + { + "epoch": 2.4140987582054607, + "grad_norm": 5.533159255981445, + "learning_rate": 5.5858616384039e-06, + "loss": 2.4077, + "step": 3607 + }, + { + "epoch": 2.4147677384287327, + "grad_norm": 6.143260478973389, + "learning_rate": 5.573598015925655e-06, + "loss": 2.4896, + "step": 3608 + }, + { + "epoch": 2.4154367186520047, + "grad_norm": 7.7457275390625, + "learning_rate": 5.561346181325902e-06, + "loss": 2.3289, + "step": 3609 + }, + { + "epoch": 2.416105698875277, + "grad_norm": 6.496987342834473, + "learning_rate": 5.549106142039018e-06, + "loss": 2.3358, + "step": 3610 + }, + { + "epoch": 2.416774679098549, + "grad_norm": 5.537322044372559, + "learning_rate": 5.53687790549221e-06, + "loss": 2.4855, + "step": 3611 + }, + { + "epoch": 2.417443659321821, + "grad_norm": 5.359654903411865, + "learning_rate": 5.52466147910555e-06, + "loss": 2.3864, + "step": 3612 + }, + { + "epoch": 2.4181126395450936, + "grad_norm": 4.6059041023254395, + "learning_rate": 5.51245687029191e-06, + "loss": 2.5526, + "step": 3613 + }, + { + "epoch": 2.4187816197683656, + "grad_norm": 4.178314208984375, + "learning_rate": 5.500264086457024e-06, + "loss": 2.5011, + "step": 3614 + }, + { + "epoch": 2.4194505999916376, + "grad_norm": 6.147581577301025, + "learning_rate": 5.488083134999417e-06, + "loss": 2.6891, + "step": 3615 + }, + { + "epoch": 2.42011958021491, + "grad_norm": 5.813813209533691, + "learning_rate": 5.4759140233104665e-06, + "loss": 2.637, + "step": 3616 + }, + { + "epoch": 2.420788560438182, + "grad_norm": 5.292234897613525, + "learning_rate": 5.463756758774344e-06, + "loss": 2.3584, + "step": 3617 + }, + { + "epoch": 2.421457540661454, + "grad_norm": 5.2962822914123535, + "learning_rate": 5.4516113487680375e-06, + "loss": 2.4684, + "step": 3618 + }, + { + "epoch": 2.4221265208847265, + "grad_norm": 6.884120941162109, + "learning_rate": 5.4394778006613355e-06, + "loss": 2.4998, + "step": 3619 + }, + { + "epoch": 2.4227955011079985, + "grad_norm": 4.518246173858643, + "learning_rate": 5.427356121816854e-06, + "loss": 2.1073, + "step": 3620 + }, + { + "epoch": 2.4234644813312705, + "grad_norm": 7.931025981903076, + "learning_rate": 5.415246319589973e-06, + "loss": 2.478, + "step": 3621 + }, + { + "epoch": 2.424133461554543, + "grad_norm": 5.490780830383301, + "learning_rate": 5.403148401328892e-06, + "loss": 2.3397, + "step": 3622 + }, + { + "epoch": 2.424802441777815, + "grad_norm": 6.010279655456543, + "learning_rate": 5.391062374374587e-06, + "loss": 2.4276, + "step": 3623 + }, + { + "epoch": 2.4254714220010873, + "grad_norm": 6.396979808807373, + "learning_rate": 5.378988246060817e-06, + "loss": 2.3688, + "step": 3624 + }, + { + "epoch": 2.4261404022243593, + "grad_norm": 5.824526309967041, + "learning_rate": 5.366926023714119e-06, + "loss": 2.291, + "step": 3625 + }, + { + "epoch": 2.4268093824476313, + "grad_norm": 5.782132625579834, + "learning_rate": 5.3548757146538255e-06, + "loss": 2.5997, + "step": 3626 + }, + { + "epoch": 2.4274783626709038, + "grad_norm": 4.868682384490967, + "learning_rate": 5.342837326192013e-06, + "loss": 2.2591, + "step": 3627 + }, + { + "epoch": 2.4281473428941758, + "grad_norm": 4.649059295654297, + "learning_rate": 5.330810865633546e-06, + "loss": 2.3571, + "step": 3628 + }, + { + "epoch": 2.4288163231174478, + "grad_norm": 6.28415060043335, + "learning_rate": 5.318796340276031e-06, + "loss": 2.4075, + "step": 3629 + }, + { + "epoch": 2.42948530334072, + "grad_norm": 7.052877426147461, + "learning_rate": 5.306793757409856e-06, + "loss": 2.6753, + "step": 3630 + }, + { + "epoch": 2.430154283563992, + "grad_norm": 6.091507434844971, + "learning_rate": 5.294803124318146e-06, + "loss": 2.5088, + "step": 3631 + }, + { + "epoch": 2.430823263787264, + "grad_norm": 6.348925590515137, + "learning_rate": 5.282824448276777e-06, + "loss": 2.3445, + "step": 3632 + }, + { + "epoch": 2.4314922440105367, + "grad_norm": 6.2545623779296875, + "learning_rate": 5.270857736554366e-06, + "loss": 2.348, + "step": 3633 + }, + { + "epoch": 2.4321612242338086, + "grad_norm": 6.225286960601807, + "learning_rate": 5.258902996412285e-06, + "loss": 2.2498, + "step": 3634 + }, + { + "epoch": 2.4328302044570806, + "grad_norm": 4.817884922027588, + "learning_rate": 5.24696023510462e-06, + "loss": 2.2754, + "step": 3635 + }, + { + "epoch": 2.433499184680353, + "grad_norm": 5.229065895080566, + "learning_rate": 5.235029459878218e-06, + "loss": 2.6888, + "step": 3636 + }, + { + "epoch": 2.434168164903625, + "grad_norm": 7.9895172119140625, + "learning_rate": 5.223110677972617e-06, + "loss": 2.4893, + "step": 3637 + }, + { + "epoch": 2.434837145126897, + "grad_norm": 6.197998046875, + "learning_rate": 5.211203896620109e-06, + "loss": 2.5116, + "step": 3638 + }, + { + "epoch": 2.4355061253501695, + "grad_norm": 6.2058258056640625, + "learning_rate": 5.199309123045687e-06, + "loss": 2.459, + "step": 3639 + }, + { + "epoch": 2.4361751055734415, + "grad_norm": 5.2085795402526855, + "learning_rate": 5.18742636446706e-06, + "loss": 2.3802, + "step": 3640 + }, + { + "epoch": 2.4368440857967135, + "grad_norm": 5.660262584686279, + "learning_rate": 5.175555628094642e-06, + "loss": 2.4713, + "step": 3641 + }, + { + "epoch": 2.437513066019986, + "grad_norm": 7.204103946685791, + "learning_rate": 5.163696921131569e-06, + "loss": 2.4375, + "step": 3642 + }, + { + "epoch": 2.438182046243258, + "grad_norm": 5.309956073760986, + "learning_rate": 5.151850250773654e-06, + "loss": 2.3434, + "step": 3643 + }, + { + "epoch": 2.43885102646653, + "grad_norm": 7.497796058654785, + "learning_rate": 5.140015624209435e-06, + "loss": 2.5722, + "step": 3644 + }, + { + "epoch": 2.4395200066898024, + "grad_norm": 6.94460391998291, + "learning_rate": 5.128193048620114e-06, + "loss": 2.3324, + "step": 3645 + }, + { + "epoch": 2.4401889869130744, + "grad_norm": 6.5837202072143555, + "learning_rate": 5.116382531179595e-06, + "loss": 2.471, + "step": 3646 + }, + { + "epoch": 2.4408579671363464, + "grad_norm": 6.94624662399292, + "learning_rate": 5.1045840790544555e-06, + "loss": 2.562, + "step": 3647 + }, + { + "epoch": 2.441526947359619, + "grad_norm": 6.566301345825195, + "learning_rate": 5.09279769940397e-06, + "loss": 2.3742, + "step": 3648 + }, + { + "epoch": 2.442195927582891, + "grad_norm": 4.755611896514893, + "learning_rate": 5.081023399380064e-06, + "loss": 2.3707, + "step": 3649 + }, + { + "epoch": 2.442864907806163, + "grad_norm": 6.090338706970215, + "learning_rate": 5.0692611861273585e-06, + "loss": 2.395, + "step": 3650 + }, + { + "epoch": 2.4435338880294353, + "grad_norm": 6.200554847717285, + "learning_rate": 5.05751106678311e-06, + "loss": 2.2569, + "step": 3651 + }, + { + "epoch": 2.4442028682527073, + "grad_norm": 6.099989414215088, + "learning_rate": 5.045773048477273e-06, + "loss": 2.5628, + "step": 3652 + }, + { + "epoch": 2.4448718484759793, + "grad_norm": 5.186051845550537, + "learning_rate": 5.034047138332429e-06, + "loss": 2.3545, + "step": 3653 + }, + { + "epoch": 2.4455408286992517, + "grad_norm": 6.636256217956543, + "learning_rate": 5.022333343463822e-06, + "loss": 2.4531, + "step": 3654 + }, + { + "epoch": 2.4462098089225237, + "grad_norm": 6.151393413543701, + "learning_rate": 5.01063167097934e-06, + "loss": 2.5746, + "step": 3655 + }, + { + "epoch": 2.4468787891457957, + "grad_norm": 5.093237400054932, + "learning_rate": 4.998942127979533e-06, + "loss": 2.2485, + "step": 3656 + }, + { + "epoch": 2.447547769369068, + "grad_norm": 6.741740703582764, + "learning_rate": 4.987264721557564e-06, + "loss": 2.4145, + "step": 3657 + }, + { + "epoch": 2.44821674959234, + "grad_norm": 5.800689697265625, + "learning_rate": 4.975599458799263e-06, + "loss": 2.2672, + "step": 3658 + }, + { + "epoch": 2.448885729815612, + "grad_norm": 6.397168159484863, + "learning_rate": 4.963946346783055e-06, + "loss": 2.498, + "step": 3659 + }, + { + "epoch": 2.4495547100388846, + "grad_norm": 6.375482082366943, + "learning_rate": 4.952305392580029e-06, + "loss": 2.3123, + "step": 3660 + }, + { + "epoch": 2.4502236902621566, + "grad_norm": 7.884256362915039, + "learning_rate": 4.940676603253872e-06, + "loss": 2.5132, + "step": 3661 + }, + { + "epoch": 2.4508926704854286, + "grad_norm": 5.571125030517578, + "learning_rate": 4.929059985860893e-06, + "loss": 2.2322, + "step": 3662 + }, + { + "epoch": 2.451561650708701, + "grad_norm": 5.891666889190674, + "learning_rate": 4.917455547450011e-06, + "loss": 2.3724, + "step": 3663 + }, + { + "epoch": 2.452230630931973, + "grad_norm": 6.4692912101745605, + "learning_rate": 4.905863295062782e-06, + "loss": 2.4456, + "step": 3664 + }, + { + "epoch": 2.452899611155245, + "grad_norm": 4.736913204193115, + "learning_rate": 4.8942832357333266e-06, + "loss": 2.1508, + "step": 3665 + }, + { + "epoch": 2.4535685913785175, + "grad_norm": 6.954343795776367, + "learning_rate": 4.882715376488406e-06, + "loss": 2.5498, + "step": 3666 + }, + { + "epoch": 2.4542375716017895, + "grad_norm": 5.342285633087158, + "learning_rate": 4.871159724347351e-06, + "loss": 2.3468, + "step": 3667 + }, + { + "epoch": 2.4549065518250615, + "grad_norm": 4.8141398429870605, + "learning_rate": 4.859616286322094e-06, + "loss": 2.5519, + "step": 3668 + }, + { + "epoch": 2.455575532048334, + "grad_norm": 5.9077467918396, + "learning_rate": 4.848085069417152e-06, + "loss": 2.3814, + "step": 3669 + }, + { + "epoch": 2.456244512271606, + "grad_norm": 6.883209228515625, + "learning_rate": 4.836566080629643e-06, + "loss": 2.361, + "step": 3670 + }, + { + "epoch": 2.456913492494878, + "grad_norm": 6.057150840759277, + "learning_rate": 4.825059326949238e-06, + "loss": 2.5476, + "step": 3671 + }, + { + "epoch": 2.4575824727181503, + "grad_norm": 6.204245567321777, + "learning_rate": 4.813564815358213e-06, + "loss": 2.4882, + "step": 3672 + }, + { + "epoch": 2.4582514529414223, + "grad_norm": 6.64919900894165, + "learning_rate": 4.802082552831383e-06, + "loss": 2.5416, + "step": 3673 + }, + { + "epoch": 2.4589204331646943, + "grad_norm": 5.215975761413574, + "learning_rate": 4.790612546336168e-06, + "loss": 2.6288, + "step": 3674 + }, + { + "epoch": 2.4595894133879668, + "grad_norm": 5.347267150878906, + "learning_rate": 4.7791548028325225e-06, + "loss": 2.3693, + "step": 3675 + }, + { + "epoch": 2.4602583936112388, + "grad_norm": 6.27996826171875, + "learning_rate": 4.767709329272965e-06, + "loss": 2.4477, + "step": 3676 + }, + { + "epoch": 2.460927373834511, + "grad_norm": 5.72952938079834, + "learning_rate": 4.7562761326025715e-06, + "loss": 2.4017, + "step": 3677 + }, + { + "epoch": 2.461596354057783, + "grad_norm": 4.07435941696167, + "learning_rate": 4.744855219758976e-06, + "loss": 2.4521, + "step": 3678 + }, + { + "epoch": 2.462265334281055, + "grad_norm": 4.724560260772705, + "learning_rate": 4.733446597672345e-06, + "loss": 2.5419, + "step": 3679 + }, + { + "epoch": 2.4629343145043276, + "grad_norm": 6.7724080085754395, + "learning_rate": 4.7220502732653996e-06, + "loss": 2.6609, + "step": 3680 + }, + { + "epoch": 2.4636032947275996, + "grad_norm": 5.6218414306640625, + "learning_rate": 4.710666253453385e-06, + "loss": 2.4525, + "step": 3681 + }, + { + "epoch": 2.4642722749508716, + "grad_norm": 5.056951999664307, + "learning_rate": 4.699294545144103e-06, + "loss": 2.2776, + "step": 3682 + }, + { + "epoch": 2.464941255174144, + "grad_norm": 5.5675368309021, + "learning_rate": 4.687935155237857e-06, + "loss": 2.3139, + "step": 3683 + }, + { + "epoch": 2.465610235397416, + "grad_norm": 7.240647315979004, + "learning_rate": 4.676588090627493e-06, + "loss": 2.5412, + "step": 3684 + }, + { + "epoch": 2.466279215620688, + "grad_norm": 6.0798726081848145, + "learning_rate": 4.665253358198365e-06, + "loss": 2.2896, + "step": 3685 + }, + { + "epoch": 2.4669481958439605, + "grad_norm": 7.622159004211426, + "learning_rate": 4.653930964828368e-06, + "loss": 2.326, + "step": 3686 + }, + { + "epoch": 2.4676171760672325, + "grad_norm": 5.304567813873291, + "learning_rate": 4.642620917387877e-06, + "loss": 2.2201, + "step": 3687 + }, + { + "epoch": 2.4682861562905045, + "grad_norm": 5.0369391441345215, + "learning_rate": 4.6313232227398085e-06, + "loss": 2.2971, + "step": 3688 + }, + { + "epoch": 2.468955136513777, + "grad_norm": 5.348989486694336, + "learning_rate": 4.6200378877395586e-06, + "loss": 2.2328, + "step": 3689 + }, + { + "epoch": 2.469624116737049, + "grad_norm": 6.873260021209717, + "learning_rate": 4.608764919235034e-06, + "loss": 2.3915, + "step": 3690 + }, + { + "epoch": 2.470293096960321, + "grad_norm": 7.139838695526123, + "learning_rate": 4.5975043240666315e-06, + "loss": 2.5304, + "step": 3691 + }, + { + "epoch": 2.4709620771835934, + "grad_norm": 5.508767127990723, + "learning_rate": 4.586256109067252e-06, + "loss": 2.3532, + "step": 3692 + }, + { + "epoch": 2.4716310574068654, + "grad_norm": 6.246993541717529, + "learning_rate": 4.5750202810622675e-06, + "loss": 2.1157, + "step": 3693 + }, + { + "epoch": 2.4723000376301374, + "grad_norm": 6.130352973937988, + "learning_rate": 4.563796846869553e-06, + "loss": 2.161, + "step": 3694 + }, + { + "epoch": 2.47296901785341, + "grad_norm": 8.216900825500488, + "learning_rate": 4.552585813299443e-06, + "loss": 2.488, + "step": 3695 + }, + { + "epoch": 2.473637998076682, + "grad_norm": 7.8701019287109375, + "learning_rate": 4.5413871871547636e-06, + "loss": 2.6505, + "step": 3696 + }, + { + "epoch": 2.474306978299954, + "grad_norm": 8.133267402648926, + "learning_rate": 4.5302009752308056e-06, + "loss": 2.7707, + "step": 3697 + }, + { + "epoch": 2.4749759585232263, + "grad_norm": 4.782482147216797, + "learning_rate": 4.519027184315322e-06, + "loss": 2.4716, + "step": 3698 + }, + { + "epoch": 2.4756449387464983, + "grad_norm": 6.809669494628906, + "learning_rate": 4.5078658211885284e-06, + "loss": 2.646, + "step": 3699 + }, + { + "epoch": 2.4763139189697703, + "grad_norm": 5.910297870635986, + "learning_rate": 4.496716892623115e-06, + "loss": 2.4048, + "step": 3700 + }, + { + "epoch": 2.4769828991930427, + "grad_norm": 5.295024871826172, + "learning_rate": 4.485580405384207e-06, + "loss": 2.3907, + "step": 3701 + }, + { + "epoch": 2.4776518794163147, + "grad_norm": 6.249392986297607, + "learning_rate": 4.4744563662294015e-06, + "loss": 2.4406, + "step": 3702 + }, + { + "epoch": 2.478320859639587, + "grad_norm": 8.812426567077637, + "learning_rate": 4.463344781908713e-06, + "loss": 2.4162, + "step": 3703 + }, + { + "epoch": 2.478989839862859, + "grad_norm": 5.104134559631348, + "learning_rate": 4.452245659164634e-06, + "loss": 2.2016, + "step": 3704 + }, + { + "epoch": 2.479658820086131, + "grad_norm": 6.753453731536865, + "learning_rate": 4.4411590047320625e-06, + "loss": 2.2948, + "step": 3705 + }, + { + "epoch": 2.4803278003094036, + "grad_norm": 7.191890239715576, + "learning_rate": 4.430084825338351e-06, + "loss": 2.4825, + "step": 3706 + }, + { + "epoch": 2.4809967805326756, + "grad_norm": 5.819056987762451, + "learning_rate": 4.419023127703267e-06, + "loss": 2.3717, + "step": 3707 + }, + { + "epoch": 2.4816657607559476, + "grad_norm": 6.089680194854736, + "learning_rate": 4.407973918539029e-06, + "loss": 2.5377, + "step": 3708 + }, + { + "epoch": 2.48233474097922, + "grad_norm": 5.531108379364014, + "learning_rate": 4.396937204550247e-06, + "loss": 2.333, + "step": 3709 + }, + { + "epoch": 2.483003721202492, + "grad_norm": 6.844831466674805, + "learning_rate": 4.385912992433974e-06, + "loss": 2.6872, + "step": 3710 + }, + { + "epoch": 2.483672701425764, + "grad_norm": 5.804251194000244, + "learning_rate": 4.3749012888796656e-06, + "loss": 2.2028, + "step": 3711 + }, + { + "epoch": 2.4843416816490365, + "grad_norm": 5.867414951324463, + "learning_rate": 4.3639021005691835e-06, + "loss": 2.252, + "step": 3712 + }, + { + "epoch": 2.4850106618723085, + "grad_norm": 6.781108856201172, + "learning_rate": 4.352915434176794e-06, + "loss": 2.3789, + "step": 3713 + }, + { + "epoch": 2.4856796420955805, + "grad_norm": 6.32985258102417, + "learning_rate": 4.341941296369187e-06, + "loss": 2.5138, + "step": 3714 + }, + { + "epoch": 2.486348622318853, + "grad_norm": 5.433622360229492, + "learning_rate": 4.330979693805418e-06, + "loss": 2.4288, + "step": 3715 + }, + { + "epoch": 2.487017602542125, + "grad_norm": 4.8499956130981445, + "learning_rate": 4.3200306331369675e-06, + "loss": 2.3018, + "step": 3716 + }, + { + "epoch": 2.487686582765397, + "grad_norm": 6.094130992889404, + "learning_rate": 4.309094121007676e-06, + "loss": 2.634, + "step": 3717 + }, + { + "epoch": 2.4883555629886693, + "grad_norm": 4.951965808868408, + "learning_rate": 4.298170164053797e-06, + "loss": 2.358, + "step": 3718 + }, + { + "epoch": 2.4890245432119413, + "grad_norm": 5.590311527252197, + "learning_rate": 4.2872587689039484e-06, + "loss": 2.3814, + "step": 3719 + }, + { + "epoch": 2.4896935234352133, + "grad_norm": 5.994541645050049, + "learning_rate": 4.276359942179128e-06, + "loss": 2.3524, + "step": 3720 + }, + { + "epoch": 2.4903625036584858, + "grad_norm": 5.888491630554199, + "learning_rate": 4.265473690492702e-06, + "loss": 2.4733, + "step": 3721 + }, + { + "epoch": 2.4910314838817578, + "grad_norm": 5.276398658752441, + "learning_rate": 4.254600020450428e-06, + "loss": 2.3181, + "step": 3722 + }, + { + "epoch": 2.4917004641050298, + "grad_norm": 5.908614158630371, + "learning_rate": 4.243738938650396e-06, + "loss": 2.6607, + "step": 3723 + }, + { + "epoch": 2.492369444328302, + "grad_norm": 6.046211242675781, + "learning_rate": 4.232890451683097e-06, + "loss": 2.5582, + "step": 3724 + }, + { + "epoch": 2.493038424551574, + "grad_norm": 4.798820972442627, + "learning_rate": 4.222054566131336e-06, + "loss": 2.1913, + "step": 3725 + }, + { + "epoch": 2.493707404774846, + "grad_norm": 4.309851169586182, + "learning_rate": 4.211231288570322e-06, + "loss": 2.1527, + "step": 3726 + }, + { + "epoch": 2.4943763849981186, + "grad_norm": 4.639966011047363, + "learning_rate": 4.200420625567553e-06, + "loss": 2.4232, + "step": 3727 + }, + { + "epoch": 2.4950453652213906, + "grad_norm": 4.977262496948242, + "learning_rate": 4.189622583682925e-06, + "loss": 2.4505, + "step": 3728 + }, + { + "epoch": 2.4957143454446626, + "grad_norm": 6.8202290534973145, + "learning_rate": 4.1788371694686466e-06, + "loss": 2.4318, + "step": 3729 + }, + { + "epoch": 2.496383325667935, + "grad_norm": 7.176461219787598, + "learning_rate": 4.1680643894692785e-06, + "loss": 2.5829, + "step": 3730 + }, + { + "epoch": 2.497052305891207, + "grad_norm": 8.547842025756836, + "learning_rate": 4.157304250221702e-06, + "loss": 2.4333, + "step": 3731 + }, + { + "epoch": 2.497721286114479, + "grad_norm": 5.82031774520874, + "learning_rate": 4.146556758255146e-06, + "loss": 2.2988, + "step": 3732 + }, + { + "epoch": 2.4983902663377515, + "grad_norm": 5.97931432723999, + "learning_rate": 4.135821920091146e-06, + "loss": 2.4472, + "step": 3733 + }, + { + "epoch": 2.4990592465610235, + "grad_norm": 6.065347671508789, + "learning_rate": 4.125099742243571e-06, + "loss": 2.5448, + "step": 3734 + }, + { + "epoch": 2.4997282267842955, + "grad_norm": 6.322947025299072, + "learning_rate": 4.114390231218595e-06, + "loss": 2.7129, + "step": 3735 + }, + { + "epoch": 2.500397207007568, + "grad_norm": 5.404718399047852, + "learning_rate": 4.1036933935147295e-06, + "loss": 2.3136, + "step": 3736 + }, + { + "epoch": 2.50106618723084, + "grad_norm": 5.688011169433594, + "learning_rate": 4.093009235622766e-06, + "loss": 2.2174, + "step": 3737 + }, + { + "epoch": 2.501735167454112, + "grad_norm": 8.533523559570312, + "learning_rate": 4.082337764025831e-06, + "loss": 2.7263, + "step": 3738 + }, + { + "epoch": 2.5024041476773844, + "grad_norm": 5.000412464141846, + "learning_rate": 4.071678985199331e-06, + "loss": 2.2914, + "step": 3739 + }, + { + "epoch": 2.5030731279006564, + "grad_norm": 6.482676029205322, + "learning_rate": 4.061032905610987e-06, + "loss": 2.5748, + "step": 3740 + }, + { + "epoch": 2.5037421081239284, + "grad_norm": 6.046717643737793, + "learning_rate": 4.050399531720797e-06, + "loss": 2.4276, + "step": 3741 + }, + { + "epoch": 2.504411088347201, + "grad_norm": 6.825987339019775, + "learning_rate": 4.039778869981064e-06, + "loss": 2.4705, + "step": 3742 + }, + { + "epoch": 2.505080068570473, + "grad_norm": 7.941644191741943, + "learning_rate": 4.029170926836365e-06, + "loss": 2.8526, + "step": 3743 + }, + { + "epoch": 2.505749048793745, + "grad_norm": 7.009332656860352, + "learning_rate": 4.018575708723574e-06, + "loss": 2.4424, + "step": 3744 + }, + { + "epoch": 2.5064180290170173, + "grad_norm": 4.548746109008789, + "learning_rate": 4.0079932220718265e-06, + "loss": 2.0793, + "step": 3745 + }, + { + "epoch": 2.5070870092402893, + "grad_norm": 5.2693376541137695, + "learning_rate": 3.99742347330255e-06, + "loss": 2.2571, + "step": 3746 + }, + { + "epoch": 2.5077559894635613, + "grad_norm": 4.7540435791015625, + "learning_rate": 3.986866468829428e-06, + "loss": 2.3172, + "step": 3747 + }, + { + "epoch": 2.5084249696868337, + "grad_norm": 7.303004741668701, + "learning_rate": 3.976322215058431e-06, + "loss": 2.3028, + "step": 3748 + }, + { + "epoch": 2.5090939499101057, + "grad_norm": 6.449319362640381, + "learning_rate": 3.9657907183877545e-06, + "loss": 2.4566, + "step": 3749 + }, + { + "epoch": 2.5097629301333777, + "grad_norm": 5.482738971710205, + "learning_rate": 3.955271985207895e-06, + "loss": 2.6462, + "step": 3750 + }, + { + "epoch": 2.51043191035665, + "grad_norm": 4.709473609924316, + "learning_rate": 3.944766021901578e-06, + "loss": 2.3023, + "step": 3751 + }, + { + "epoch": 2.511100890579922, + "grad_norm": 7.775249004364014, + "learning_rate": 3.934272834843794e-06, + "loss": 2.579, + "step": 3752 + }, + { + "epoch": 2.511769870803194, + "grad_norm": 6.218977928161621, + "learning_rate": 3.923792430401765e-06, + "loss": 2.5588, + "step": 3753 + }, + { + "epoch": 2.5124388510264666, + "grad_norm": 4.948222637176514, + "learning_rate": 3.913324814934985e-06, + "loss": 2.3676, + "step": 3754 + }, + { + "epoch": 2.5131078312497386, + "grad_norm": 7.155378818511963, + "learning_rate": 3.902869994795155e-06, + "loss": 2.3456, + "step": 3755 + }, + { + "epoch": 2.5137768114730106, + "grad_norm": 6.549559593200684, + "learning_rate": 3.892427976326232e-06, + "loss": 2.3356, + "step": 3756 + }, + { + "epoch": 2.514445791696283, + "grad_norm": 6.7977094650268555, + "learning_rate": 3.881998765864389e-06, + "loss": 2.3373, + "step": 3757 + }, + { + "epoch": 2.515114771919555, + "grad_norm": 8.165121078491211, + "learning_rate": 3.871582369738056e-06, + "loss": 2.6503, + "step": 3758 + }, + { + "epoch": 2.515783752142827, + "grad_norm": 4.671085834503174, + "learning_rate": 3.86117879426785e-06, + "loss": 2.0988, + "step": 3759 + }, + { + "epoch": 2.5164527323660995, + "grad_norm": 6.805253505706787, + "learning_rate": 3.850788045766643e-06, + "loss": 2.519, + "step": 3760 + }, + { + "epoch": 2.5171217125893715, + "grad_norm": 5.543329238891602, + "learning_rate": 3.840410130539493e-06, + "loss": 2.4166, + "step": 3761 + }, + { + "epoch": 2.5177906928126434, + "grad_norm": 7.204930305480957, + "learning_rate": 3.830045054883702e-06, + "loss": 2.4414, + "step": 3762 + }, + { + "epoch": 2.518459673035916, + "grad_norm": 4.907789707183838, + "learning_rate": 3.819692825088755e-06, + "loss": 2.2819, + "step": 3763 + }, + { + "epoch": 2.519128653259188, + "grad_norm": 5.216022491455078, + "learning_rate": 3.8093534474363534e-06, + "loss": 2.2202, + "step": 3764 + }, + { + "epoch": 2.5197976334824603, + "grad_norm": 5.094801425933838, + "learning_rate": 3.7990269282003943e-06, + "loss": 2.3462, + "step": 3765 + }, + { + "epoch": 2.5204666137057323, + "grad_norm": 6.795177936553955, + "learning_rate": 3.788713273646985e-06, + "loss": 2.4102, + "step": 3766 + }, + { + "epoch": 2.5211355939290043, + "grad_norm": 6.91445255279541, + "learning_rate": 3.778412490034408e-06, + "loss": 2.3729, + "step": 3767 + }, + { + "epoch": 2.5218045741522768, + "grad_norm": 4.927152156829834, + "learning_rate": 3.7681245836131556e-06, + "loss": 2.3487, + "step": 3768 + }, + { + "epoch": 2.5224735543755488, + "grad_norm": 6.471567630767822, + "learning_rate": 3.7578495606258867e-06, + "loss": 2.4755, + "step": 3769 + }, + { + "epoch": 2.5231425345988208, + "grad_norm": 5.427146911621094, + "learning_rate": 3.7475874273074723e-06, + "loss": 2.1768, + "step": 3770 + }, + { + "epoch": 2.523811514822093, + "grad_norm": 4.785037040710449, + "learning_rate": 3.7373381898849145e-06, + "loss": 2.3706, + "step": 3771 + }, + { + "epoch": 2.524480495045365, + "grad_norm": 7.640030384063721, + "learning_rate": 3.727101854577436e-06, + "loss": 2.4037, + "step": 3772 + }, + { + "epoch": 2.5251494752686376, + "grad_norm": 5.508459091186523, + "learning_rate": 3.7168784275964015e-06, + "loss": 2.3228, + "step": 3773 + }, + { + "epoch": 2.5258184554919096, + "grad_norm": 6.27532958984375, + "learning_rate": 3.706667915145365e-06, + "loss": 2.5784, + "step": 3774 + }, + { + "epoch": 2.5264874357151816, + "grad_norm": 5.630836009979248, + "learning_rate": 3.6964703234200205e-06, + "loss": 2.532, + "step": 3775 + }, + { + "epoch": 2.527156415938454, + "grad_norm": 6.564334392547607, + "learning_rate": 3.6862856586082463e-06, + "loss": 2.3323, + "step": 3776 + }, + { + "epoch": 2.527825396161726, + "grad_norm": 7.441055774688721, + "learning_rate": 3.6761139268900594e-06, + "loss": 2.8551, + "step": 3777 + }, + { + "epoch": 2.528494376384998, + "grad_norm": 5.700356483459473, + "learning_rate": 3.6659551344376346e-06, + "loss": 2.3666, + "step": 3778 + }, + { + "epoch": 2.5291633566082705, + "grad_norm": 7.274041175842285, + "learning_rate": 3.655809287415285e-06, + "loss": 2.1705, + "step": 3779 + }, + { + "epoch": 2.5298323368315425, + "grad_norm": 6.163374423980713, + "learning_rate": 3.6456763919794938e-06, + "loss": 2.3205, + "step": 3780 + }, + { + "epoch": 2.5305013170548145, + "grad_norm": 5.818201541900635, + "learning_rate": 3.6355564542788574e-06, + "loss": 2.4823, + "step": 3781 + }, + { + "epoch": 2.531170297278087, + "grad_norm": 8.854202270507812, + "learning_rate": 3.6254494804541295e-06, + "loss": 2.4959, + "step": 3782 + }, + { + "epoch": 2.531839277501359, + "grad_norm": 4.19840669631958, + "learning_rate": 3.6153554766381796e-06, + "loss": 2.2509, + "step": 3783 + }, + { + "epoch": 2.532508257724631, + "grad_norm": 5.034258842468262, + "learning_rate": 3.605274448956031e-06, + "loss": 2.2866, + "step": 3784 + }, + { + "epoch": 2.5331772379479034, + "grad_norm": 5.948155403137207, + "learning_rate": 3.595206403524812e-06, + "loss": 2.2271, + "step": 3785 + }, + { + "epoch": 2.5338462181711754, + "grad_norm": 5.903778553009033, + "learning_rate": 3.58515134645378e-06, + "loss": 2.615, + "step": 3786 + }, + { + "epoch": 2.5345151983944474, + "grad_norm": 5.3802900314331055, + "learning_rate": 3.5751092838443053e-06, + "loss": 2.5399, + "step": 3787 + }, + { + "epoch": 2.53518417861772, + "grad_norm": 5.97119665145874, + "learning_rate": 3.5650802217898876e-06, + "loss": 2.7474, + "step": 3788 + }, + { + "epoch": 2.535853158840992, + "grad_norm": 6.934410095214844, + "learning_rate": 3.5550641663761266e-06, + "loss": 2.5269, + "step": 3789 + }, + { + "epoch": 2.536522139064264, + "grad_norm": 6.92349910736084, + "learning_rate": 3.545061123680735e-06, + "loss": 2.3674, + "step": 3790 + }, + { + "epoch": 2.5371911192875363, + "grad_norm": 4.380884647369385, + "learning_rate": 3.5350710997735263e-06, + "loss": 2.2402, + "step": 3791 + }, + { + "epoch": 2.5378600995108083, + "grad_norm": 7.2302350997924805, + "learning_rate": 3.525094100716414e-06, + "loss": 2.4676, + "step": 3792 + }, + { + "epoch": 2.5385290797340803, + "grad_norm": 5.280435562133789, + "learning_rate": 3.5151301325634044e-06, + "loss": 2.3216, + "step": 3793 + }, + { + "epoch": 2.5391980599573527, + "grad_norm": 8.111344337463379, + "learning_rate": 3.5051792013606117e-06, + "loss": 2.7732, + "step": 3794 + }, + { + "epoch": 2.5398670401806247, + "grad_norm": 6.082609176635742, + "learning_rate": 3.4952413131462193e-06, + "loss": 2.5892, + "step": 3795 + }, + { + "epoch": 2.5405360204038967, + "grad_norm": 5.370868682861328, + "learning_rate": 3.4853164739505144e-06, + "loss": 2.6728, + "step": 3796 + }, + { + "epoch": 2.541205000627169, + "grad_norm": 4.289524078369141, + "learning_rate": 3.475404689795847e-06, + "loss": 2.2687, + "step": 3797 + }, + { + "epoch": 2.541873980850441, + "grad_norm": 7.013555526733398, + "learning_rate": 3.4655059666966693e-06, + "loss": 2.2848, + "step": 3798 + }, + { + "epoch": 2.542542961073713, + "grad_norm": 6.473840713500977, + "learning_rate": 3.455620310659488e-06, + "loss": 2.4463, + "step": 3799 + }, + { + "epoch": 2.5432119412969856, + "grad_norm": 6.8665971755981445, + "learning_rate": 3.445747727682888e-06, + "loss": 2.4289, + "step": 3800 + }, + { + "epoch": 2.5438809215202576, + "grad_norm": 5.581964015960693, + "learning_rate": 3.4358882237575136e-06, + "loss": 2.2628, + "step": 3801 + }, + { + "epoch": 2.5445499017435296, + "grad_norm": 8.336128234863281, + "learning_rate": 3.4260418048660924e-06, + "loss": 2.6814, + "step": 3802 + }, + { + "epoch": 2.545218881966802, + "grad_norm": 4.735259532928467, + "learning_rate": 3.4162084769833903e-06, + "loss": 2.0036, + "step": 3803 + }, + { + "epoch": 2.545887862190074, + "grad_norm": 6.293749809265137, + "learning_rate": 3.4063882460762474e-06, + "loss": 2.3973, + "step": 3804 + }, + { + "epoch": 2.546556842413346, + "grad_norm": 5.786930561065674, + "learning_rate": 3.3965811181035393e-06, + "loss": 2.6541, + "step": 3805 + }, + { + "epoch": 2.5472258226366185, + "grad_norm": 8.288121223449707, + "learning_rate": 3.3867870990162116e-06, + "loss": 2.3969, + "step": 3806 + }, + { + "epoch": 2.5478948028598905, + "grad_norm": 7.866008281707764, + "learning_rate": 3.3770061947572375e-06, + "loss": 2.4458, + "step": 3807 + }, + { + "epoch": 2.5485637830831624, + "grad_norm": 5.752938747406006, + "learning_rate": 3.367238411261636e-06, + "loss": 2.6208, + "step": 3808 + }, + { + "epoch": 2.549232763306435, + "grad_norm": 6.850444316864014, + "learning_rate": 3.357483754456464e-06, + "loss": 2.1619, + "step": 3809 + }, + { + "epoch": 2.549901743529707, + "grad_norm": 6.017387390136719, + "learning_rate": 3.3477422302608254e-06, + "loss": 2.2431, + "step": 3810 + }, + { + "epoch": 2.550570723752979, + "grad_norm": 5.909073829650879, + "learning_rate": 3.3380138445858354e-06, + "loss": 2.3448, + "step": 3811 + }, + { + "epoch": 2.5512397039762513, + "grad_norm": 5.955220699310303, + "learning_rate": 3.328298603334659e-06, + "loss": 2.1152, + "step": 3812 + }, + { + "epoch": 2.5519086841995233, + "grad_norm": 5.710453510284424, + "learning_rate": 3.3185965124024693e-06, + "loss": 2.6014, + "step": 3813 + }, + { + "epoch": 2.5525776644227953, + "grad_norm": 6.36259126663208, + "learning_rate": 3.3089075776764596e-06, + "loss": 2.5856, + "step": 3814 + }, + { + "epoch": 2.5532466446460678, + "grad_norm": 5.264303684234619, + "learning_rate": 3.2992318050358432e-06, + "loss": 2.409, + "step": 3815 + }, + { + "epoch": 2.5539156248693398, + "grad_norm": 4.835752964019775, + "learning_rate": 3.289569200351858e-06, + "loss": 2.3133, + "step": 3816 + }, + { + "epoch": 2.5545846050926118, + "grad_norm": 5.330892086029053, + "learning_rate": 3.2799197694877272e-06, + "loss": 2.3577, + "step": 3817 + }, + { + "epoch": 2.555253585315884, + "grad_norm": 5.996495246887207, + "learning_rate": 3.2702835182987112e-06, + "loss": 2.6054, + "step": 3818 + }, + { + "epoch": 2.555922565539156, + "grad_norm": 5.097886562347412, + "learning_rate": 3.2606604526320387e-06, + "loss": 2.51, + "step": 3819 + }, + { + "epoch": 2.556591545762428, + "grad_norm": 5.455984115600586, + "learning_rate": 3.2510505783269723e-06, + "loss": 2.2337, + "step": 3820 + }, + { + "epoch": 2.5572605259857006, + "grad_norm": 5.55591344833374, + "learning_rate": 3.241453901214747e-06, + "loss": 2.4891, + "step": 3821 + }, + { + "epoch": 2.5579295062089726, + "grad_norm": 6.587954521179199, + "learning_rate": 3.2318704271185913e-06, + "loss": 2.5844, + "step": 3822 + }, + { + "epoch": 2.5585984864322446, + "grad_norm": 4.741420745849609, + "learning_rate": 3.2223001618537225e-06, + "loss": 2.3086, + "step": 3823 + }, + { + "epoch": 2.559267466655517, + "grad_norm": 5.835939884185791, + "learning_rate": 3.2127431112273593e-06, + "loss": 2.1317, + "step": 3824 + }, + { + "epoch": 2.559936446878789, + "grad_norm": 4.986764430999756, + "learning_rate": 3.2031992810386797e-06, + "loss": 2.2046, + "step": 3825 + }, + { + "epoch": 2.560605427102061, + "grad_norm": 7.454900741577148, + "learning_rate": 3.1936686770788564e-06, + "loss": 2.4712, + "step": 3826 + }, + { + "epoch": 2.5612744073253335, + "grad_norm": 7.282034397125244, + "learning_rate": 3.1841513051310222e-06, + "loss": 2.4204, + "step": 3827 + }, + { + "epoch": 2.5619433875486055, + "grad_norm": 6.2602715492248535, + "learning_rate": 3.1746471709702964e-06, + "loss": 2.3055, + "step": 3828 + }, + { + "epoch": 2.5626123677718775, + "grad_norm": 5.7197957038879395, + "learning_rate": 3.1651562803637515e-06, + "loss": 2.2477, + "step": 3829 + }, + { + "epoch": 2.56328134799515, + "grad_norm": 5.12856912612915, + "learning_rate": 3.1556786390704306e-06, + "loss": 2.4656, + "step": 3830 + }, + { + "epoch": 2.563950328218422, + "grad_norm": 7.175702095031738, + "learning_rate": 3.1462142528413312e-06, + "loss": 2.4264, + "step": 3831 + }, + { + "epoch": 2.564619308441694, + "grad_norm": 6.429877758026123, + "learning_rate": 3.136763127419423e-06, + "loss": 2.453, + "step": 3832 + }, + { + "epoch": 2.5652882886649664, + "grad_norm": 6.166929721832275, + "learning_rate": 3.1273252685396077e-06, + "loss": 2.4965, + "step": 3833 + }, + { + "epoch": 2.5659572688882384, + "grad_norm": 5.894916534423828, + "learning_rate": 3.1179006819287594e-06, + "loss": 2.5319, + "step": 3834 + }, + { + "epoch": 2.5666262491115104, + "grad_norm": 5.242392063140869, + "learning_rate": 3.1084893733056784e-06, + "loss": 2.3256, + "step": 3835 + }, + { + "epoch": 2.567295229334783, + "grad_norm": 7.799831390380859, + "learning_rate": 3.0990913483811196e-06, + "loss": 2.4413, + "step": 3836 + }, + { + "epoch": 2.567964209558055, + "grad_norm": 5.978551387786865, + "learning_rate": 3.089706612857768e-06, + "loss": 2.2382, + "step": 3837 + }, + { + "epoch": 2.568633189781327, + "grad_norm": 5.417609691619873, + "learning_rate": 3.0803351724302653e-06, + "loss": 2.2806, + "step": 3838 + }, + { + "epoch": 2.5693021700045993, + "grad_norm": 5.803733825683594, + "learning_rate": 3.070977032785155e-06, + "loss": 2.4982, + "step": 3839 + }, + { + "epoch": 2.5699711502278713, + "grad_norm": 6.0389838218688965, + "learning_rate": 3.061632199600939e-06, + "loss": 2.2292, + "step": 3840 + }, + { + "epoch": 2.5706401304511437, + "grad_norm": 5.605448246002197, + "learning_rate": 3.052300678548023e-06, + "loss": 2.4698, + "step": 3841 + }, + { + "epoch": 2.5713091106744157, + "grad_norm": 7.053035736083984, + "learning_rate": 3.042982475288755e-06, + "loss": 2.4973, + "step": 3842 + }, + { + "epoch": 2.5719780908976877, + "grad_norm": 6.133776664733887, + "learning_rate": 3.0336775954773827e-06, + "loss": 2.7106, + "step": 3843 + }, + { + "epoch": 2.57264707112096, + "grad_norm": 5.7862548828125, + "learning_rate": 3.0243860447600803e-06, + "loss": 2.2657, + "step": 3844 + }, + { + "epoch": 2.573316051344232, + "grad_norm": 6.445812225341797, + "learning_rate": 3.015107828774924e-06, + "loss": 2.4284, + "step": 3845 + }, + { + "epoch": 2.573985031567504, + "grad_norm": 4.764901638031006, + "learning_rate": 3.005842953151916e-06, + "loss": 2.2451, + "step": 3846 + }, + { + "epoch": 2.5746540117907766, + "grad_norm": 7.716450214385986, + "learning_rate": 2.9965914235129455e-06, + "loss": 2.339, + "step": 3847 + }, + { + "epoch": 2.5753229920140486, + "grad_norm": 5.882161617279053, + "learning_rate": 2.987353245471816e-06, + "loss": 2.3177, + "step": 3848 + }, + { + "epoch": 2.5759919722373206, + "grad_norm": 7.379197597503662, + "learning_rate": 2.978128424634219e-06, + "loss": 2.4015, + "step": 3849 + }, + { + "epoch": 2.576660952460593, + "grad_norm": 4.584736347198486, + "learning_rate": 2.9689169665977566e-06, + "loss": 2.1363, + "step": 3850 + }, + { + "epoch": 2.577329932683865, + "grad_norm": 6.116579532623291, + "learning_rate": 2.959718876951903e-06, + "loss": 2.6017, + "step": 3851 + }, + { + "epoch": 2.5779989129071375, + "grad_norm": 5.875605583190918, + "learning_rate": 2.950534161278032e-06, + "loss": 2.2601, + "step": 3852 + }, + { + "epoch": 2.5786678931304094, + "grad_norm": 5.771245002746582, + "learning_rate": 2.9413628251493934e-06, + "loss": 2.3667, + "step": 3853 + }, + { + "epoch": 2.5793368733536814, + "grad_norm": 6.541346073150635, + "learning_rate": 2.9322048741311365e-06, + "loss": 2.3939, + "step": 3854 + }, + { + "epoch": 2.580005853576954, + "grad_norm": 4.985065460205078, + "learning_rate": 2.9230603137802626e-06, + "loss": 2.3159, + "step": 3855 + }, + { + "epoch": 2.580674833800226, + "grad_norm": 8.396684646606445, + "learning_rate": 2.913929149645678e-06, + "loss": 2.3722, + "step": 3856 + }, + { + "epoch": 2.581343814023498, + "grad_norm": 6.666956901550293, + "learning_rate": 2.904811387268136e-06, + "loss": 2.2247, + "step": 3857 + }, + { + "epoch": 2.5820127942467703, + "grad_norm": 7.144660949707031, + "learning_rate": 2.8957070321802615e-06, + "loss": 2.4659, + "step": 3858 + }, + { + "epoch": 2.5826817744700423, + "grad_norm": 4.215707302093506, + "learning_rate": 2.886616089906549e-06, + "loss": 2.3, + "step": 3859 + }, + { + "epoch": 2.5833507546933143, + "grad_norm": 5.4257612228393555, + "learning_rate": 2.8775385659633615e-06, + "loss": 2.295, + "step": 3860 + }, + { + "epoch": 2.5840197349165868, + "grad_norm": 6.341053009033203, + "learning_rate": 2.8684744658588975e-06, + "loss": 2.5079, + "step": 3861 + }, + { + "epoch": 2.5846887151398588, + "grad_norm": 4.528788089752197, + "learning_rate": 2.8594237950932385e-06, + "loss": 2.2241, + "step": 3862 + }, + { + "epoch": 2.5853576953631308, + "grad_norm": 5.509812355041504, + "learning_rate": 2.8503865591582913e-06, + "loss": 2.2686, + "step": 3863 + }, + { + "epoch": 2.586026675586403, + "grad_norm": 7.177139759063721, + "learning_rate": 2.841362763537833e-06, + "loss": 2.4926, + "step": 3864 + }, + { + "epoch": 2.586695655809675, + "grad_norm": 5.077939987182617, + "learning_rate": 2.8323524137074646e-06, + "loss": 2.2064, + "step": 3865 + }, + { + "epoch": 2.587364636032947, + "grad_norm": 5.281102657318115, + "learning_rate": 2.823355515134643e-06, + "loss": 2.5352, + "step": 3866 + }, + { + "epoch": 2.5880336162562196, + "grad_norm": 5.470816135406494, + "learning_rate": 2.8143720732786483e-06, + "loss": 2.4672, + "step": 3867 + }, + { + "epoch": 2.5887025964794916, + "grad_norm": 5.3967132568359375, + "learning_rate": 2.8054020935906146e-06, + "loss": 2.4062, + "step": 3868 + }, + { + "epoch": 2.5893715767027636, + "grad_norm": 5.98785924911499, + "learning_rate": 2.7964455815134876e-06, + "loss": 2.6115, + "step": 3869 + }, + { + "epoch": 2.590040556926036, + "grad_norm": 5.799724102020264, + "learning_rate": 2.7875025424820622e-06, + "loss": 2.2697, + "step": 3870 + }, + { + "epoch": 2.590709537149308, + "grad_norm": 5.382849216461182, + "learning_rate": 2.7785729819229336e-06, + "loss": 2.3281, + "step": 3871 + }, + { + "epoch": 2.59137851737258, + "grad_norm": 4.921668529510498, + "learning_rate": 2.7696569052545416e-06, + "loss": 2.3425, + "step": 3872 + }, + { + "epoch": 2.5920474975958525, + "grad_norm": 6.8784356117248535, + "learning_rate": 2.7607543178871265e-06, + "loss": 2.4721, + "step": 3873 + }, + { + "epoch": 2.5927164778191245, + "grad_norm": 6.990794658660889, + "learning_rate": 2.751865225222752e-06, + "loss": 2.4409, + "step": 3874 + }, + { + "epoch": 2.5933854580423965, + "grad_norm": 5.708813190460205, + "learning_rate": 2.7429896326552824e-06, + "loss": 2.4879, + "step": 3875 + }, + { + "epoch": 2.594054438265669, + "grad_norm": 8.053181648254395, + "learning_rate": 2.734127545570414e-06, + "loss": 2.8147, + "step": 3876 + }, + { + "epoch": 2.594723418488941, + "grad_norm": 5.7965922355651855, + "learning_rate": 2.725278969345618e-06, + "loss": 2.3624, + "step": 3877 + }, + { + "epoch": 2.595392398712213, + "grad_norm": 7.975802421569824, + "learning_rate": 2.7164439093501942e-06, + "loss": 2.5226, + "step": 3878 + }, + { + "epoch": 2.5960613789354854, + "grad_norm": 6.369809150695801, + "learning_rate": 2.707622370945223e-06, + "loss": 2.3842, + "step": 3879 + }, + { + "epoch": 2.5967303591587574, + "grad_norm": 5.080902576446533, + "learning_rate": 2.6988143594835868e-06, + "loss": 2.3923, + "step": 3880 + }, + { + "epoch": 2.5973993393820294, + "grad_norm": 6.060383319854736, + "learning_rate": 2.6900198803099545e-06, + "loss": 2.5399, + "step": 3881 + }, + { + "epoch": 2.598068319605302, + "grad_norm": 5.512672424316406, + "learning_rate": 2.681238938760797e-06, + "loss": 2.5301, + "step": 3882 + }, + { + "epoch": 2.598737299828574, + "grad_norm": 6.313393592834473, + "learning_rate": 2.672471540164348e-06, + "loss": 2.3117, + "step": 3883 + }, + { + "epoch": 2.599406280051846, + "grad_norm": 5.462608337402344, + "learning_rate": 2.663717689840653e-06, + "loss": 2.4511, + "step": 3884 + }, + { + "epoch": 2.6000752602751183, + "grad_norm": 5.515672206878662, + "learning_rate": 2.654977393101507e-06, + "loss": 2.342, + "step": 3885 + }, + { + "epoch": 2.6007442404983903, + "grad_norm": 7.675982475280762, + "learning_rate": 2.6462506552505013e-06, + "loss": 2.3332, + "step": 3886 + }, + { + "epoch": 2.6014132207216623, + "grad_norm": 5.95807409286499, + "learning_rate": 2.6375374815829938e-06, + "loss": 2.3946, + "step": 3887 + }, + { + "epoch": 2.6020822009449347, + "grad_norm": 4.934159278869629, + "learning_rate": 2.628837877386106e-06, + "loss": 2.3569, + "step": 3888 + }, + { + "epoch": 2.6027511811682067, + "grad_norm": 6.970942497253418, + "learning_rate": 2.6201518479387262e-06, + "loss": 2.7085, + "step": 3889 + }, + { + "epoch": 2.6034201613914787, + "grad_norm": 5.228874683380127, + "learning_rate": 2.611479398511518e-06, + "loss": 2.2416, + "step": 3890 + }, + { + "epoch": 2.604089141614751, + "grad_norm": 8.109018325805664, + "learning_rate": 2.6028205343668873e-06, + "loss": 2.5934, + "step": 3891 + }, + { + "epoch": 2.604758121838023, + "grad_norm": 6.200860500335693, + "learning_rate": 2.5941752607590158e-06, + "loss": 2.6139, + "step": 3892 + }, + { + "epoch": 2.605427102061295, + "grad_norm": 5.979791164398193, + "learning_rate": 2.5855435829338165e-06, + "loss": 2.2434, + "step": 3893 + }, + { + "epoch": 2.6060960822845676, + "grad_norm": 5.927387714385986, + "learning_rate": 2.57692550612898e-06, + "loss": 2.3326, + "step": 3894 + }, + { + "epoch": 2.6067650625078396, + "grad_norm": 5.063274383544922, + "learning_rate": 2.568321035573906e-06, + "loss": 2.439, + "step": 3895 + }, + { + "epoch": 2.6074340427311116, + "grad_norm": 7.780815124511719, + "learning_rate": 2.559730176489775e-06, + "loss": 2.6794, + "step": 3896 + }, + { + "epoch": 2.608103022954384, + "grad_norm": 5.996585369110107, + "learning_rate": 2.551152934089482e-06, + "loss": 2.3607, + "step": 3897 + }, + { + "epoch": 2.608772003177656, + "grad_norm": 6.469350814819336, + "learning_rate": 2.542589313577684e-06, + "loss": 2.3584, + "step": 3898 + }, + { + "epoch": 2.609440983400928, + "grad_norm": 6.438049793243408, + "learning_rate": 2.534039320150744e-06, + "loss": 2.2896, + "step": 3899 + }, + { + "epoch": 2.6101099636242004, + "grad_norm": 5.831080913543701, + "learning_rate": 2.525502958996784e-06, + "loss": 2.2082, + "step": 3900 + }, + { + "epoch": 2.6107789438474724, + "grad_norm": 5.186378479003906, + "learning_rate": 2.516980235295635e-06, + "loss": 2.3198, + "step": 3901 + }, + { + "epoch": 2.6114479240707444, + "grad_norm": 6.122509002685547, + "learning_rate": 2.5084711542188555e-06, + "loss": 2.2114, + "step": 3902 + }, + { + "epoch": 2.612116904294017, + "grad_norm": 2003.243896484375, + "learning_rate": 2.4999757209297286e-06, + "loss": 2.2811, + "step": 3903 + }, + { + "epoch": 2.612785884517289, + "grad_norm": 7.195806503295898, + "learning_rate": 2.4914939405832616e-06, + "loss": 2.4325, + "step": 3904 + }, + { + "epoch": 2.613454864740561, + "grad_norm": 5.923693656921387, + "learning_rate": 2.4830258183261624e-06, + "loss": 2.6001, + "step": 3905 + }, + { + "epoch": 2.6141238449638333, + "grad_norm": 5.774631500244141, + "learning_rate": 2.474571359296873e-06, + "loss": 2.4069, + "step": 3906 + }, + { + "epoch": 2.6147928251871053, + "grad_norm": 4.974096775054932, + "learning_rate": 2.466130568625519e-06, + "loss": 2.5573, + "step": 3907 + }, + { + "epoch": 2.6154618054103773, + "grad_norm": 5.13218355178833, + "learning_rate": 2.457703451433957e-06, + "loss": 2.3746, + "step": 3908 + }, + { + "epoch": 2.6161307856336498, + "grad_norm": 5.425072193145752, + "learning_rate": 2.449290012835731e-06, + "loss": 2.2894, + "step": 3909 + }, + { + "epoch": 2.6167997658569218, + "grad_norm": 5.96708869934082, + "learning_rate": 2.440890257936085e-06, + "loss": 2.0545, + "step": 3910 + }, + { + "epoch": 2.6174687460801938, + "grad_norm": 5.687791347503662, + "learning_rate": 2.432504191831961e-06, + "loss": 2.3528, + "step": 3911 + }, + { + "epoch": 2.618137726303466, + "grad_norm": 4.508747100830078, + "learning_rate": 2.424131819612005e-06, + "loss": 2.5651, + "step": 3912 + }, + { + "epoch": 2.618806706526738, + "grad_norm": 4.661840438842773, + "learning_rate": 2.415773146356537e-06, + "loss": 2.0561, + "step": 3913 + }, + { + "epoch": 2.61947568675001, + "grad_norm": 6.074620723724365, + "learning_rate": 2.4074281771375828e-06, + "loss": 2.4737, + "step": 3914 + }, + { + "epoch": 2.6201446669732826, + "grad_norm": 3.852896213531494, + "learning_rate": 2.399096917018834e-06, + "loss": 2.2016, + "step": 3915 + }, + { + "epoch": 2.6208136471965546, + "grad_norm": 6.536360263824463, + "learning_rate": 2.3907793710556876e-06, + "loss": 2.2546, + "step": 3916 + }, + { + "epoch": 2.6214826274198266, + "grad_norm": 5.735913276672363, + "learning_rate": 2.382475544295182e-06, + "loss": 2.3761, + "step": 3917 + }, + { + "epoch": 2.622151607643099, + "grad_norm": 8.968220710754395, + "learning_rate": 2.3741854417760695e-06, + "loss": 2.3742, + "step": 3918 + }, + { + "epoch": 2.622820587866371, + "grad_norm": 7.0541911125183105, + "learning_rate": 2.365909068528746e-06, + "loss": 2.7518, + "step": 3919 + }, + { + "epoch": 2.6234895680896435, + "grad_norm": 5.832033634185791, + "learning_rate": 2.3576464295753e-06, + "loss": 2.4419, + "step": 3920 + }, + { + "epoch": 2.6241585483129155, + "grad_norm": 6.212060451507568, + "learning_rate": 2.3493975299294617e-06, + "loss": 2.5095, + "step": 3921 + }, + { + "epoch": 2.6248275285361875, + "grad_norm": 6.432426929473877, + "learning_rate": 2.3411623745966502e-06, + "loss": 2.4872, + "step": 3922 + }, + { + "epoch": 2.62549650875946, + "grad_norm": 4.513784885406494, + "learning_rate": 2.3329409685739268e-06, + "loss": 2.2506, + "step": 3923 + }, + { + "epoch": 2.626165488982732, + "grad_norm": 6.6362175941467285, + "learning_rate": 2.3247333168500112e-06, + "loss": 2.5336, + "step": 3924 + }, + { + "epoch": 2.626834469206004, + "grad_norm": 5.738683700561523, + "learning_rate": 2.3165394244052757e-06, + "loss": 2.2842, + "step": 3925 + }, + { + "epoch": 2.6275034494292764, + "grad_norm": 6.834362030029297, + "learning_rate": 2.3083592962117594e-06, + "loss": 2.647, + "step": 3926 + }, + { + "epoch": 2.6281724296525484, + "grad_norm": 4.8380608558654785, + "learning_rate": 2.300192937233128e-06, + "loss": 2.1545, + "step": 3927 + }, + { + "epoch": 2.628841409875821, + "grad_norm": 5.433319568634033, + "learning_rate": 2.2920403524247096e-06, + "loss": 2.3643, + "step": 3928 + }, + { + "epoch": 2.629510390099093, + "grad_norm": 5.940316200256348, + "learning_rate": 2.283901546733461e-06, + "loss": 2.3231, + "step": 3929 + }, + { + "epoch": 2.630179370322365, + "grad_norm": 5.8874335289001465, + "learning_rate": 2.275776525097989e-06, + "loss": 2.3986, + "step": 3930 + }, + { + "epoch": 2.6308483505456373, + "grad_norm": 6.751160621643066, + "learning_rate": 2.2676652924485258e-06, + "loss": 2.5774, + "step": 3931 + }, + { + "epoch": 2.6315173307689093, + "grad_norm": 5.059715747833252, + "learning_rate": 2.2595678537069452e-06, + "loss": 2.4019, + "step": 3932 + }, + { + "epoch": 2.6321863109921813, + "grad_norm": 4.548704147338867, + "learning_rate": 2.251484213786739e-06, + "loss": 2.2634, + "step": 3933 + }, + { + "epoch": 2.6328552912154537, + "grad_norm": 6.632992267608643, + "learning_rate": 2.243414377593045e-06, + "loss": 2.4384, + "step": 3934 + }, + { + "epoch": 2.6335242714387257, + "grad_norm": 6.751095771789551, + "learning_rate": 2.235358350022604e-06, + "loss": 2.7334, + "step": 3935 + }, + { + "epoch": 2.6341932516619977, + "grad_norm": 5.870697498321533, + "learning_rate": 2.2273161359637973e-06, + "loss": 2.4939, + "step": 3936 + }, + { + "epoch": 2.63486223188527, + "grad_norm": 4.490385055541992, + "learning_rate": 2.219287740296605e-06, + "loss": 2.459, + "step": 3937 + }, + { + "epoch": 2.635531212108542, + "grad_norm": 5.850098609924316, + "learning_rate": 2.211273167892647e-06, + "loss": 2.2702, + "step": 3938 + }, + { + "epoch": 2.636200192331814, + "grad_norm": 6.927645206451416, + "learning_rate": 2.2032724236151185e-06, + "loss": 2.2674, + "step": 3939 + }, + { + "epoch": 2.6368691725550866, + "grad_norm": 5.658666610717773, + "learning_rate": 2.195285512318862e-06, + "loss": 2.4502, + "step": 3940 + }, + { + "epoch": 2.6375381527783586, + "grad_norm": 7.638965606689453, + "learning_rate": 2.1873124388503e-06, + "loss": 2.2698, + "step": 3941 + }, + { + "epoch": 2.6382071330016306, + "grad_norm": 6.148594856262207, + "learning_rate": 2.17935320804748e-06, + "loss": 2.3034, + "step": 3942 + }, + { + "epoch": 2.638876113224903, + "grad_norm": 4.896574974060059, + "learning_rate": 2.171407824740024e-06, + "loss": 2.4125, + "step": 3943 + }, + { + "epoch": 2.639545093448175, + "grad_norm": 6.364518165588379, + "learning_rate": 2.1634762937491727e-06, + "loss": 2.3654, + "step": 3944 + }, + { + "epoch": 2.640214073671447, + "grad_norm": 7.4294586181640625, + "learning_rate": 2.155558619887757e-06, + "loss": 2.4234, + "step": 3945 + }, + { + "epoch": 2.6408830538947194, + "grad_norm": 5.3314690589904785, + "learning_rate": 2.147654807960189e-06, + "loss": 2.3142, + "step": 3946 + }, + { + "epoch": 2.6415520341179914, + "grad_norm": 4.827838897705078, + "learning_rate": 2.139764862762475e-06, + "loss": 2.3677, + "step": 3947 + }, + { + "epoch": 2.6422210143412634, + "grad_norm": 5.797870635986328, + "learning_rate": 2.1318887890822174e-06, + "loss": 2.1416, + "step": 3948 + }, + { + "epoch": 2.642889994564536, + "grad_norm": 6.218014240264893, + "learning_rate": 2.12402659169858e-06, + "loss": 2.2067, + "step": 3949 + }, + { + "epoch": 2.643558974787808, + "grad_norm": 5.623363494873047, + "learning_rate": 2.1161782753823357e-06, + "loss": 2.5201, + "step": 3950 + }, + { + "epoch": 2.64422795501108, + "grad_norm": 5.521179676055908, + "learning_rate": 2.108343844895799e-06, + "loss": 2.3229, + "step": 3951 + }, + { + "epoch": 2.6448969352343523, + "grad_norm": 7.591794967651367, + "learning_rate": 2.100523304992896e-06, + "loss": 2.4609, + "step": 3952 + }, + { + "epoch": 2.6455659154576243, + "grad_norm": 6.253237247467041, + "learning_rate": 2.0927166604190918e-06, + "loss": 2.4531, + "step": 3953 + }, + { + "epoch": 2.6462348956808963, + "grad_norm": 9.044564247131348, + "learning_rate": 2.0849239159114407e-06, + "loss": 2.7153, + "step": 3954 + }, + { + "epoch": 2.6469038759041688, + "grad_norm": 4.52829122543335, + "learning_rate": 2.0771450761985445e-06, + "loss": 2.133, + "step": 3955 + }, + { + "epoch": 2.6475728561274408, + "grad_norm": 5.785518169403076, + "learning_rate": 2.0693801460005936e-06, + "loss": 2.4795, + "step": 3956 + }, + { + "epoch": 2.6482418363507128, + "grad_norm": 5.381036758422852, + "learning_rate": 2.0616291300293076e-06, + "loss": 2.3966, + "step": 3957 + }, + { + "epoch": 2.648910816573985, + "grad_norm": 7.2144775390625, + "learning_rate": 2.0538920329879892e-06, + "loss": 2.0823, + "step": 3958 + }, + { + "epoch": 2.649579796797257, + "grad_norm": 5.981444358825684, + "learning_rate": 2.046168859571482e-06, + "loss": 2.3909, + "step": 3959 + }, + { + "epoch": 2.650248777020529, + "grad_norm": 6.732827663421631, + "learning_rate": 2.038459614466179e-06, + "loss": 2.4474, + "step": 3960 + }, + { + "epoch": 2.6509177572438016, + "grad_norm": 5.331604957580566, + "learning_rate": 2.0307643023500245e-06, + "loss": 2.3749, + "step": 3961 + }, + { + "epoch": 2.6515867374670736, + "grad_norm": 5.6953277587890625, + "learning_rate": 2.0230829278925167e-06, + "loss": 2.2316, + "step": 3962 + }, + { + "epoch": 2.6522557176903456, + "grad_norm": 4.2728753089904785, + "learning_rate": 2.01541549575468e-06, + "loss": 2.1244, + "step": 3963 + }, + { + "epoch": 2.652924697913618, + "grad_norm": 6.199775695800781, + "learning_rate": 2.007762010589098e-06, + "loss": 2.5994, + "step": 3964 + }, + { + "epoch": 2.65359367813689, + "grad_norm": 5.380782127380371, + "learning_rate": 2.0001224770398675e-06, + "loss": 2.512, + "step": 3965 + }, + { + "epoch": 2.654262658360162, + "grad_norm": 6.1900553703308105, + "learning_rate": 1.992496899742649e-06, + "loss": 2.6258, + "step": 3966 + }, + { + "epoch": 2.6549316385834345, + "grad_norm": 4.216786861419678, + "learning_rate": 1.9848852833246084e-06, + "loss": 2.4047, + "step": 3967 + }, + { + "epoch": 2.6556006188067065, + "grad_norm": 7.381101131439209, + "learning_rate": 1.9772876324044507e-06, + "loss": 2.4173, + "step": 3968 + }, + { + "epoch": 2.6562695990299785, + "grad_norm": 4.902368068695068, + "learning_rate": 1.9697039515924026e-06, + "loss": 2.3725, + "step": 3969 + }, + { + "epoch": 2.656938579253251, + "grad_norm": 5.404871940612793, + "learning_rate": 1.962134245490227e-06, + "loss": 2.3183, + "step": 3970 + }, + { + "epoch": 2.657607559476523, + "grad_norm": 4.887608528137207, + "learning_rate": 1.9545785186911887e-06, + "loss": 2.4532, + "step": 3971 + }, + { + "epoch": 2.658276539699795, + "grad_norm": 6.764144420623779, + "learning_rate": 1.9470367757800867e-06, + "loss": 2.7451, + "step": 3972 + }, + { + "epoch": 2.6589455199230674, + "grad_norm": 6.065435409545898, + "learning_rate": 1.939509021333219e-06, + "loss": 2.4556, + "step": 3973 + }, + { + "epoch": 2.6596145001463394, + "grad_norm": 5.688309192657471, + "learning_rate": 1.9319952599184066e-06, + "loss": 2.2635, + "step": 3974 + }, + { + "epoch": 2.6602834803696114, + "grad_norm": 4.4196696281433105, + "learning_rate": 1.9244954960949775e-06, + "loss": 2.2008, + "step": 3975 + }, + { + "epoch": 2.660952460592884, + "grad_norm": 6.364865303039551, + "learning_rate": 1.9170097344137603e-06, + "loss": 2.3602, + "step": 3976 + }, + { + "epoch": 2.661621440816156, + "grad_norm": 6.421303749084473, + "learning_rate": 1.909537979417089e-06, + "loss": 2.3367, + "step": 3977 + }, + { + "epoch": 2.662290421039428, + "grad_norm": 7.493662357330322, + "learning_rate": 1.902080235638809e-06, + "loss": 2.3905, + "step": 3978 + }, + { + "epoch": 2.6629594012627003, + "grad_norm": 4.8052754402160645, + "learning_rate": 1.894636507604247e-06, + "loss": 2.0476, + "step": 3979 + }, + { + "epoch": 2.6636283814859723, + "grad_norm": 6.926331996917725, + "learning_rate": 1.8872067998302406e-06, + "loss": 2.5446, + "step": 3980 + }, + { + "epoch": 2.6642973617092442, + "grad_norm": 7.540713310241699, + "learning_rate": 1.879791116825108e-06, + "loss": 2.8578, + "step": 3981 + }, + { + "epoch": 2.6649663419325167, + "grad_norm": 6.341775417327881, + "learning_rate": 1.872389463088664e-06, + "loss": 2.3347, + "step": 3982 + }, + { + "epoch": 2.6656353221557887, + "grad_norm": 4.927845478057861, + "learning_rate": 1.8650018431122047e-06, + "loss": 2.2441, + "step": 3983 + }, + { + "epoch": 2.6663043023790607, + "grad_norm": 4.369083404541016, + "learning_rate": 1.857628261378519e-06, + "loss": 2.1603, + "step": 3984 + }, + { + "epoch": 2.666973282602333, + "grad_norm": 6.419517993927002, + "learning_rate": 1.850268722361867e-06, + "loss": 2.3367, + "step": 3985 + }, + { + "epoch": 2.667642262825605, + "grad_norm": 4.671262741088867, + "learning_rate": 1.8429232305280013e-06, + "loss": 2.1097, + "step": 3986 + }, + { + "epoch": 2.668311243048877, + "grad_norm": 8.195043563842773, + "learning_rate": 1.8355917903341347e-06, + "loss": 2.6112, + "step": 3987 + }, + { + "epoch": 2.6689802232721496, + "grad_norm": 4.158875465393066, + "learning_rate": 1.8282744062289691e-06, + "loss": 2.2482, + "step": 3988 + }, + { + "epoch": 2.6696492034954216, + "grad_norm": 4.787658214569092, + "learning_rate": 1.8209710826526644e-06, + "loss": 2.1707, + "step": 3989 + }, + { + "epoch": 2.6703181837186936, + "grad_norm": 6.542713642120361, + "learning_rate": 1.8136818240368587e-06, + "loss": 2.4107, + "step": 3990 + }, + { + "epoch": 2.670987163941966, + "grad_norm": 5.082743167877197, + "learning_rate": 1.8064066348046416e-06, + "loss": 2.3402, + "step": 3991 + }, + { + "epoch": 2.671656144165238, + "grad_norm": 5.794971942901611, + "learning_rate": 1.7991455193705847e-06, + "loss": 2.536, + "step": 3992 + }, + { + "epoch": 2.67232512438851, + "grad_norm": 6.1262431144714355, + "learning_rate": 1.7918984821407025e-06, + "loss": 2.4755, + "step": 3993 + }, + { + "epoch": 2.6729941046117824, + "grad_norm": 7.816492080688477, + "learning_rate": 1.784665527512483e-06, + "loss": 2.786, + "step": 3994 + }, + { + "epoch": 2.6736630848350544, + "grad_norm": 5.697868347167969, + "learning_rate": 1.7774466598748458e-06, + "loss": 2.3623, + "step": 3995 + }, + { + "epoch": 2.674332065058327, + "grad_norm": 4.466985702514648, + "learning_rate": 1.7702418836081925e-06, + "loss": 2.178, + "step": 3996 + }, + { + "epoch": 2.675001045281599, + "grad_norm": 5.313891887664795, + "learning_rate": 1.7630512030843482e-06, + "loss": 2.4079, + "step": 3997 + }, + { + "epoch": 2.675670025504871, + "grad_norm": 6.489355087280273, + "learning_rate": 1.7558746226665978e-06, + "loss": 2.2602, + "step": 3998 + }, + { + "epoch": 2.6763390057281433, + "grad_norm": 6.1382575035095215, + "learning_rate": 1.7487121467096634e-06, + "loss": 2.6399, + "step": 3999 + }, + { + "epoch": 2.6770079859514153, + "grad_norm": 5.874249458312988, + "learning_rate": 1.7415637795597183e-06, + "loss": 2.4056, + "step": 4000 + }, + { + "epoch": 2.6776769661746873, + "grad_norm": 4.221576690673828, + "learning_rate": 1.7344295255543652e-06, + "loss": 2.3144, + "step": 4001 + }, + { + "epoch": 2.6783459463979598, + "grad_norm": 8.345187187194824, + "learning_rate": 1.7273093890226495e-06, + "loss": 2.6859, + "step": 4002 + }, + { + "epoch": 2.6790149266212318, + "grad_norm": 7.682225704193115, + "learning_rate": 1.7202033742850455e-06, + "loss": 2.3924, + "step": 4003 + }, + { + "epoch": 2.6796839068445037, + "grad_norm": 5.902778148651123, + "learning_rate": 1.7131114856534597e-06, + "loss": 2.2922, + "step": 4004 + }, + { + "epoch": 2.680352887067776, + "grad_norm": 5.096508979797363, + "learning_rate": 1.706033727431225e-06, + "loss": 2.2405, + "step": 4005 + }, + { + "epoch": 2.681021867291048, + "grad_norm": 6.383700370788574, + "learning_rate": 1.6989701039131083e-06, + "loss": 2.6369, + "step": 4006 + }, + { + "epoch": 2.6816908475143206, + "grad_norm": 5.253861427307129, + "learning_rate": 1.6919206193852871e-06, + "loss": 2.5559, + "step": 4007 + }, + { + "epoch": 2.6823598277375926, + "grad_norm": 5.715725421905518, + "learning_rate": 1.6848852781253726e-06, + "loss": 2.4958, + "step": 4008 + }, + { + "epoch": 2.6830288079608646, + "grad_norm": 6.184985160827637, + "learning_rate": 1.6778640844023807e-06, + "loss": 2.6148, + "step": 4009 + }, + { + "epoch": 2.683697788184137, + "grad_norm": 6.2680511474609375, + "learning_rate": 1.670857042476756e-06, + "loss": 2.3371, + "step": 4010 + }, + { + "epoch": 2.684366768407409, + "grad_norm": 5.275510311126709, + "learning_rate": 1.6638641566003476e-06, + "loss": 2.3032, + "step": 4011 + }, + { + "epoch": 2.685035748630681, + "grad_norm": 5.637404441833496, + "learning_rate": 1.656885431016414e-06, + "loss": 2.4423, + "step": 4012 + }, + { + "epoch": 2.6857047288539535, + "grad_norm": 6.447161674499512, + "learning_rate": 1.6499208699596202e-06, + "loss": 2.2069, + "step": 4013 + }, + { + "epoch": 2.6863737090772255, + "grad_norm": 4.44089412689209, + "learning_rate": 1.6429704776560473e-06, + "loss": 2.3369, + "step": 4014 + }, + { + "epoch": 2.6870426893004975, + "grad_norm": 6.151037693023682, + "learning_rate": 1.636034258323163e-06, + "loss": 2.5013, + "step": 4015 + }, + { + "epoch": 2.68771166952377, + "grad_norm": 7.591999053955078, + "learning_rate": 1.6291122161698507e-06, + "loss": 2.7118, + "step": 4016 + }, + { + "epoch": 2.688380649747042, + "grad_norm": 6.277866840362549, + "learning_rate": 1.6222043553963783e-06, + "loss": 2.7038, + "step": 4017 + }, + { + "epoch": 2.689049629970314, + "grad_norm": 5.451652526855469, + "learning_rate": 1.6153106801944178e-06, + "loss": 2.1162, + "step": 4018 + }, + { + "epoch": 2.6897186101935864, + "grad_norm": 9.076871871948242, + "learning_rate": 1.6084311947470259e-06, + "loss": 2.7891, + "step": 4019 + }, + { + "epoch": 2.6903875904168584, + "grad_norm": 10.142724990844727, + "learning_rate": 1.6015659032286522e-06, + "loss": 2.9686, + "step": 4020 + }, + { + "epoch": 2.6910565706401304, + "grad_norm": 6.541345596313477, + "learning_rate": 1.5947148098051313e-06, + "loss": 2.2838, + "step": 4021 + }, + { + "epoch": 2.691725550863403, + "grad_norm": 6.66045618057251, + "learning_rate": 1.587877918633693e-06, + "loss": 2.5367, + "step": 4022 + }, + { + "epoch": 2.692394531086675, + "grad_norm": 4.286031723022461, + "learning_rate": 1.5810552338629325e-06, + "loss": 2.5117, + "step": 4023 + }, + { + "epoch": 2.693063511309947, + "grad_norm": 5.302545070648193, + "learning_rate": 1.5742467596328386e-06, + "loss": 2.4019, + "step": 4024 + }, + { + "epoch": 2.6937324915332193, + "grad_norm": 4.215207576751709, + "learning_rate": 1.5674525000747702e-06, + "loss": 2.1977, + "step": 4025 + }, + { + "epoch": 2.6944014717564913, + "grad_norm": 6.151794910430908, + "learning_rate": 1.5606724593114602e-06, + "loss": 2.4523, + "step": 4026 + }, + { + "epoch": 2.6950704519797632, + "grad_norm": 6.316401958465576, + "learning_rate": 1.5539066414570146e-06, + "loss": 2.2525, + "step": 4027 + }, + { + "epoch": 2.6957394322030357, + "grad_norm": 4.581953525543213, + "learning_rate": 1.5471550506169163e-06, + "loss": 2.3974, + "step": 4028 + }, + { + "epoch": 2.6964084124263077, + "grad_norm": 4.979285717010498, + "learning_rate": 1.5404176908879992e-06, + "loss": 2.3108, + "step": 4029 + }, + { + "epoch": 2.6970773926495797, + "grad_norm": 6.403218746185303, + "learning_rate": 1.5336945663584845e-06, + "loss": 2.3841, + "step": 4030 + }, + { + "epoch": 2.697746372872852, + "grad_norm": 5.133539199829102, + "learning_rate": 1.5269856811079285e-06, + "loss": 2.3798, + "step": 4031 + }, + { + "epoch": 2.698415353096124, + "grad_norm": 5.453507423400879, + "learning_rate": 1.520291039207275e-06, + "loss": 2.4321, + "step": 4032 + }, + { + "epoch": 2.699084333319396, + "grad_norm": 4.932613372802734, + "learning_rate": 1.513610644718802e-06, + "loss": 2.4713, + "step": 4033 + }, + { + "epoch": 2.6997533135426686, + "grad_norm": 6.313424110412598, + "learning_rate": 1.506944501696156e-06, + "loss": 2.4277, + "step": 4034 + }, + { + "epoch": 2.7004222937659406, + "grad_norm": 5.967796325683594, + "learning_rate": 1.500292614184326e-06, + "loss": 2.4136, + "step": 4035 + }, + { + "epoch": 2.7010912739892126, + "grad_norm": 5.476470470428467, + "learning_rate": 1.4936549862196614e-06, + "loss": 2.3291, + "step": 4036 + }, + { + "epoch": 2.701760254212485, + "grad_norm": 5.608387470245361, + "learning_rate": 1.4870316218298485e-06, + "loss": 2.3928, + "step": 4037 + }, + { + "epoch": 2.702429234435757, + "grad_norm": 5.574318885803223, + "learning_rate": 1.4804225250339282e-06, + "loss": 2.4586, + "step": 4038 + }, + { + "epoch": 2.703098214659029, + "grad_norm": 6.428974151611328, + "learning_rate": 1.4738276998422756e-06, + "loss": 2.4952, + "step": 4039 + }, + { + "epoch": 2.7037671948823014, + "grad_norm": 6.895018100738525, + "learning_rate": 1.4672471502566176e-06, + "loss": 2.406, + "step": 4040 + }, + { + "epoch": 2.7044361751055734, + "grad_norm": 5.928206920623779, + "learning_rate": 1.4606808802700016e-06, + "loss": 2.648, + "step": 4041 + }, + { + "epoch": 2.7051051553288454, + "grad_norm": 6.693371295928955, + "learning_rate": 1.454128893866824e-06, + "loss": 2.7054, + "step": 4042 + }, + { + "epoch": 2.705774135552118, + "grad_norm": 7.416831016540527, + "learning_rate": 1.4475911950228015e-06, + "loss": 2.4132, + "step": 4043 + }, + { + "epoch": 2.70644311577539, + "grad_norm": 5.891541481018066, + "learning_rate": 1.441067787705e-06, + "loss": 2.0346, + "step": 4044 + }, + { + "epoch": 2.707112095998662, + "grad_norm": 5.5919623374938965, + "learning_rate": 1.4345586758717916e-06, + "loss": 2.2097, + "step": 4045 + }, + { + "epoch": 2.7077810762219343, + "grad_norm": 5.639929294586182, + "learning_rate": 1.428063863472895e-06, + "loss": 2.5591, + "step": 4046 + }, + { + "epoch": 2.7084500564452063, + "grad_norm": 5.647850036621094, + "learning_rate": 1.421583354449338e-06, + "loss": 2.4194, + "step": 4047 + }, + { + "epoch": 2.7091190366684783, + "grad_norm": 7.08728551864624, + "learning_rate": 1.4151171527334668e-06, + "loss": 2.6051, + "step": 4048 + }, + { + "epoch": 2.7097880168917508, + "grad_norm": 6.5381669998168945, + "learning_rate": 1.4086652622489537e-06, + "loss": 2.6292, + "step": 4049 + }, + { + "epoch": 2.7104569971150227, + "grad_norm": 5.33787727355957, + "learning_rate": 1.4022276869107925e-06, + "loss": 2.4832, + "step": 4050 + }, + { + "epoch": 2.7111259773382947, + "grad_norm": 7.742366313934326, + "learning_rate": 1.3958044306252749e-06, + "loss": 2.7243, + "step": 4051 + }, + { + "epoch": 2.711794957561567, + "grad_norm": 7.627612113952637, + "learning_rate": 1.389395497290019e-06, + "loss": 2.8328, + "step": 4052 + }, + { + "epoch": 2.712463937784839, + "grad_norm": 7.918229579925537, + "learning_rate": 1.3830008907939369e-06, + "loss": 2.6525, + "step": 4053 + }, + { + "epoch": 2.713132918008111, + "grad_norm": 5.160806655883789, + "learning_rate": 1.3766206150172655e-06, + "loss": 2.4521, + "step": 4054 + }, + { + "epoch": 2.7138018982313836, + "grad_norm": 6.1942667961120605, + "learning_rate": 1.3702546738315308e-06, + "loss": 2.2583, + "step": 4055 + }, + { + "epoch": 2.7144708784546556, + "grad_norm": 5.8853349685668945, + "learning_rate": 1.3639030710995622e-06, + "loss": 2.4746, + "step": 4056 + }, + { + "epoch": 2.7151398586779276, + "grad_norm": 4.415451526641846, + "learning_rate": 1.3575658106754929e-06, + "loss": 2.1843, + "step": 4057 + }, + { + "epoch": 2.7158088389012, + "grad_norm": 6.340837478637695, + "learning_rate": 1.3512428964047586e-06, + "loss": 2.2082, + "step": 4058 + }, + { + "epoch": 2.716477819124472, + "grad_norm": 6.054290771484375, + "learning_rate": 1.3449343321240732e-06, + "loss": 2.4201, + "step": 4059 + }, + { + "epoch": 2.717146799347744, + "grad_norm": 7.145515441894531, + "learning_rate": 1.3386401216614636e-06, + "loss": 2.5659, + "step": 4060 + }, + { + "epoch": 2.7178157795710165, + "grad_norm": 5.219954490661621, + "learning_rate": 1.3323602688362301e-06, + "loss": 2.2271, + "step": 4061 + }, + { + "epoch": 2.7184847597942885, + "grad_norm": 6.85549783706665, + "learning_rate": 1.3260947774589749e-06, + "loss": 2.1972, + "step": 4062 + }, + { + "epoch": 2.7191537400175605, + "grad_norm": 5.227082252502441, + "learning_rate": 1.3198436513315648e-06, + "loss": 2.4371, + "step": 4063 + }, + { + "epoch": 2.719822720240833, + "grad_norm": 5.493729114532471, + "learning_rate": 1.3136068942471742e-06, + "loss": 2.6991, + "step": 4064 + }, + { + "epoch": 2.720491700464105, + "grad_norm": 5.099348545074463, + "learning_rate": 1.3073845099902393e-06, + "loss": 2.1417, + "step": 4065 + }, + { + "epoch": 2.721160680687377, + "grad_norm": 4.658766746520996, + "learning_rate": 1.301176502336493e-06, + "loss": 2.323, + "step": 4066 + }, + { + "epoch": 2.7218296609106494, + "grad_norm": 5.050058841705322, + "learning_rate": 1.2949828750529246e-06, + "loss": 2.4467, + "step": 4067 + }, + { + "epoch": 2.7224986411339214, + "grad_norm": 7.359899997711182, + "learning_rate": 1.2888036318978143e-06, + "loss": 2.6114, + "step": 4068 + }, + { + "epoch": 2.7231676213571934, + "grad_norm": 6.556347846984863, + "learning_rate": 1.2826387766207037e-06, + "loss": 2.5856, + "step": 4069 + }, + { + "epoch": 2.723836601580466, + "grad_norm": 4.9075822830200195, + "learning_rate": 1.2764883129624094e-06, + "loss": 2.1575, + "step": 4070 + }, + { + "epoch": 2.724505581803738, + "grad_norm": 3.972384214401245, + "learning_rate": 1.2703522446550071e-06, + "loss": 2.3932, + "step": 4071 + }, + { + "epoch": 2.72517456202701, + "grad_norm": 4.784709453582764, + "learning_rate": 1.2642305754218541e-06, + "loss": 2.2837, + "step": 4072 + }, + { + "epoch": 2.7258435422502822, + "grad_norm": 4.90197229385376, + "learning_rate": 1.2581233089775462e-06, + "loss": 2.405, + "step": 4073 + }, + { + "epoch": 2.7265125224735542, + "grad_norm": 4.2208638191223145, + "learning_rate": 1.2520304490279661e-06, + "loss": 2.0995, + "step": 4074 + }, + { + "epoch": 2.7271815026968267, + "grad_norm": 5.6266584396362305, + "learning_rate": 1.2459519992702313e-06, + "loss": 2.1565, + "step": 4075 + }, + { + "epoch": 2.7278504829200987, + "grad_norm": 5.349244117736816, + "learning_rate": 1.239887963392733e-06, + "loss": 2.2806, + "step": 4076 + }, + { + "epoch": 2.7285194631433707, + "grad_norm": 4.174487590789795, + "learning_rate": 1.2338383450751056e-06, + "loss": 2.3757, + "step": 4077 + }, + { + "epoch": 2.729188443366643, + "grad_norm": 7.897218227386475, + "learning_rate": 1.2278031479882408e-06, + "loss": 2.4715, + "step": 4078 + }, + { + "epoch": 2.729857423589915, + "grad_norm": 6.350612163543701, + "learning_rate": 1.2217823757942698e-06, + "loss": 2.4657, + "step": 4079 + }, + { + "epoch": 2.730526403813187, + "grad_norm": 9.551507949829102, + "learning_rate": 1.2157760321465873e-06, + "loss": 2.6196, + "step": 4080 + }, + { + "epoch": 2.7311953840364596, + "grad_norm": 5.328859806060791, + "learning_rate": 1.2097841206898137e-06, + "loss": 2.3488, + "step": 4081 + }, + { + "epoch": 2.7318643642597316, + "grad_norm": 5.380430698394775, + "learning_rate": 1.2038066450598296e-06, + "loss": 2.4842, + "step": 4082 + }, + { + "epoch": 2.7325333444830036, + "grad_norm": 4.378293991088867, + "learning_rate": 1.1978436088837446e-06, + "loss": 2.1248, + "step": 4083 + }, + { + "epoch": 2.733202324706276, + "grad_norm": 7.890618801116943, + "learning_rate": 1.1918950157799147e-06, + "loss": 2.4735, + "step": 4084 + }, + { + "epoch": 2.733871304929548, + "grad_norm": 5.002713203430176, + "learning_rate": 1.185960869357916e-06, + "loss": 2.2967, + "step": 4085 + }, + { + "epoch": 2.7345402851528204, + "grad_norm": 5.634923458099365, + "learning_rate": 1.1800411732185822e-06, + "loss": 2.3365, + "step": 4086 + }, + { + "epoch": 2.7352092653760924, + "grad_norm": 6.980934143066406, + "learning_rate": 1.174135930953954e-06, + "loss": 2.6051, + "step": 4087 + }, + { + "epoch": 2.7358782455993644, + "grad_norm": 5.857748985290527, + "learning_rate": 1.1682451461473259e-06, + "loss": 2.3321, + "step": 4088 + }, + { + "epoch": 2.736547225822637, + "grad_norm": 7.785706043243408, + "learning_rate": 1.1623688223731943e-06, + "loss": 2.5815, + "step": 4089 + }, + { + "epoch": 2.737216206045909, + "grad_norm": 9.294645309448242, + "learning_rate": 1.1565069631973068e-06, + "loss": 2.4293, + "step": 4090 + }, + { + "epoch": 2.737885186269181, + "grad_norm": 5.226867198944092, + "learning_rate": 1.1506595721766129e-06, + "loss": 2.2245, + "step": 4091 + }, + { + "epoch": 2.7385541664924533, + "grad_norm": 6.057205677032471, + "learning_rate": 1.1448266528592933e-06, + "loss": 2.4675, + "step": 4092 + }, + { + "epoch": 2.7392231467157253, + "grad_norm": 6.411067485809326, + "learning_rate": 1.1390082087847393e-06, + "loss": 2.4494, + "step": 4093 + }, + { + "epoch": 2.7398921269389973, + "grad_norm": 5.002102851867676, + "learning_rate": 1.133204243483571e-06, + "loss": 2.1759, + "step": 4094 + }, + { + "epoch": 2.7405611071622697, + "grad_norm": 4.812438011169434, + "learning_rate": 1.127414760477613e-06, + "loss": 2.3496, + "step": 4095 + }, + { + "epoch": 2.7412300873855417, + "grad_norm": 5.3491740226745605, + "learning_rate": 1.1216397632799053e-06, + "loss": 2.433, + "step": 4096 + }, + { + "epoch": 2.7418990676088137, + "grad_norm": 5.9710235595703125, + "learning_rate": 1.1158792553946972e-06, + "loss": 2.4126, + "step": 4097 + }, + { + "epoch": 2.742568047832086, + "grad_norm": 5.2846832275390625, + "learning_rate": 1.1101332403174485e-06, + "loss": 2.3525, + "step": 4098 + }, + { + "epoch": 2.743237028055358, + "grad_norm": 5.906856536865234, + "learning_rate": 1.104401721534823e-06, + "loss": 2.2795, + "step": 4099 + }, + { + "epoch": 2.74390600827863, + "grad_norm": 7.289385795593262, + "learning_rate": 1.0986847025246854e-06, + "loss": 2.5107, + "step": 4100 + }, + { + "epoch": 2.7445749885019026, + "grad_norm": 5.033178806304932, + "learning_rate": 1.092982186756103e-06, + "loss": 2.1139, + "step": 4101 + }, + { + "epoch": 2.7452439687251746, + "grad_norm": 5.149725914001465, + "learning_rate": 1.0872941776893492e-06, + "loss": 2.3101, + "step": 4102 + }, + { + "epoch": 2.7459129489484466, + "grad_norm": 7.19143009185791, + "learning_rate": 1.0816206787758853e-06, + "loss": 2.171, + "step": 4103 + }, + { + "epoch": 2.746581929171719, + "grad_norm": 7.696903705596924, + "learning_rate": 1.0759616934583744e-06, + "loss": 2.4951, + "step": 4104 + }, + { + "epoch": 2.747250909394991, + "grad_norm": 4.7374396324157715, + "learning_rate": 1.0703172251706694e-06, + "loss": 2.2904, + "step": 4105 + }, + { + "epoch": 2.747919889618263, + "grad_norm": 5.831841945648193, + "learning_rate": 1.0646872773378224e-06, + "loss": 2.4245, + "step": 4106 + }, + { + "epoch": 2.7485888698415355, + "grad_norm": 7.000763416290283, + "learning_rate": 1.0590718533760534e-06, + "loss": 2.4687, + "step": 4107 + }, + { + "epoch": 2.7492578500648075, + "grad_norm": 8.514759063720703, + "learning_rate": 1.053470956692798e-06, + "loss": 2.542, + "step": 4108 + }, + { + "epoch": 2.7499268302880795, + "grad_norm": 7.83512020111084, + "learning_rate": 1.0478845906866515e-06, + "loss": 2.3176, + "step": 4109 + }, + { + "epoch": 2.750595810511352, + "grad_norm": 5.963386058807373, + "learning_rate": 1.04231275874741e-06, + "loss": 2.5646, + "step": 4110 + }, + { + "epoch": 2.751264790734624, + "grad_norm": 5.898660659790039, + "learning_rate": 1.0367554642560396e-06, + "loss": 2.6097, + "step": 4111 + }, + { + "epoch": 2.751933770957896, + "grad_norm": 6.957448959350586, + "learning_rate": 1.0312127105846947e-06, + "loss": 2.3949, + "step": 4112 + }, + { + "epoch": 2.7526027511811684, + "grad_norm": 4.711306095123291, + "learning_rate": 1.0256845010966937e-06, + "loss": 2.1952, + "step": 4113 + }, + { + "epoch": 2.7532717314044404, + "grad_norm": 6.216196537017822, + "learning_rate": 1.0201708391465392e-06, + "loss": 2.5061, + "step": 4114 + }, + { + "epoch": 2.7539407116277124, + "grad_norm": 6.628659725189209, + "learning_rate": 1.0146717280798995e-06, + "loss": 2.4795, + "step": 4115 + }, + { + "epoch": 2.754609691850985, + "grad_norm": 5.836121082305908, + "learning_rate": 1.0091871712336248e-06, + "loss": 2.3887, + "step": 4116 + }, + { + "epoch": 2.755278672074257, + "grad_norm": 5.682483673095703, + "learning_rate": 1.0037171719357198e-06, + "loss": 2.6356, + "step": 4117 + }, + { + "epoch": 2.755947652297529, + "grad_norm": 6.360892295837402, + "learning_rate": 9.982617335053706e-07, + "loss": 2.1671, + "step": 4118 + }, + { + "epoch": 2.7566166325208012, + "grad_norm": 6.978191375732422, + "learning_rate": 9.928208592529075e-07, + "loss": 2.3249, + "step": 4119 + }, + { + "epoch": 2.7572856127440732, + "grad_norm": 5.7712836265563965, + "learning_rate": 9.873945524798505e-07, + "loss": 2.5368, + "step": 4120 + }, + { + "epoch": 2.7579545929673452, + "grad_norm": 4.721527099609375, + "learning_rate": 9.819828164788546e-07, + "loss": 2.2627, + "step": 4121 + }, + { + "epoch": 2.7586235731906177, + "grad_norm": 9.053016662597656, + "learning_rate": 9.765856545337486e-07, + "loss": 2.5034, + "step": 4122 + }, + { + "epoch": 2.7592925534138897, + "grad_norm": 4.110961437225342, + "learning_rate": 9.7120306991951e-07, + "loss": 2.3633, + "step": 4123 + }, + { + "epoch": 2.7599615336371617, + "grad_norm": 5.292178630828857, + "learning_rate": 9.658350659022764e-07, + "loss": 2.3943, + "step": 4124 + }, + { + "epoch": 2.760630513860434, + "grad_norm": 5.479394912719727, + "learning_rate": 9.604816457393306e-07, + "loss": 2.304, + "step": 4125 + }, + { + "epoch": 2.761299494083706, + "grad_norm": 6.752635955810547, + "learning_rate": 9.551428126791189e-07, + "loss": 2.3515, + "step": 4126 + }, + { + "epoch": 2.761968474306978, + "grad_norm": 5.404762268066406, + "learning_rate": 9.498185699612222e-07, + "loss": 2.0058, + "step": 4127 + }, + { + "epoch": 2.7626374545302506, + "grad_norm": 4.209757328033447, + "learning_rate": 9.445089208163782e-07, + "loss": 2.2546, + "step": 4128 + }, + { + "epoch": 2.7633064347535226, + "grad_norm": 5.685204029083252, + "learning_rate": 9.39213868466457e-07, + "loss": 2.2413, + "step": 4129 + }, + { + "epoch": 2.7639754149767946, + "grad_norm": 5.597134590148926, + "learning_rate": 9.339334161244884e-07, + "loss": 2.5162, + "step": 4130 + }, + { + "epoch": 2.764644395200067, + "grad_norm": 8.365514755249023, + "learning_rate": 9.28667566994626e-07, + "loss": 2.3021, + "step": 4131 + }, + { + "epoch": 2.765313375423339, + "grad_norm": 5.057498931884766, + "learning_rate": 9.234163242721805e-07, + "loss": 2.6119, + "step": 4132 + }, + { + "epoch": 2.765982355646611, + "grad_norm": 5.4875168800354, + "learning_rate": 9.181796911435781e-07, + "loss": 2.5006, + "step": 4133 + }, + { + "epoch": 2.7666513358698834, + "grad_norm": 6.51283073425293, + "learning_rate": 9.12957670786399e-07, + "loss": 2.6233, + "step": 4134 + }, + { + "epoch": 2.7673203160931554, + "grad_norm": 5.691632270812988, + "learning_rate": 9.077502663693449e-07, + "loss": 2.4109, + "step": 4135 + }, + { + "epoch": 2.7679892963164274, + "grad_norm": 5.830533027648926, + "learning_rate": 9.025574810522547e-07, + "loss": 2.4478, + "step": 4136 + }, + { + "epoch": 2.7686582765397, + "grad_norm": 5.449154853820801, + "learning_rate": 8.973793179860857e-07, + "loss": 2.257, + "step": 4137 + }, + { + "epoch": 2.769327256762972, + "grad_norm": 4.6618876457214355, + "learning_rate": 8.922157803129411e-07, + "loss": 2.4382, + "step": 4138 + }, + { + "epoch": 2.769996236986244, + "grad_norm": 4.898151874542236, + "learning_rate": 8.870668711660341e-07, + "loss": 2.2473, + "step": 4139 + }, + { + "epoch": 2.7706652172095163, + "grad_norm": 4.282668113708496, + "learning_rate": 8.819325936697071e-07, + "loss": 2.3731, + "step": 4140 + }, + { + "epoch": 2.7713341974327883, + "grad_norm": 6.967705726623535, + "learning_rate": 8.768129509394207e-07, + "loss": 2.458, + "step": 4141 + }, + { + "epoch": 2.7720031776560603, + "grad_norm": 5.49399471282959, + "learning_rate": 8.717079460817651e-07, + "loss": 2.3649, + "step": 4142 + }, + { + "epoch": 2.7726721578793327, + "grad_norm": 5.348742961883545, + "learning_rate": 8.666175821944317e-07, + "loss": 2.3196, + "step": 4143 + }, + { + "epoch": 2.7733411381026047, + "grad_norm": 6.487933158874512, + "learning_rate": 8.615418623662441e-07, + "loss": 2.4778, + "step": 4144 + }, + { + "epoch": 2.7740101183258767, + "grad_norm": 5.777150630950928, + "learning_rate": 8.564807896771276e-07, + "loss": 2.3753, + "step": 4145 + }, + { + "epoch": 2.774679098549149, + "grad_norm": 5.667761325836182, + "learning_rate": 8.514343671981284e-07, + "loss": 2.4965, + "step": 4146 + }, + { + "epoch": 2.775348078772421, + "grad_norm": 5.203721046447754, + "learning_rate": 8.46402597991397e-07, + "loss": 2.1892, + "step": 4147 + }, + { + "epoch": 2.776017058995693, + "grad_norm": 5.296346664428711, + "learning_rate": 8.413854851101965e-07, + "loss": 2.1589, + "step": 4148 + }, + { + "epoch": 2.7766860392189656, + "grad_norm": 5.345564842224121, + "learning_rate": 8.363830315988947e-07, + "loss": 2.4215, + "step": 4149 + }, + { + "epoch": 2.7773550194422376, + "grad_norm": 5.400369644165039, + "learning_rate": 8.31395240492966e-07, + "loss": 2.2363, + "step": 4150 + }, + { + "epoch": 2.7780239996655096, + "grad_norm": 6.626008033752441, + "learning_rate": 8.26422114818981e-07, + "loss": 2.5988, + "step": 4151 + }, + { + "epoch": 2.778692979888782, + "grad_norm": 6.0312676429748535, + "learning_rate": 8.214636575946232e-07, + "loss": 2.2416, + "step": 4152 + }, + { + "epoch": 2.779361960112054, + "grad_norm": 3.866189479827881, + "learning_rate": 8.165198718286609e-07, + "loss": 2.3186, + "step": 4153 + }, + { + "epoch": 2.7800309403353265, + "grad_norm": 7.005765914916992, + "learning_rate": 8.11590760520975e-07, + "loss": 2.4311, + "step": 4154 + }, + { + "epoch": 2.7806999205585985, + "grad_norm": 6.1722025871276855, + "learning_rate": 8.066763266625282e-07, + "loss": 2.4592, + "step": 4155 + }, + { + "epoch": 2.7813689007818705, + "grad_norm": 6.352249622344971, + "learning_rate": 8.017765732353883e-07, + "loss": 2.5466, + "step": 4156 + }, + { + "epoch": 2.782037881005143, + "grad_norm": 3.966346025466919, + "learning_rate": 7.968915032127073e-07, + "loss": 2.1218, + "step": 4157 + }, + { + "epoch": 2.782706861228415, + "grad_norm": 6.1780242919921875, + "learning_rate": 7.920211195587335e-07, + "loss": 2.5202, + "step": 4158 + }, + { + "epoch": 2.783375841451687, + "grad_norm": 4.9341349601745605, + "learning_rate": 7.871654252287919e-07, + "loss": 2.3337, + "step": 4159 + }, + { + "epoch": 2.7840448216749594, + "grad_norm": 4.46893310546875, + "learning_rate": 7.823244231693088e-07, + "loss": 2.3951, + "step": 4160 + }, + { + "epoch": 2.7847138018982314, + "grad_norm": 6.117650032043457, + "learning_rate": 7.774981163177875e-07, + "loss": 2.108, + "step": 4161 + }, + { + "epoch": 2.785382782121504, + "grad_norm": 5.080147743225098, + "learning_rate": 7.726865076028183e-07, + "loss": 2.169, + "step": 4162 + }, + { + "epoch": 2.786051762344776, + "grad_norm": 2.893040895462036, + "learning_rate": 7.678895999440633e-07, + "loss": 1.9607, + "step": 4163 + }, + { + "epoch": 2.786720742568048, + "grad_norm": 6.5184149742126465, + "learning_rate": 7.631073962522772e-07, + "loss": 2.3553, + "step": 4164 + }, + { + "epoch": 2.7873897227913202, + "grad_norm": 8.106980323791504, + "learning_rate": 7.583398994292834e-07, + "loss": 2.2538, + "step": 4165 + }, + { + "epoch": 2.7880587030145922, + "grad_norm": 6.6520256996154785, + "learning_rate": 7.535871123679816e-07, + "loss": 2.5547, + "step": 4166 + }, + { + "epoch": 2.7887276832378642, + "grad_norm": 5.381551265716553, + "learning_rate": 7.48849037952351e-07, + "loss": 2.2933, + "step": 4167 + }, + { + "epoch": 2.7893966634611367, + "grad_norm": 6.773391246795654, + "learning_rate": 7.441256790574391e-07, + "loss": 2.1535, + "step": 4168 + }, + { + "epoch": 2.7900656436844087, + "grad_norm": 3.780353546142578, + "learning_rate": 7.394170385493615e-07, + "loss": 2.2979, + "step": 4169 + }, + { + "epoch": 2.7907346239076807, + "grad_norm": 7.604673862457275, + "learning_rate": 7.347231192853105e-07, + "loss": 2.7916, + "step": 4170 + }, + { + "epoch": 2.791403604130953, + "grad_norm": 5.448803901672363, + "learning_rate": 7.300439241135437e-07, + "loss": 2.469, + "step": 4171 + }, + { + "epoch": 2.792072584354225, + "grad_norm": 4.242155075073242, + "learning_rate": 7.253794558733734e-07, + "loss": 2.213, + "step": 4172 + }, + { + "epoch": 2.792741564577497, + "grad_norm": 4.53715705871582, + "learning_rate": 7.207297173951883e-07, + "loss": 2.1587, + "step": 4173 + }, + { + "epoch": 2.7934105448007696, + "grad_norm": 6.353997230529785, + "learning_rate": 7.160947115004397e-07, + "loss": 2.2994, + "step": 4174 + }, + { + "epoch": 2.7940795250240416, + "grad_norm": 6.6785383224487305, + "learning_rate": 7.11474441001625e-07, + "loss": 2.5409, + "step": 4175 + }, + { + "epoch": 2.7947485052473136, + "grad_norm": 6.029831886291504, + "learning_rate": 7.068689087023156e-07, + "loss": 2.561, + "step": 4176 + }, + { + "epoch": 2.795417485470586, + "grad_norm": 4.117969512939453, + "learning_rate": 7.022781173971316e-07, + "loss": 2.1443, + "step": 4177 + }, + { + "epoch": 2.796086465693858, + "grad_norm": 5.290247917175293, + "learning_rate": 6.977020698717529e-07, + "loss": 2.3684, + "step": 4178 + }, + { + "epoch": 2.79675544591713, + "grad_norm": 4.087730884552002, + "learning_rate": 6.931407689029112e-07, + "loss": 2.2693, + "step": 4179 + }, + { + "epoch": 2.7974244261404024, + "grad_norm": 5.306834697723389, + "learning_rate": 6.885942172583843e-07, + "loss": 2.2581, + "step": 4180 + }, + { + "epoch": 2.7980934063636744, + "grad_norm": 6.436861991882324, + "learning_rate": 6.840624176970068e-07, + "loss": 2.4854, + "step": 4181 + }, + { + "epoch": 2.7987623865869464, + "grad_norm": 5.202316761016846, + "learning_rate": 6.795453729686624e-07, + "loss": 2.2998, + "step": 4182 + }, + { + "epoch": 2.799431366810219, + "grad_norm": 6.2068939208984375, + "learning_rate": 6.750430858142753e-07, + "loss": 2.4708, + "step": 4183 + }, + { + "epoch": 2.800100347033491, + "grad_norm": 6.2275872230529785, + "learning_rate": 6.705555589658242e-07, + "loss": 2.4316, + "step": 4184 + }, + { + "epoch": 2.800769327256763, + "grad_norm": 7.06058931350708, + "learning_rate": 6.660827951463222e-07, + "loss": 2.4336, + "step": 4185 + }, + { + "epoch": 2.8014383074800353, + "grad_norm": 4.736634254455566, + "learning_rate": 6.616247970698319e-07, + "loss": 2.3723, + "step": 4186 + }, + { + "epoch": 2.8021072877033073, + "grad_norm": 5.265098571777344, + "learning_rate": 6.571815674414506e-07, + "loss": 2.2787, + "step": 4187 + }, + { + "epoch": 2.8027762679265793, + "grad_norm": 6.323796272277832, + "learning_rate": 6.52753108957313e-07, + "loss": 2.4384, + "step": 4188 + }, + { + "epoch": 2.8034452481498517, + "grad_norm": 6.203528881072998, + "learning_rate": 6.483394243045948e-07, + "loss": 2.2997, + "step": 4189 + }, + { + "epoch": 2.8041142283731237, + "grad_norm": 5.526379585266113, + "learning_rate": 6.439405161615092e-07, + "loss": 2.4637, + "step": 4190 + }, + { + "epoch": 2.8047832085963957, + "grad_norm": 6.816434383392334, + "learning_rate": 6.395563871972932e-07, + "loss": 2.522, + "step": 4191 + }, + { + "epoch": 2.805452188819668, + "grad_norm": 6.053694725036621, + "learning_rate": 6.351870400722271e-07, + "loss": 2.3534, + "step": 4192 + }, + { + "epoch": 2.80612116904294, + "grad_norm": 6.001242637634277, + "learning_rate": 6.308324774376179e-07, + "loss": 2.5279, + "step": 4193 + }, + { + "epoch": 2.806790149266212, + "grad_norm": 6.614064693450928, + "learning_rate": 6.264927019357963e-07, + "loss": 2.4155, + "step": 4194 + }, + { + "epoch": 2.8074591294894846, + "grad_norm": 5.292637348175049, + "learning_rate": 6.221677162001227e-07, + "loss": 2.627, + "step": 4195 + }, + { + "epoch": 2.8081281097127566, + "grad_norm": 4.823739528656006, + "learning_rate": 6.178575228549921e-07, + "loss": 2.329, + "step": 4196 + }, + { + "epoch": 2.8087970899360286, + "grad_norm": 6.928796291351318, + "learning_rate": 6.135621245158068e-07, + "loss": 2.4053, + "step": 4197 + }, + { + "epoch": 2.809466070159301, + "grad_norm": 5.607747554779053, + "learning_rate": 6.092815237890098e-07, + "loss": 2.2722, + "step": 4198 + }, + { + "epoch": 2.810135050382573, + "grad_norm": 5.910256385803223, + "learning_rate": 6.050157232720482e-07, + "loss": 2.4634, + "step": 4199 + }, + { + "epoch": 2.810804030605845, + "grad_norm": 5.091660022735596, + "learning_rate": 6.007647255534015e-07, + "loss": 2.5646, + "step": 4200 + }, + { + "epoch": 2.8114730108291175, + "grad_norm": 5.403104782104492, + "learning_rate": 5.965285332125592e-07, + "loss": 2.3601, + "step": 4201 + }, + { + "epoch": 2.8121419910523895, + "grad_norm": 5.757556915283203, + "learning_rate": 5.923071488200316e-07, + "loss": 2.3875, + "step": 4202 + }, + { + "epoch": 2.8128109712756615, + "grad_norm": 6.980268955230713, + "learning_rate": 5.881005749373336e-07, + "loss": 2.5182, + "step": 4203 + }, + { + "epoch": 2.813479951498934, + "grad_norm": 7.380778789520264, + "learning_rate": 5.839088141170096e-07, + "loss": 2.5465, + "step": 4204 + }, + { + "epoch": 2.814148931722206, + "grad_norm": 5.712233066558838, + "learning_rate": 5.797318689026027e-07, + "loss": 2.4651, + "step": 4205 + }, + { + "epoch": 2.814817911945478, + "grad_norm": 4.818029403686523, + "learning_rate": 5.755697418286715e-07, + "loss": 2.0927, + "step": 4206 + }, + { + "epoch": 2.8154868921687504, + "grad_norm": 5.185235023498535, + "learning_rate": 5.714224354207792e-07, + "loss": 2.249, + "step": 4207 + }, + { + "epoch": 2.8161558723920224, + "grad_norm": 5.423341751098633, + "learning_rate": 5.672899521954988e-07, + "loss": 2.2593, + "step": 4208 + }, + { + "epoch": 2.8168248526152944, + "grad_norm": 4.145195484161377, + "learning_rate": 5.631722946604107e-07, + "loss": 2.3299, + "step": 4209 + }, + { + "epoch": 2.817493832838567, + "grad_norm": 6.880926609039307, + "learning_rate": 5.590694653140937e-07, + "loss": 2.3217, + "step": 4210 + }, + { + "epoch": 2.818162813061839, + "grad_norm": 6.196890830993652, + "learning_rate": 5.549814666461289e-07, + "loss": 2.3213, + "step": 4211 + }, + { + "epoch": 2.818831793285111, + "grad_norm": 4.889017581939697, + "learning_rate": 5.509083011371042e-07, + "loss": 2.3322, + "step": 4212 + }, + { + "epoch": 2.8195007735083832, + "grad_norm": 5.942519664764404, + "learning_rate": 5.468499712586034e-07, + "loss": 2.3673, + "step": 4213 + }, + { + "epoch": 2.8201697537316552, + "grad_norm": 5.434177398681641, + "learning_rate": 5.428064794732096e-07, + "loss": 2.3599, + "step": 4214 + }, + { + "epoch": 2.8208387339549272, + "grad_norm": 7.173910617828369, + "learning_rate": 5.387778282344963e-07, + "loss": 2.4231, + "step": 4215 + }, + { + "epoch": 2.8215077141781997, + "grad_norm": 5.432707786560059, + "learning_rate": 5.347640199870385e-07, + "loss": 2.2901, + "step": 4216 + }, + { + "epoch": 2.8221766944014717, + "grad_norm": 7.470532417297363, + "learning_rate": 5.307650571664019e-07, + "loss": 2.3276, + "step": 4217 + }, + { + "epoch": 2.8228456746247437, + "grad_norm": 5.786896228790283, + "learning_rate": 5.267809421991454e-07, + "loss": 2.3759, + "step": 4218 + }, + { + "epoch": 2.823514654848016, + "grad_norm": 6.176935195922852, + "learning_rate": 5.228116775028131e-07, + "loss": 2.2659, + "step": 4219 + }, + { + "epoch": 2.824183635071288, + "grad_norm": 4.736682415008545, + "learning_rate": 5.188572654859475e-07, + "loss": 2.187, + "step": 4220 + }, + { + "epoch": 2.82485261529456, + "grad_norm": 7.183059215545654, + "learning_rate": 5.149177085480711e-07, + "loss": 2.4874, + "step": 4221 + }, + { + "epoch": 2.8255215955178326, + "grad_norm": 5.517785549163818, + "learning_rate": 5.109930090796938e-07, + "loss": 2.4399, + "step": 4222 + }, + { + "epoch": 2.8261905757411045, + "grad_norm": 6.00811243057251, + "learning_rate": 5.070831694623135e-07, + "loss": 2.3459, + "step": 4223 + }, + { + "epoch": 2.8268595559643765, + "grad_norm": 6.219573020935059, + "learning_rate": 5.031881920684045e-07, + "loss": 2.454, + "step": 4224 + }, + { + "epoch": 2.827528536187649, + "grad_norm": 5.692752838134766, + "learning_rate": 4.993080792614264e-07, + "loss": 2.2848, + "step": 4225 + }, + { + "epoch": 2.828197516410921, + "grad_norm": 5.466771602630615, + "learning_rate": 4.954428333958233e-07, + "loss": 2.1424, + "step": 4226 + }, + { + "epoch": 2.828866496634193, + "grad_norm": 7.350508213043213, + "learning_rate": 4.915924568170083e-07, + "loss": 2.6347, + "step": 4227 + }, + { + "epoch": 2.8295354768574654, + "grad_norm": 6.284493446350098, + "learning_rate": 4.877569518613845e-07, + "loss": 2.3702, + "step": 4228 + }, + { + "epoch": 2.8302044570807374, + "grad_norm": 6.324103355407715, + "learning_rate": 4.83936320856318e-07, + "loss": 2.3992, + "step": 4229 + }, + { + "epoch": 2.83087343730401, + "grad_norm": 6.568131923675537, + "learning_rate": 4.8013056612016e-07, + "loss": 2.2286, + "step": 4230 + }, + { + "epoch": 2.831542417527282, + "grad_norm": 5.109436988830566, + "learning_rate": 4.763396899622269e-07, + "loss": 2.5888, + "step": 4231 + }, + { + "epoch": 2.832211397750554, + "grad_norm": 8.006053924560547, + "learning_rate": 4.7256369468281216e-07, + "loss": 2.4717, + "step": 4232 + }, + { + "epoch": 2.8328803779738263, + "grad_norm": 7.116024494171143, + "learning_rate": 4.6880258257317455e-07, + "loss": 2.3607, + "step": 4233 + }, + { + "epoch": 2.8335493581970983, + "grad_norm": 6.0267333984375, + "learning_rate": 4.650563559155496e-07, + "loss": 2.4576, + "step": 4234 + }, + { + "epoch": 2.8342183384203703, + "grad_norm": 5.429437160491943, + "learning_rate": 4.613250169831301e-07, + "loss": 2.4184, + "step": 4235 + }, + { + "epoch": 2.8348873186436427, + "grad_norm": 5.864874839782715, + "learning_rate": 4.5760856804008824e-07, + "loss": 2.2287, + "step": 4236 + }, + { + "epoch": 2.8355562988669147, + "grad_norm": 8.120298385620117, + "learning_rate": 4.539070113415478e-07, + "loss": 2.3324, + "step": 4237 + }, + { + "epoch": 2.8362252790901867, + "grad_norm": 5.091477870941162, + "learning_rate": 4.50220349133601e-07, + "loss": 2.382, + "step": 4238 + }, + { + "epoch": 2.836894259313459, + "grad_norm": 6.633947849273682, + "learning_rate": 4.465485836533029e-07, + "loss": 2.3962, + "step": 4239 + }, + { + "epoch": 2.837563239536731, + "grad_norm": 4.840530872344971, + "learning_rate": 4.4289171712867394e-07, + "loss": 2.2686, + "step": 4240 + }, + { + "epoch": 2.8382322197600036, + "grad_norm": 5.4004387855529785, + "learning_rate": 4.392497517786809e-07, + "loss": 2.4506, + "step": 4241 + }, + { + "epoch": 2.8389011999832756, + "grad_norm": 6.139686107635498, + "learning_rate": 4.356226898132643e-07, + "loss": 2.2481, + "step": 4242 + }, + { + "epoch": 2.8395701802065476, + "grad_norm": 6.8809661865234375, + "learning_rate": 4.3201053343331076e-07, + "loss": 2.6074, + "step": 4243 + }, + { + "epoch": 2.84023916042982, + "grad_norm": 5.463768005371094, + "learning_rate": 4.2841328483066436e-07, + "loss": 2.212, + "step": 4244 + }, + { + "epoch": 2.840908140653092, + "grad_norm": 4.971597671508789, + "learning_rate": 4.2483094618812614e-07, + "loss": 2.3515, + "step": 4245 + }, + { + "epoch": 2.841577120876364, + "grad_norm": 5.4091620445251465, + "learning_rate": 4.2126351967944346e-07, + "loss": 2.3415, + "step": 4246 + }, + { + "epoch": 2.8422461010996365, + "grad_norm": 6.406443119049072, + "learning_rate": 4.177110074693236e-07, + "loss": 2.5265, + "step": 4247 + }, + { + "epoch": 2.8429150813229085, + "grad_norm": 4.372164726257324, + "learning_rate": 4.1417341171341995e-07, + "loss": 2.3173, + "step": 4248 + }, + { + "epoch": 2.8435840615461805, + "grad_norm": 6.14501953125, + "learning_rate": 4.1065073455832934e-07, + "loss": 2.5101, + "step": 4249 + }, + { + "epoch": 2.844253041769453, + "grad_norm": 6.765868186950684, + "learning_rate": 4.0714297814160584e-07, + "loss": 2.4154, + "step": 4250 + }, + { + "epoch": 2.844922021992725, + "grad_norm": 5.7409844398498535, + "learning_rate": 4.036501445917412e-07, + "loss": 2.1985, + "step": 4251 + }, + { + "epoch": 2.845591002215997, + "grad_norm": 4.357446670532227, + "learning_rate": 4.0017223602818177e-07, + "loss": 2.2625, + "step": 4252 + }, + { + "epoch": 2.8462599824392694, + "grad_norm": 6.567503929138184, + "learning_rate": 3.9670925456130047e-07, + "loss": 2.4698, + "step": 4253 + }, + { + "epoch": 2.8469289626625414, + "grad_norm": 5.574277877807617, + "learning_rate": 3.932612022924359e-07, + "loss": 2.3923, + "step": 4254 + }, + { + "epoch": 2.8475979428858134, + "grad_norm": 4.975925445556641, + "learning_rate": 3.8982808131384494e-07, + "loss": 2.2427, + "step": 4255 + }, + { + "epoch": 2.848266923109086, + "grad_norm": 6.107678413391113, + "learning_rate": 3.864098937087418e-07, + "loss": 2.2134, + "step": 4256 + }, + { + "epoch": 2.848935903332358, + "grad_norm": 4.392385959625244, + "learning_rate": 3.8300664155126453e-07, + "loss": 2.32, + "step": 4257 + }, + { + "epoch": 2.84960488355563, + "grad_norm": 5.014101028442383, + "learning_rate": 3.796183269065029e-07, + "loss": 2.4174, + "step": 4258 + }, + { + "epoch": 2.8502738637789022, + "grad_norm": 6.695645332336426, + "learning_rate": 3.7624495183047335e-07, + "loss": 2.6843, + "step": 4259 + }, + { + "epoch": 2.8509428440021742, + "grad_norm": 6.523192405700684, + "learning_rate": 3.728865183701274e-07, + "loss": 2.1647, + "step": 4260 + }, + { + "epoch": 2.8516118242254462, + "grad_norm": 5.986507415771484, + "learning_rate": 3.695430285633489e-07, + "loss": 2.3953, + "step": 4261 + }, + { + "epoch": 2.8522808044487187, + "grad_norm": 5.079738140106201, + "learning_rate": 3.662144844389648e-07, + "loss": 2.4982, + "step": 4262 + }, + { + "epoch": 2.8529497846719907, + "grad_norm": 6.315966606140137, + "learning_rate": 3.629008880167151e-07, + "loss": 2.3707, + "step": 4263 + }, + { + "epoch": 2.8536187648952627, + "grad_norm": 7.284508228302002, + "learning_rate": 3.5960224130728857e-07, + "loss": 2.5651, + "step": 4264 + }, + { + "epoch": 2.854287745118535, + "grad_norm": 5.085353374481201, + "learning_rate": 3.56318546312287e-07, + "loss": 2.2937, + "step": 4265 + }, + { + "epoch": 2.854956725341807, + "grad_norm": 6.125516414642334, + "learning_rate": 3.530498050242498e-07, + "loss": 2.2674, + "step": 4266 + }, + { + "epoch": 2.855625705565079, + "grad_norm": 5.104428768157959, + "learning_rate": 3.497960194266403e-07, + "loss": 2.3289, + "step": 4267 + }, + { + "epoch": 2.8562946857883516, + "grad_norm": 6.032754898071289, + "learning_rate": 3.4655719149384035e-07, + "loss": 2.3707, + "step": 4268 + }, + { + "epoch": 2.8569636660116235, + "grad_norm": 6.489772319793701, + "learning_rate": 3.433333231911584e-07, + "loss": 2.6222, + "step": 4269 + }, + { + "epoch": 2.8576326462348955, + "grad_norm": 3.822669267654419, + "learning_rate": 3.4012441647483797e-07, + "loss": 2.1233, + "step": 4270 + }, + { + "epoch": 2.858301626458168, + "grad_norm": 4.899784088134766, + "learning_rate": 3.3693047329202145e-07, + "loss": 2.3748, + "step": 4271 + }, + { + "epoch": 2.85897060668144, + "grad_norm": 6.515200614929199, + "learning_rate": 3.3375149558079186e-07, + "loss": 2.4037, + "step": 4272 + }, + { + "epoch": 2.859639586904712, + "grad_norm": 4.8717756271362305, + "learning_rate": 3.3058748527013684e-07, + "loss": 2.2241, + "step": 4273 + }, + { + "epoch": 2.8603085671279844, + "grad_norm": 6.067234516143799, + "learning_rate": 3.274384442799733e-07, + "loss": 2.313, + "step": 4274 + }, + { + "epoch": 2.8609775473512564, + "grad_norm": 5.540458679199219, + "learning_rate": 3.243043745211255e-07, + "loss": 2.2572, + "step": 4275 + }, + { + "epoch": 2.8616465275745284, + "grad_norm": 5.462283611297607, + "learning_rate": 3.2118527789533617e-07, + "loss": 2.345, + "step": 4276 + }, + { + "epoch": 2.862315507797801, + "grad_norm": 6.649262428283691, + "learning_rate": 3.1808115629526627e-07, + "loss": 2.497, + "step": 4277 + }, + { + "epoch": 2.862984488021073, + "grad_norm": 6.492066383361816, + "learning_rate": 3.149920116044841e-07, + "loss": 2.3873, + "step": 4278 + }, + { + "epoch": 2.863653468244345, + "grad_norm": 6.995612144470215, + "learning_rate": 3.1191784569747363e-07, + "loss": 2.4786, + "step": 4279 + }, + { + "epoch": 2.8643224484676173, + "grad_norm": 5.4300456047058105, + "learning_rate": 3.088586604396315e-07, + "loss": 2.3189, + "step": 4280 + }, + { + "epoch": 2.8649914286908893, + "grad_norm": 6.304417133331299, + "learning_rate": 3.05814457687259e-07, + "loss": 2.4187, + "step": 4281 + }, + { + "epoch": 2.8656604089141613, + "grad_norm": 7.756254196166992, + "learning_rate": 3.0278523928756744e-07, + "loss": 2.609, + "step": 4282 + }, + { + "epoch": 2.8663293891374337, + "grad_norm": 4.522486686706543, + "learning_rate": 2.99771007078678e-07, + "loss": 2.1068, + "step": 4283 + }, + { + "epoch": 2.8669983693607057, + "grad_norm": 5.969696998596191, + "learning_rate": 2.967717628896166e-07, + "loss": 2.1122, + "step": 4284 + }, + { + "epoch": 2.8676673495839777, + "grad_norm": 7.251203536987305, + "learning_rate": 2.9378750854031633e-07, + "loss": 2.7553, + "step": 4285 + }, + { + "epoch": 2.86833632980725, + "grad_norm": 5.428221702575684, + "learning_rate": 2.908182458416148e-07, + "loss": 2.3637, + "step": 4286 + }, + { + "epoch": 2.869005310030522, + "grad_norm": 4.700796604156494, + "learning_rate": 2.878639765952457e-07, + "loss": 2.2546, + "step": 4287 + }, + { + "epoch": 2.869674290253794, + "grad_norm": 8.134806632995605, + "learning_rate": 2.8492470259385564e-07, + "loss": 2.8644, + "step": 4288 + }, + { + "epoch": 2.8703432704770666, + "grad_norm": 5.040346622467041, + "learning_rate": 2.820004256209846e-07, + "loss": 2.3481, + "step": 4289 + }, + { + "epoch": 2.8710122507003386, + "grad_norm": 6.581025123596191, + "learning_rate": 2.7909114745107434e-07, + "loss": 2.1581, + "step": 4290 + }, + { + "epoch": 2.8716812309236106, + "grad_norm": 6.895389556884766, + "learning_rate": 2.761968698494627e-07, + "loss": 2.3665, + "step": 4291 + }, + { + "epoch": 2.872350211146883, + "grad_norm": 4.398426055908203, + "learning_rate": 2.733175945723948e-07, + "loss": 2.0883, + "step": 4292 + }, + { + "epoch": 2.873019191370155, + "grad_norm": 4.57658576965332, + "learning_rate": 2.704533233669981e-07, + "loss": 2.5366, + "step": 4293 + }, + { + "epoch": 2.873688171593427, + "grad_norm": 6.411903381347656, + "learning_rate": 2.6760405797131014e-07, + "loss": 2.5873, + "step": 4294 + }, + { + "epoch": 2.8743571518166995, + "grad_norm": 5.395082950592041, + "learning_rate": 2.647698001142507e-07, + "loss": 2.3981, + "step": 4295 + }, + { + "epoch": 2.8750261320399715, + "grad_norm": 6.638852119445801, + "learning_rate": 2.619505515156412e-07, + "loss": 2.5226, + "step": 4296 + }, + { + "epoch": 2.8756951122632435, + "grad_norm": 5.051377773284912, + "learning_rate": 2.59146313886191e-07, + "loss": 2.2519, + "step": 4297 + }, + { + "epoch": 2.876364092486516, + "grad_norm": 7.073669910430908, + "learning_rate": 2.56357088927503e-07, + "loss": 2.7141, + "step": 4298 + }, + { + "epoch": 2.877033072709788, + "grad_norm": 5.255229473114014, + "learning_rate": 2.535828783320704e-07, + "loss": 2.1585, + "step": 4299 + }, + { + "epoch": 2.87770205293306, + "grad_norm": 6.388033390045166, + "learning_rate": 2.508236837832745e-07, + "loss": 2.5506, + "step": 4300 + }, + { + "epoch": 2.8783710331563324, + "grad_norm": 6.134374618530273, + "learning_rate": 2.480795069553871e-07, + "loss": 2.2014, + "step": 4301 + }, + { + "epoch": 2.8790400133796044, + "grad_norm": 4.835010051727295, + "learning_rate": 2.453503495135651e-07, + "loss": 2.2588, + "step": 4302 + }, + { + "epoch": 2.8797089936028764, + "grad_norm": 5.617628574371338, + "learning_rate": 2.4263621311385323e-07, + "loss": 2.3904, + "step": 4303 + }, + { + "epoch": 2.880377973826149, + "grad_norm": 6.191822528839111, + "learning_rate": 2.399370994031813e-07, + "loss": 2.3112, + "step": 4304 + }, + { + "epoch": 2.881046954049421, + "grad_norm": 5.092449188232422, + "learning_rate": 2.3725301001935872e-07, + "loss": 2.4298, + "step": 4305 + }, + { + "epoch": 2.881715934272693, + "grad_norm": 4.669118404388428, + "learning_rate": 2.3458394659108817e-07, + "loss": 2.3998, + "step": 4306 + }, + { + "epoch": 2.8823849144959652, + "grad_norm": 4.986922740936279, + "learning_rate": 2.3192991073794358e-07, + "loss": 2.1137, + "step": 4307 + }, + { + "epoch": 2.8830538947192372, + "grad_norm": 6.690556526184082, + "learning_rate": 2.2929090407038945e-07, + "loss": 2.3758, + "step": 4308 + }, + { + "epoch": 2.8837228749425097, + "grad_norm": 5.798831939697266, + "learning_rate": 2.2666692818976154e-07, + "loss": 2.3677, + "step": 4309 + }, + { + "epoch": 2.8843918551657817, + "grad_norm": 5.963528156280518, + "learning_rate": 2.2405798468828622e-07, + "loss": 2.4495, + "step": 4310 + }, + { + "epoch": 2.8850608353890537, + "grad_norm": 6.281216144561768, + "learning_rate": 2.214640751490582e-07, + "loss": 2.5534, + "step": 4311 + }, + { + "epoch": 2.885729815612326, + "grad_norm": 6.020514011383057, + "learning_rate": 2.1888520114605736e-07, + "loss": 2.4876, + "step": 4312 + }, + { + "epoch": 2.886398795835598, + "grad_norm": 5.4072675704956055, + "learning_rate": 2.1632136424412918e-07, + "loss": 2.4076, + "step": 4313 + }, + { + "epoch": 2.88706777605887, + "grad_norm": 5.209415912628174, + "learning_rate": 2.1377256599900696e-07, + "loss": 2.2999, + "step": 4314 + }, + { + "epoch": 2.8877367562821425, + "grad_norm": 4.96342658996582, + "learning_rate": 2.1123880795729246e-07, + "loss": 2.1993, + "step": 4315 + }, + { + "epoch": 2.8884057365054145, + "grad_norm": 4.811527729034424, + "learning_rate": 2.087200916564641e-07, + "loss": 2.1189, + "step": 4316 + }, + { + "epoch": 2.8890747167286865, + "grad_norm": 7.9704484939575195, + "learning_rate": 2.0621641862486606e-07, + "loss": 2.4761, + "step": 4317 + }, + { + "epoch": 2.889743696951959, + "grad_norm": 5.8605055809021, + "learning_rate": 2.0372779038172195e-07, + "loss": 2.6451, + "step": 4318 + }, + { + "epoch": 2.890412677175231, + "grad_norm": 5.815000057220459, + "learning_rate": 2.0125420843712106e-07, + "loss": 2.2113, + "step": 4319 + }, + { + "epoch": 2.8910816573985034, + "grad_norm": 5.124016761779785, + "learning_rate": 1.987956742920266e-07, + "loss": 2.4583, + "step": 4320 + }, + { + "epoch": 2.8917506376217754, + "grad_norm": 7.583825588226318, + "learning_rate": 1.9635218943827029e-07, + "loss": 2.5664, + "step": 4321 + }, + { + "epoch": 2.8924196178450474, + "grad_norm": 3.785649061203003, + "learning_rate": 1.939237553585521e-07, + "loss": 2.0556, + "step": 4322 + }, + { + "epoch": 2.89308859806832, + "grad_norm": 7.821784019470215, + "learning_rate": 1.915103735264323e-07, + "loss": 2.4768, + "step": 4323 + }, + { + "epoch": 2.893757578291592, + "grad_norm": 6.73760986328125, + "learning_rate": 1.8911204540635051e-07, + "loss": 2.5215, + "step": 4324 + }, + { + "epoch": 2.894426558514864, + "grad_norm": 5.312856674194336, + "learning_rate": 1.86728772453601e-07, + "loss": 2.2702, + "step": 4325 + }, + { + "epoch": 2.8950955387381363, + "grad_norm": 5.01576566696167, + "learning_rate": 1.8436055611434354e-07, + "loss": 2.5329, + "step": 4326 + }, + { + "epoch": 2.8957645189614083, + "grad_norm": 5.448889255523682, + "learning_rate": 1.8200739782560927e-07, + "loss": 2.557, + "step": 4327 + }, + { + "epoch": 2.8964334991846803, + "grad_norm": 6.385546684265137, + "learning_rate": 1.7966929901528372e-07, + "loss": 2.3601, + "step": 4328 + }, + { + "epoch": 2.8971024794079527, + "grad_norm": 5.114201545715332, + "learning_rate": 1.7734626110211538e-07, + "loss": 2.3914, + "step": 4329 + }, + { + "epoch": 2.8977714596312247, + "grad_norm": 7.0907087326049805, + "learning_rate": 1.7503828549572387e-07, + "loss": 2.6902, + "step": 4330 + }, + { + "epoch": 2.8984404398544967, + "grad_norm": 7.06349515914917, + "learning_rate": 1.7274537359657505e-07, + "loss": 2.902, + "step": 4331 + }, + { + "epoch": 2.899109420077769, + "grad_norm": 5.921429634094238, + "learning_rate": 1.704675267960032e-07, + "loss": 2.5978, + "step": 4332 + }, + { + "epoch": 2.899778400301041, + "grad_norm": 5.059203147888184, + "learning_rate": 1.6820474647619988e-07, + "loss": 2.4209, + "step": 4333 + }, + { + "epoch": 2.900447380524313, + "grad_norm": 5.834001064300537, + "learning_rate": 1.6595703401020847e-07, + "loss": 2.4091, + "step": 4334 + }, + { + "epoch": 2.9011163607475856, + "grad_norm": 6.80534553527832, + "learning_rate": 1.6372439076193512e-07, + "loss": 2.2947, + "step": 4335 + }, + { + "epoch": 2.9017853409708576, + "grad_norm": 6.701843738555908, + "learning_rate": 1.6150681808614343e-07, + "loss": 2.3293, + "step": 4336 + }, + { + "epoch": 2.9024543211941296, + "grad_norm": 5.845097541809082, + "learning_rate": 1.5930431732844586e-07, + "loss": 2.5831, + "step": 4337 + }, + { + "epoch": 2.903123301417402, + "grad_norm": 4.300020217895508, + "learning_rate": 1.5711688982531503e-07, + "loss": 2.2023, + "step": 4338 + }, + { + "epoch": 2.903792281640674, + "grad_norm": 6.107841968536377, + "learning_rate": 1.549445369040753e-07, + "loss": 2.5659, + "step": 4339 + }, + { + "epoch": 2.904461261863946, + "grad_norm": 4.757116794586182, + "learning_rate": 1.5278725988290277e-07, + "loss": 2.3094, + "step": 4340 + }, + { + "epoch": 2.9051302420872185, + "grad_norm": 4.66880989074707, + "learning_rate": 1.5064506007082534e-07, + "loss": 2.3927, + "step": 4341 + }, + { + "epoch": 2.9057992223104905, + "grad_norm": 5.9844255447387695, + "learning_rate": 1.4851793876772546e-07, + "loss": 2.4113, + "step": 4342 + }, + { + "epoch": 2.9064682025337625, + "grad_norm": 4.938234329223633, + "learning_rate": 1.4640589726432896e-07, + "loss": 2.4548, + "step": 4343 + }, + { + "epoch": 2.907137182757035, + "grad_norm": 6.403194427490234, + "learning_rate": 1.4430893684221903e-07, + "loss": 2.5296, + "step": 4344 + }, + { + "epoch": 2.907806162980307, + "grad_norm": 5.535197734832764, + "learning_rate": 1.4222705877382224e-07, + "loss": 2.353, + "step": 4345 + }, + { + "epoch": 2.908475143203579, + "grad_norm": 5.3580498695373535, + "learning_rate": 1.4016026432242257e-07, + "loss": 2.439, + "step": 4346 + }, + { + "epoch": 2.9091441234268514, + "grad_norm": 4.618601322174072, + "learning_rate": 1.3810855474213623e-07, + "loss": 2.3629, + "step": 4347 + }, + { + "epoch": 2.9098131036501234, + "grad_norm": 5.90875768661499, + "learning_rate": 1.3607193127793683e-07, + "loss": 2.2707, + "step": 4348 + }, + { + "epoch": 2.9104820838733954, + "grad_norm": 6.160029411315918, + "learning_rate": 1.3405039516564133e-07, + "loss": 2.3531, + "step": 4349 + }, + { + "epoch": 2.911151064096668, + "grad_norm": 7.830750465393066, + "learning_rate": 1.320439476319102e-07, + "loss": 2.2855, + "step": 4350 + }, + { + "epoch": 2.91182004431994, + "grad_norm": 5.642902374267578, + "learning_rate": 1.3005258989425006e-07, + "loss": 2.2485, + "step": 4351 + }, + { + "epoch": 2.912489024543212, + "grad_norm": 7.144786357879639, + "learning_rate": 1.280763231610138e-07, + "loss": 2.4179, + "step": 4352 + }, + { + "epoch": 2.9131580047664842, + "grad_norm": 6.509337425231934, + "learning_rate": 1.261151486313866e-07, + "loss": 2.3787, + "step": 4353 + }, + { + "epoch": 2.9138269849897562, + "grad_norm": 6.776416301727295, + "learning_rate": 1.2416906749540825e-07, + "loss": 2.6353, + "step": 4354 + }, + { + "epoch": 2.9144959652130282, + "grad_norm": 6.795693874359131, + "learning_rate": 1.222380809339535e-07, + "loss": 2.5738, + "step": 4355 + }, + { + "epoch": 2.9151649454363007, + "grad_norm": 5.656535625457764, + "learning_rate": 1.203221901187407e-07, + "loss": 2.4377, + "step": 4356 + }, + { + "epoch": 2.9158339256595727, + "grad_norm": 7.705094814300537, + "learning_rate": 1.1842139621232041e-07, + "loss": 2.5598, + "step": 4357 + }, + { + "epoch": 2.9165029058828447, + "grad_norm": 5.217077732086182, + "learning_rate": 1.1653570036809225e-07, + "loss": 2.3478, + "step": 4358 + }, + { + "epoch": 2.917171886106117, + "grad_norm": 7.637456893920898, + "learning_rate": 1.1466510373029094e-07, + "loss": 2.7623, + "step": 4359 + }, + { + "epoch": 2.917840866329389, + "grad_norm": 6.277545928955078, + "learning_rate": 1.1280960743398905e-07, + "loss": 2.538, + "step": 4360 + }, + { + "epoch": 2.918509846552661, + "grad_norm": 7.600462436676025, + "learning_rate": 1.1096921260509152e-07, + "loss": 2.5054, + "step": 4361 + }, + { + "epoch": 2.9191788267759335, + "grad_norm": 3.608879327774048, + "learning_rate": 1.0914392036034948e-07, + "loss": 2.3373, + "step": 4362 + }, + { + "epoch": 2.9198478069992055, + "grad_norm": 5.8420891761779785, + "learning_rate": 1.0733373180734085e-07, + "loss": 2.5168, + "step": 4363 + }, + { + "epoch": 2.9205167872224775, + "grad_norm": 6.544290542602539, + "learning_rate": 1.0553864804448144e-07, + "loss": 2.4057, + "step": 4364 + }, + { + "epoch": 2.92118576744575, + "grad_norm": 5.117260932922363, + "learning_rate": 1.0375867016102492e-07, + "loss": 2.0938, + "step": 4365 + }, + { + "epoch": 2.921854747669022, + "grad_norm": 5.88095235824585, + "learning_rate": 1.0199379923705732e-07, + "loss": 2.216, + "step": 4366 + }, + { + "epoch": 2.922523727892294, + "grad_norm": 9.378856658935547, + "learning_rate": 1.0024403634349422e-07, + "loss": 2.6385, + "step": 4367 + }, + { + "epoch": 2.9231927081155664, + "grad_norm": 5.765771389007568, + "learning_rate": 9.85093825420863e-08, + "loss": 2.3191, + "step": 4368 + }, + { + "epoch": 2.9238616883388384, + "grad_norm": 6.975998878479004, + "learning_rate": 9.678983888541381e-08, + "loss": 2.4263, + "step": 4369 + }, + { + "epoch": 2.9245306685621104, + "grad_norm": 5.410098552703857, + "learning_rate": 9.50854064168949e-08, + "loss": 2.3528, + "step": 4370 + }, + { + "epoch": 2.925199648785383, + "grad_norm": 5.118021488189697, + "learning_rate": 9.339608617077167e-08, + "loss": 2.2044, + "step": 4371 + }, + { + "epoch": 2.925868629008655, + "grad_norm": 6.039196968078613, + "learning_rate": 9.172187917211861e-08, + "loss": 2.249, + "step": 4372 + }, + { + "epoch": 2.926537609231927, + "grad_norm": 6.588985919952393, + "learning_rate": 9.006278643683696e-08, + "loss": 2.5823, + "step": 4373 + }, + { + "epoch": 2.9272065894551993, + "grad_norm": 5.674143314361572, + "learning_rate": 8.841880897166311e-08, + "loss": 2.37, + "step": 4374 + }, + { + "epoch": 2.9278755696784713, + "grad_norm": 5.154245853424072, + "learning_rate": 8.678994777415184e-08, + "loss": 2.1953, + "step": 4375 + }, + { + "epoch": 2.9285445499017433, + "grad_norm": 6.932987213134766, + "learning_rate": 8.517620383269587e-08, + "loss": 2.6793, + "step": 4376 + }, + { + "epoch": 2.9292135301250157, + "grad_norm": 7.161120891571045, + "learning_rate": 8.357757812650912e-08, + "loss": 2.5067, + "step": 4377 + }, + { + "epoch": 2.9298825103482877, + "grad_norm": 4.879586696624756, + "learning_rate": 8.199407162562955e-08, + "loss": 2.4626, + "step": 4378 + }, + { + "epoch": 2.9305514905715597, + "grad_norm": 6.024034023284912, + "learning_rate": 8.042568529092464e-08, + "loss": 2.5307, + "step": 4379 + }, + { + "epoch": 2.931220470794832, + "grad_norm": 5.760753154754639, + "learning_rate": 7.887242007408868e-08, + "loss": 2.3045, + "step": 4380 + }, + { + "epoch": 2.931889451018104, + "grad_norm": 3.9770023822784424, + "learning_rate": 7.733427691763439e-08, + "loss": 2.2878, + "step": 4381 + }, + { + "epoch": 2.932558431241376, + "grad_norm": 5.842418193817139, + "learning_rate": 7.581125675490686e-08, + "loss": 2.184, + "step": 4382 + }, + { + "epoch": 2.9332274114646486, + "grad_norm": 5.048346042633057, + "learning_rate": 7.430336051006681e-08, + "loss": 2.2816, + "step": 4383 + }, + { + "epoch": 2.9338963916879206, + "grad_norm": 4.191540718078613, + "learning_rate": 7.281058909810179e-08, + "loss": 2.2425, + "step": 4384 + }, + { + "epoch": 2.9345653719111926, + "grad_norm": 5.113699913024902, + "learning_rate": 7.133294342481778e-08, + "loss": 2.337, + "step": 4385 + }, + { + "epoch": 2.935234352134465, + "grad_norm": 4.893774032592773, + "learning_rate": 6.987042438684755e-08, + "loss": 2.3867, + "step": 4386 + }, + { + "epoch": 2.935903332357737, + "grad_norm": 6.493711948394775, + "learning_rate": 6.842303287164509e-08, + "loss": 2.4219, + "step": 4387 + }, + { + "epoch": 2.9365723125810095, + "grad_norm": 6.099575042724609, + "learning_rate": 6.699076975748009e-08, + "loss": 2.3463, + "step": 4388 + }, + { + "epoch": 2.9372412928042815, + "grad_norm": 7.436191082000732, + "learning_rate": 6.5573635913449e-08, + "loss": 2.4256, + "step": 4389 + }, + { + "epoch": 2.9379102730275535, + "grad_norm": 4.026017665863037, + "learning_rate": 6.417163219945843e-08, + "loss": 2.052, + "step": 4390 + }, + { + "epoch": 2.938579253250826, + "grad_norm": 6.60069465637207, + "learning_rate": 6.278475946624451e-08, + "loss": 2.4887, + "step": 4391 + }, + { + "epoch": 2.939248233474098, + "grad_norm": 5.6980180740356445, + "learning_rate": 6.141301855535353e-08, + "loss": 2.4865, + "step": 4392 + }, + { + "epoch": 2.93991721369737, + "grad_norm": 5.30901575088501, + "learning_rate": 6.005641029915577e-08, + "loss": 2.4656, + "step": 4393 + }, + { + "epoch": 2.9405861939206424, + "grad_norm": 5.176517009735107, + "learning_rate": 5.871493552083718e-08, + "loss": 2.2869, + "step": 4394 + }, + { + "epoch": 2.9412551741439144, + "grad_norm": 5.627009391784668, + "learning_rate": 5.7388595034396643e-08, + "loss": 2.4442, + "step": 4395 + }, + { + "epoch": 2.941924154367187, + "grad_norm": 7.613918781280518, + "learning_rate": 5.6077389644659804e-08, + "loss": 2.6838, + "step": 4396 + }, + { + "epoch": 2.942593134590459, + "grad_norm": 5.842780590057373, + "learning_rate": 5.4781320147254126e-08, + "loss": 2.2222, + "step": 4397 + }, + { + "epoch": 2.943262114813731, + "grad_norm": 6.697509765625, + "learning_rate": 5.350038732863938e-08, + "loss": 2.2665, + "step": 4398 + }, + { + "epoch": 2.9439310950370032, + "grad_norm": 6.246048450469971, + "learning_rate": 5.2234591966074385e-08, + "loss": 2.1568, + "step": 4399 + }, + { + "epoch": 2.9446000752602752, + "grad_norm": 6.492537498474121, + "learning_rate": 5.098393482763919e-08, + "loss": 2.3974, + "step": 4400 + }, + { + "epoch": 2.9452690554835472, + "grad_norm": 4.946436882019043, + "learning_rate": 4.974841667223506e-08, + "loss": 2.1453, + "step": 4401 + }, + { + "epoch": 2.9459380357068197, + "grad_norm": 5.089686393737793, + "learning_rate": 4.852803824956509e-08, + "loss": 2.4301, + "step": 4402 + }, + { + "epoch": 2.9466070159300917, + "grad_norm": 4.462343215942383, + "learning_rate": 4.732280030015357e-08, + "loss": 2.3344, + "step": 4403 + }, + { + "epoch": 2.9472759961533637, + "grad_norm": 4.753413677215576, + "learning_rate": 4.6132703555332166e-08, + "loss": 2.2487, + "step": 4404 + }, + { + "epoch": 2.947944976376636, + "grad_norm": 5.297506332397461, + "learning_rate": 4.4957748737251e-08, + "loss": 2.3433, + "step": 4405 + }, + { + "epoch": 2.948613956599908, + "grad_norm": 7.362310886383057, + "learning_rate": 4.3797936558867547e-08, + "loss": 2.2548, + "step": 4406 + }, + { + "epoch": 2.94928293682318, + "grad_norm": 5.103241443634033, + "learning_rate": 4.265326772395217e-08, + "loss": 2.2983, + "step": 4407 + }, + { + "epoch": 2.9499519170464525, + "grad_norm": 6.195858478546143, + "learning_rate": 4.152374292708539e-08, + "loss": 2.4762, + "step": 4408 + }, + { + "epoch": 2.9506208972697245, + "grad_norm": 5.502584934234619, + "learning_rate": 4.0409362853660594e-08, + "loss": 2.5119, + "step": 4409 + }, + { + "epoch": 2.9512898774929965, + "grad_norm": 6.203848361968994, + "learning_rate": 3.931012817987856e-08, + "loss": 2.3521, + "step": 4410 + }, + { + "epoch": 2.951958857716269, + "grad_norm": 7.9124650955200195, + "learning_rate": 3.822603957275295e-08, + "loss": 2.5627, + "step": 4411 + }, + { + "epoch": 2.952627837939541, + "grad_norm": 6.0643439292907715, + "learning_rate": 3.715709769010478e-08, + "loss": 2.4871, + "step": 4412 + }, + { + "epoch": 2.953296818162813, + "grad_norm": 5.850800514221191, + "learning_rate": 3.6103303180565206e-08, + "loss": 2.1664, + "step": 4413 + }, + { + "epoch": 2.9539657983860854, + "grad_norm": 5.049599647521973, + "learning_rate": 3.506465668357273e-08, + "loss": 2.3831, + "step": 4414 + }, + { + "epoch": 2.9546347786093574, + "grad_norm": 5.44934606552124, + "learning_rate": 3.40411588293732e-08, + "loss": 2.3453, + "step": 4415 + }, + { + "epoch": 2.9553037588326294, + "grad_norm": 5.700920104980469, + "learning_rate": 3.303281023902261e-08, + "loss": 2.2948, + "step": 4416 + }, + { + "epoch": 2.955972739055902, + "grad_norm": 5.786959648132324, + "learning_rate": 3.203961152438428e-08, + "loss": 2.7305, + "step": 4417 + }, + { + "epoch": 2.956641719279174, + "grad_norm": 6.005786418914795, + "learning_rate": 3.1061563288131704e-08, + "loss": 2.138, + "step": 4418 + }, + { + "epoch": 2.957310699502446, + "grad_norm": 6.71798849105835, + "learning_rate": 3.0098666123731803e-08, + "loss": 2.4365, + "step": 4419 + }, + { + "epoch": 2.9579796797257183, + "grad_norm": 6.043598175048828, + "learning_rate": 2.9150920615478305e-08, + "loss": 2.571, + "step": 4420 + }, + { + "epoch": 2.9586486599489903, + "grad_norm": 7.278130531311035, + "learning_rate": 2.8218327338452866e-08, + "loss": 2.5911, + "step": 4421 + }, + { + "epoch": 2.9593176401722623, + "grad_norm": 7.193459510803223, + "learning_rate": 2.730088685855281e-08, + "loss": 2.4378, + "step": 4422 + }, + { + "epoch": 2.9599866203955347, + "grad_norm": 5.306958198547363, + "learning_rate": 2.6398599732477268e-08, + "loss": 2.1649, + "step": 4423 + }, + { + "epoch": 2.9606556006188067, + "grad_norm": 6.152875900268555, + "learning_rate": 2.551146650773273e-08, + "loss": 2.4779, + "step": 4424 + }, + { + "epoch": 2.9613245808420787, + "grad_norm": 4.4504194259643555, + "learning_rate": 2.4639487722624698e-08, + "loss": 2.2746, + "step": 4425 + }, + { + "epoch": 2.961993561065351, + "grad_norm": 6.299310684204102, + "learning_rate": 2.3782663906274372e-08, + "loss": 2.4552, + "step": 4426 + }, + { + "epoch": 2.962662541288623, + "grad_norm": 5.881255149841309, + "learning_rate": 2.294099557859364e-08, + "loss": 2.438, + "step": 4427 + }, + { + "epoch": 2.963331521511895, + "grad_norm": 8.662530899047852, + "learning_rate": 2.211448325030452e-08, + "loss": 2.2639, + "step": 4428 + }, + { + "epoch": 2.9640005017351676, + "grad_norm": 5.893693923950195, + "learning_rate": 2.1303127422933612e-08, + "loss": 2.3174, + "step": 4429 + }, + { + "epoch": 2.9646694819584396, + "grad_norm": 7.124366760253906, + "learning_rate": 2.0506928588809315e-08, + "loss": 2.2546, + "step": 4430 + }, + { + "epoch": 2.9653384621817116, + "grad_norm": 7.78939962387085, + "learning_rate": 1.9725887231061834e-08, + "loss": 2.6583, + "step": 4431 + }, + { + "epoch": 2.966007442404984, + "grad_norm": 5.903192043304443, + "learning_rate": 1.896000382362595e-08, + "loss": 2.4547, + "step": 4432 + }, + { + "epoch": 2.966676422628256, + "grad_norm": 5.279549598693848, + "learning_rate": 1.8209278831235466e-08, + "loss": 2.2452, + "step": 4433 + }, + { + "epoch": 2.967345402851528, + "grad_norm": 4.609355449676514, + "learning_rate": 1.7473712709428768e-08, + "loss": 2.2429, + "step": 4434 + }, + { + "epoch": 2.9680143830748005, + "grad_norm": 4.304797172546387, + "learning_rate": 1.6753305904546047e-08, + "loss": 2.2038, + "step": 4435 + }, + { + "epoch": 2.9686833632980725, + "grad_norm": 6.743269920349121, + "learning_rate": 1.6048058853723734e-08, + "loss": 2.5786, + "step": 4436 + }, + { + "epoch": 2.9693523435213445, + "grad_norm": 5.148039817810059, + "learning_rate": 1.5357971984908403e-08, + "loss": 2.5455, + "step": 4437 + }, + { + "epoch": 2.970021323744617, + "grad_norm": 6.280725479125977, + "learning_rate": 1.4683045716840093e-08, + "loss": 2.6173, + "step": 4438 + }, + { + "epoch": 2.970690303967889, + "grad_norm": 5.363486289978027, + "learning_rate": 1.402328045906065e-08, + "loss": 2.5327, + "step": 4439 + }, + { + "epoch": 2.971359284191161, + "grad_norm": 5.093814373016357, + "learning_rate": 1.3378676611916496e-08, + "loss": 2.3955, + "step": 4440 + }, + { + "epoch": 2.9720282644144334, + "grad_norm": 7.159473419189453, + "learning_rate": 1.2749234566550306e-08, + "loss": 2.4722, + "step": 4441 + }, + { + "epoch": 2.9726972446377053, + "grad_norm": 5.013416290283203, + "learning_rate": 1.2134954704906554e-08, + "loss": 2.3378, + "step": 4442 + }, + { + "epoch": 2.9733662248609773, + "grad_norm": 6.836769104003906, + "learning_rate": 1.1535837399723193e-08, + "loss": 2.6785, + "step": 4443 + }, + { + "epoch": 2.97403520508425, + "grad_norm": 5.976371765136719, + "learning_rate": 1.0951883014548304e-08, + "loss": 2.4482, + "step": 4444 + }, + { + "epoch": 2.974704185307522, + "grad_norm": 6.51096773147583, + "learning_rate": 1.0383091903720666e-08, + "loss": 2.198, + "step": 4445 + }, + { + "epoch": 2.975373165530794, + "grad_norm": 5.580941677093506, + "learning_rate": 9.829464412383637e-09, + "loss": 2.3164, + "step": 4446 + }, + { + "epoch": 2.9760421457540662, + "grad_norm": 4.753101825714111, + "learning_rate": 9.291000876471278e-09, + "loss": 2.2085, + "step": 4447 + }, + { + "epoch": 2.9767111259773382, + "grad_norm": 5.79316520690918, + "learning_rate": 8.767701622727775e-09, + "loss": 2.1411, + "step": 4448 + }, + { + "epoch": 2.97738010620061, + "grad_norm": 5.860408306121826, + "learning_rate": 8.25956696868524e-09, + "loss": 2.2367, + "step": 4449 + }, + { + "epoch": 2.9780490864238827, + "grad_norm": 5.748015403747559, + "learning_rate": 7.766597222680361e-09, + "loss": 2.1812, + "step": 4450 + }, + { + "epoch": 2.9787180666471547, + "grad_norm": 5.077130317687988, + "learning_rate": 7.288792683846079e-09, + "loss": 2.2052, + "step": 4451 + }, + { + "epoch": 2.9793870468704267, + "grad_norm": 5.614157199859619, + "learning_rate": 6.826153642108812e-09, + "loss": 2.462, + "step": 4452 + }, + { + "epoch": 2.980056027093699, + "grad_norm": 4.780637264251709, + "learning_rate": 6.378680378199553e-09, + "loss": 2.2046, + "step": 4453 + }, + { + "epoch": 2.980725007316971, + "grad_norm": 5.087644577026367, + "learning_rate": 5.946373163645547e-09, + "loss": 2.4507, + "step": 4454 + }, + { + "epoch": 2.981393987540243, + "grad_norm": 4.729597091674805, + "learning_rate": 5.529232260764738e-09, + "loss": 2.1868, + "step": 4455 + }, + { + "epoch": 2.9820629677635155, + "grad_norm": 5.412956714630127, + "learning_rate": 5.127257922679651e-09, + "loss": 2.2628, + "step": 4456 + }, + { + "epoch": 2.9827319479867875, + "grad_norm": 5.601459980010986, + "learning_rate": 4.7404503933062835e-09, + "loss": 2.6793, + "step": 4457 + }, + { + "epoch": 2.9834009282100595, + "grad_norm": 4.8878374099731445, + "learning_rate": 4.3688099073568855e-09, + "loss": 2.2191, + "step": 4458 + }, + { + "epoch": 2.984069908433332, + "grad_norm": 5.0792951583862305, + "learning_rate": 4.012336690345508e-09, + "loss": 2.272, + "step": 4459 + }, + { + "epoch": 2.984738888656604, + "grad_norm": 5.414526462554932, + "learning_rate": 3.6710309585769044e-09, + "loss": 2.4166, + "step": 4460 + }, + { + "epoch": 2.985407868879876, + "grad_norm": 5.346670150756836, + "learning_rate": 3.344892919152076e-09, + "loss": 2.5593, + "step": 4461 + }, + { + "epoch": 2.9860768491031484, + "grad_norm": 5.360459327697754, + "learning_rate": 3.033922769973829e-09, + "loss": 2.1137, + "step": 4462 + }, + { + "epoch": 2.9867458293264204, + "grad_norm": 4.147956371307373, + "learning_rate": 2.738120699735669e-09, + "loss": 2.2612, + "step": 4463 + }, + { + "epoch": 2.987414809549693, + "grad_norm": 6.208757400512695, + "learning_rate": 2.457486887932903e-09, + "loss": 2.2765, + "step": 4464 + }, + { + "epoch": 2.988083789772965, + "grad_norm": 6.6931915283203125, + "learning_rate": 2.1920215048487625e-09, + "loss": 2.2574, + "step": 4465 + }, + { + "epoch": 2.988752769996237, + "grad_norm": 5.976284980773926, + "learning_rate": 1.9417247115682822e-09, + "loss": 2.1586, + "step": 4466 + }, + { + "epoch": 2.9894217502195093, + "grad_norm": 7.707475185394287, + "learning_rate": 1.7065966599699724e-09, + "loss": 2.453, + "step": 4467 + }, + { + "epoch": 2.9900907304427813, + "grad_norm": 4.192013263702393, + "learning_rate": 1.4866374927313687e-09, + "loss": 2.0627, + "step": 4468 + }, + { + "epoch": 2.9907597106660533, + "grad_norm": 4.631329536437988, + "learning_rate": 1.281847343320708e-09, + "loss": 2.4899, + "step": 4469 + }, + { + "epoch": 2.9914286908893257, + "grad_norm": 5.521909713745117, + "learning_rate": 1.0922263360080287e-09, + "loss": 2.3268, + "step": 4470 + }, + { + "epoch": 2.9920976711125977, + "grad_norm": 5.255979537963867, + "learning_rate": 9.177745858485187e-10, + "loss": 2.4397, + "step": 4471 + }, + { + "epoch": 2.9927666513358697, + "grad_norm": 4.689284324645996, + "learning_rate": 7.584921987019433e-10, + "loss": 2.2405, + "step": 4472 + }, + { + "epoch": 2.993435631559142, + "grad_norm": 5.5075225830078125, + "learning_rate": 6.143792712215435e-10, + "loss": 2.1497, + "step": 4473 + }, + { + "epoch": 2.994104611782414, + "grad_norm": 5.804144382476807, + "learning_rate": 4.854358908512602e-10, + "loss": 2.4019, + "step": 4474 + }, + { + "epoch": 2.9947735920056866, + "grad_norm": 5.247983455657959, + "learning_rate": 3.7166213583683664e-10, + "loss": 2.2047, + "step": 4475 + }, + { + "epoch": 2.9954425722289586, + "grad_norm": 5.321116924285889, + "learning_rate": 2.7305807521471607e-10, + "loss": 2.3354, + "step": 4476 + }, + { + "epoch": 2.9961115524522306, + "grad_norm": 5.585231781005859, + "learning_rate": 1.896237688175928e-10, + "loss": 2.3274, + "step": 4477 + }, + { + "epoch": 2.996780532675503, + "grad_norm": 6.140851020812988, + "learning_rate": 1.213592672744124e-10, + "loss": 2.2072, + "step": 4478 + }, + { + "epoch": 2.997449512898775, + "grad_norm": 7.330341339111328, + "learning_rate": 6.826461200482026e-11, + "loss": 2.5641, + "step": 4479 + }, + { + "epoch": 2.998118493122047, + "grad_norm": 5.906470775604248, + "learning_rate": 3.0339835227488624e-11, + "loss": 2.4656, + "step": 4480 + }, + { + "epoch": 2.9987874733453195, + "grad_norm": 5.045249938964844, + "learning_rate": 7.584959957340764e-12, + "loss": 2.3551, + "step": 4481 + }, + { + "epoch": 2.9994564535685915, + "grad_norm": 4.170248985290527, + "learning_rate": 0.0, + "loss": 2.028, + "step": 4482 + } + ], + "logging_steps": 1, + "max_steps": 4482, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3171505984687872e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}