diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": 0.3152608573436737, "best_model_checkpoint": "./w2v-bert-2.0-igbo_naijavoices_100h/checkpoint-8000", - "epoch": 37.93103448275862, + "epoch": 41.37931034482759, "eval_steps": 1000, - "global_step": 11000, + "global_step": 12000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -77117,6 +77117,7016 @@ "eval_steps_per_second": 0.169, "eval_wer": 0.2996894409937888, "step": 11000 + }, + { + "epoch": 37.93448275862069, + "grad_norm": 2.244279146194458, + "learning_rate": 2.8277701149425287e-05, + "loss": 0.1306, + "step": 11001 + }, + { + "epoch": 37.93793103448276, + "grad_norm": 0.8733891844749451, + "learning_rate": 2.8277241379310346e-05, + "loss": 0.1239, + "step": 11002 + }, + { + "epoch": 37.94137931034483, + "grad_norm": 1.3579665422439575, + "learning_rate": 2.8276781609195404e-05, + "loss": 0.1501, + "step": 11003 + }, + { + "epoch": 37.9448275862069, + "grad_norm": 0.8650179505348206, + "learning_rate": 2.827632183908046e-05, + "loss": 0.1442, + "step": 11004 + }, + { + "epoch": 37.94827586206897, + "grad_norm": 1.1297799348831177, + "learning_rate": 2.8275862068965518e-05, + "loss": 0.2019, + "step": 11005 + }, + { + "epoch": 37.95172413793104, + "grad_norm": 1.5747121572494507, + "learning_rate": 2.8275402298850577e-05, + "loss": 0.2113, + "step": 11006 + }, + { + "epoch": 37.9551724137931, + "grad_norm": 0.7033880949020386, + "learning_rate": 2.8274942528735632e-05, + "loss": 0.1966, + "step": 11007 + }, + { + "epoch": 37.95862068965517, + "grad_norm": 0.9622392654418945, + "learning_rate": 2.827448275862069e-05, + "loss": 0.203, + "step": 11008 + }, + { + "epoch": 37.96206896551724, + "grad_norm": 0.8751043081283569, + "learning_rate": 2.8274022988505746e-05, + "loss": 0.1979, + "step": 11009 + }, + { + "epoch": 37.96551724137931, + "grad_norm": 0.5318447947502136, + "learning_rate": 2.8273563218390805e-05, + "loss": 0.1627, + "step": 11010 + }, + { + "epoch": 37.96896551724138, + "grad_norm": 0.8166975975036621, + "learning_rate": 2.8273103448275864e-05, + "loss": 0.1566, + "step": 11011 + }, + { + "epoch": 37.97241379310345, + "grad_norm": 0.6261289119720459, + "learning_rate": 2.827264367816092e-05, + "loss": 0.1598, + "step": 11012 + }, + { + "epoch": 37.97586206896552, + "grad_norm": 0.5284278392791748, + "learning_rate": 2.8272183908045978e-05, + "loss": 0.1562, + "step": 11013 + }, + { + "epoch": 37.97931034482759, + "grad_norm": 0.6055872440338135, + "learning_rate": 2.8271724137931036e-05, + "loss": 0.1653, + "step": 11014 + }, + { + "epoch": 37.98275862068966, + "grad_norm": 1.0563055276870728, + "learning_rate": 2.827126436781609e-05, + "loss": 0.1318, + "step": 11015 + }, + { + "epoch": 37.98620689655172, + "grad_norm": 0.937923014163971, + "learning_rate": 2.827080459770115e-05, + "loss": 0.1388, + "step": 11016 + }, + { + "epoch": 37.98965517241379, + "grad_norm": 0.6607889533042908, + "learning_rate": 2.8270344827586206e-05, + "loss": 0.1404, + "step": 11017 + }, + { + "epoch": 37.99310344827586, + "grad_norm": 0.6529019474983215, + "learning_rate": 2.8269885057471268e-05, + "loss": 0.1232, + "step": 11018 + }, + { + "epoch": 37.99655172413793, + "grad_norm": 0.7910711765289307, + "learning_rate": 2.8269425287356323e-05, + "loss": 0.1587, + "step": 11019 + }, + { + "epoch": 38.0, + "grad_norm": 1.2357171773910522, + "learning_rate": 2.8268965517241378e-05, + "loss": 0.1834, + "step": 11020 + }, + { + "epoch": 38.00344827586207, + "grad_norm": 1.288411259651184, + "learning_rate": 2.8268505747126437e-05, + "loss": 0.2112, + "step": 11021 + }, + { + "epoch": 38.00689655172414, + "grad_norm": 1.1342320442199707, + "learning_rate": 2.8268045977011496e-05, + "loss": 0.196, + "step": 11022 + }, + { + "epoch": 38.01034482758621, + "grad_norm": 0.6177983283996582, + "learning_rate": 2.8267586206896554e-05, + "loss": 0.1971, + "step": 11023 + }, + { + "epoch": 38.01379310344828, + "grad_norm": 1.423910140991211, + "learning_rate": 2.826712643678161e-05, + "loss": 0.1956, + "step": 11024 + }, + { + "epoch": 38.01724137931034, + "grad_norm": 0.557257354259491, + "learning_rate": 2.8266666666666665e-05, + "loss": 0.183, + "step": 11025 + }, + { + "epoch": 38.02068965517241, + "grad_norm": 0.6663224101066589, + "learning_rate": 2.8266206896551727e-05, + "loss": 0.1608, + "step": 11026 + }, + { + "epoch": 38.02413793103448, + "grad_norm": 0.7377750873565674, + "learning_rate": 2.8265747126436782e-05, + "loss": 0.1814, + "step": 11027 + }, + { + "epoch": 38.02758620689655, + "grad_norm": 1.8387467861175537, + "learning_rate": 2.826528735632184e-05, + "loss": 0.1764, + "step": 11028 + }, + { + "epoch": 38.03103448275862, + "grad_norm": 0.6119486093521118, + "learning_rate": 2.8264827586206896e-05, + "loss": 0.1495, + "step": 11029 + }, + { + "epoch": 38.03448275862069, + "grad_norm": 0.5738792419433594, + "learning_rate": 2.8264367816091955e-05, + "loss": 0.169, + "step": 11030 + }, + { + "epoch": 38.03793103448276, + "grad_norm": 1.0546351671218872, + "learning_rate": 2.8263908045977014e-05, + "loss": 0.167, + "step": 11031 + }, + { + "epoch": 38.04137931034483, + "grad_norm": 0.576999843120575, + "learning_rate": 2.826344827586207e-05, + "loss": 0.1506, + "step": 11032 + }, + { + "epoch": 38.0448275862069, + "grad_norm": 0.5732587575912476, + "learning_rate": 2.8262988505747128e-05, + "loss": 0.1443, + "step": 11033 + }, + { + "epoch": 38.04827586206896, + "grad_norm": 1.4059163331985474, + "learning_rate": 2.8262528735632186e-05, + "loss": 0.1423, + "step": 11034 + }, + { + "epoch": 38.05172413793103, + "grad_norm": 1.0474202632904053, + "learning_rate": 2.826206896551724e-05, + "loss": 0.1283, + "step": 11035 + }, + { + "epoch": 38.0551724137931, + "grad_norm": 0.6787098050117493, + "learning_rate": 2.82616091954023e-05, + "loss": 0.1193, + "step": 11036 + }, + { + "epoch": 38.05862068965517, + "grad_norm": 2.1983323097229004, + "learning_rate": 2.8261149425287355e-05, + "loss": 0.1413, + "step": 11037 + }, + { + "epoch": 38.06206896551724, + "grad_norm": 0.5369946360588074, + "learning_rate": 2.8260689655172418e-05, + "loss": 0.1137, + "step": 11038 + }, + { + "epoch": 38.06551724137931, + "grad_norm": 0.7499077916145325, + "learning_rate": 2.8260229885057473e-05, + "loss": 0.1353, + "step": 11039 + }, + { + "epoch": 38.06896551724138, + "grad_norm": 1.709822177886963, + "learning_rate": 2.8259770114942528e-05, + "loss": 0.13, + "step": 11040 + }, + { + "epoch": 38.07241379310345, + "grad_norm": 1.5337316989898682, + "learning_rate": 2.8259310344827587e-05, + "loss": 0.1139, + "step": 11041 + }, + { + "epoch": 38.07586206896552, + "grad_norm": 0.6773917078971863, + "learning_rate": 2.8258850574712645e-05, + "loss": 0.1149, + "step": 11042 + }, + { + "epoch": 38.07931034482758, + "grad_norm": 0.7857959270477295, + "learning_rate": 2.82583908045977e-05, + "loss": 0.1131, + "step": 11043 + }, + { + "epoch": 38.08275862068965, + "grad_norm": 1.449794054031372, + "learning_rate": 2.825793103448276e-05, + "loss": 0.133, + "step": 11044 + }, + { + "epoch": 38.08620689655172, + "grad_norm": 1.6826680898666382, + "learning_rate": 2.8257471264367815e-05, + "loss": 0.1824, + "step": 11045 + }, + { + "epoch": 38.08965517241379, + "grad_norm": 1.0662840604782104, + "learning_rate": 2.8257011494252877e-05, + "loss": 0.223, + "step": 11046 + }, + { + "epoch": 38.09310344827586, + "grad_norm": 0.9584540724754333, + "learning_rate": 2.8256551724137932e-05, + "loss": 0.1891, + "step": 11047 + }, + { + "epoch": 38.09655172413793, + "grad_norm": 0.53106689453125, + "learning_rate": 2.8256091954022987e-05, + "loss": 0.1755, + "step": 11048 + }, + { + "epoch": 38.1, + "grad_norm": 0.59113609790802, + "learning_rate": 2.8255632183908046e-05, + "loss": 0.1854, + "step": 11049 + }, + { + "epoch": 38.10344827586207, + "grad_norm": 1.7208354473114014, + "learning_rate": 2.8255172413793105e-05, + "loss": 0.1701, + "step": 11050 + }, + { + "epoch": 38.10689655172414, + "grad_norm": 0.7906379103660583, + "learning_rate": 2.8254712643678163e-05, + "loss": 0.1484, + "step": 11051 + }, + { + "epoch": 38.110344827586204, + "grad_norm": 0.6353622674942017, + "learning_rate": 2.825425287356322e-05, + "loss": 0.1662, + "step": 11052 + }, + { + "epoch": 38.11379310344827, + "grad_norm": 0.9228904247283936, + "learning_rate": 2.8253793103448274e-05, + "loss": 0.172, + "step": 11053 + }, + { + "epoch": 38.11724137931034, + "grad_norm": 1.6312378644943237, + "learning_rate": 2.8253333333333336e-05, + "loss": 0.1681, + "step": 11054 + }, + { + "epoch": 38.12068965517241, + "grad_norm": 0.7363501787185669, + "learning_rate": 2.825287356321839e-05, + "loss": 0.15, + "step": 11055 + }, + { + "epoch": 38.12413793103448, + "grad_norm": 1.122572660446167, + "learning_rate": 2.825241379310345e-05, + "loss": 0.1682, + "step": 11056 + }, + { + "epoch": 38.12758620689655, + "grad_norm": 2.118699073791504, + "learning_rate": 2.8251954022988505e-05, + "loss": 0.1745, + "step": 11057 + }, + { + "epoch": 38.13103448275862, + "grad_norm": 0.5879372358322144, + "learning_rate": 2.8251494252873564e-05, + "loss": 0.1575, + "step": 11058 + }, + { + "epoch": 38.13448275862069, + "grad_norm": 0.949296236038208, + "learning_rate": 2.8251034482758623e-05, + "loss": 0.1591, + "step": 11059 + }, + { + "epoch": 38.13793103448276, + "grad_norm": 0.6345550417900085, + "learning_rate": 2.8250574712643678e-05, + "loss": 0.1834, + "step": 11060 + }, + { + "epoch": 38.141379310344824, + "grad_norm": 0.9381221532821655, + "learning_rate": 2.8250114942528737e-05, + "loss": 0.1406, + "step": 11061 + }, + { + "epoch": 38.144827586206894, + "grad_norm": 0.6490294337272644, + "learning_rate": 2.8249655172413795e-05, + "loss": 0.1411, + "step": 11062 + }, + { + "epoch": 38.148275862068964, + "grad_norm": 0.621123194694519, + "learning_rate": 2.824919540229885e-05, + "loss": 0.1378, + "step": 11063 + }, + { + "epoch": 38.15172413793103, + "grad_norm": 0.6010726094245911, + "learning_rate": 2.824873563218391e-05, + "loss": 0.1251, + "step": 11064 + }, + { + "epoch": 38.1551724137931, + "grad_norm": 0.844870924949646, + "learning_rate": 2.8248275862068965e-05, + "loss": 0.1337, + "step": 11065 + }, + { + "epoch": 38.15862068965517, + "grad_norm": 0.9412791728973389, + "learning_rate": 2.8247816091954027e-05, + "loss": 0.1257, + "step": 11066 + }, + { + "epoch": 38.16206896551724, + "grad_norm": 3.892554521560669, + "learning_rate": 2.8247356321839082e-05, + "loss": 0.1335, + "step": 11067 + }, + { + "epoch": 38.16551724137931, + "grad_norm": 1.4252945184707642, + "learning_rate": 2.8246896551724137e-05, + "loss": 0.1256, + "step": 11068 + }, + { + "epoch": 38.16896551724138, + "grad_norm": 0.8877184391021729, + "learning_rate": 2.8246436781609196e-05, + "loss": 0.1303, + "step": 11069 + }, + { + "epoch": 38.172413793103445, + "grad_norm": 2.387632369995117, + "learning_rate": 2.8245977011494255e-05, + "loss": 0.1634, + "step": 11070 + }, + { + "epoch": 38.175862068965515, + "grad_norm": 0.6913253664970398, + "learning_rate": 2.824551724137931e-05, + "loss": 0.2395, + "step": 11071 + }, + { + "epoch": 38.179310344827584, + "grad_norm": 0.519919753074646, + "learning_rate": 2.824505747126437e-05, + "loss": 0.2057, + "step": 11072 + }, + { + "epoch": 38.182758620689654, + "grad_norm": 0.8646356463432312, + "learning_rate": 2.8244597701149424e-05, + "loss": 0.1801, + "step": 11073 + }, + { + "epoch": 38.186206896551724, + "grad_norm": 0.6832991242408752, + "learning_rate": 2.8244137931034486e-05, + "loss": 0.173, + "step": 11074 + }, + { + "epoch": 38.189655172413794, + "grad_norm": 0.791799783706665, + "learning_rate": 2.824367816091954e-05, + "loss": 0.1578, + "step": 11075 + }, + { + "epoch": 38.19310344827586, + "grad_norm": 0.8161206841468811, + "learning_rate": 2.8243218390804597e-05, + "loss": 0.1594, + "step": 11076 + }, + { + "epoch": 38.19655172413793, + "grad_norm": 0.5462909936904907, + "learning_rate": 2.8242758620689655e-05, + "loss": 0.1881, + "step": 11077 + }, + { + "epoch": 38.2, + "grad_norm": 0.623665988445282, + "learning_rate": 2.8242298850574714e-05, + "loss": 0.174, + "step": 11078 + }, + { + "epoch": 38.203448275862065, + "grad_norm": 0.6887049078941345, + "learning_rate": 2.8241839080459773e-05, + "loss": 0.166, + "step": 11079 + }, + { + "epoch": 38.206896551724135, + "grad_norm": 0.7617496848106384, + "learning_rate": 2.8241379310344828e-05, + "loss": 0.1421, + "step": 11080 + }, + { + "epoch": 38.210344827586205, + "grad_norm": 1.2492121458053589, + "learning_rate": 2.8240919540229883e-05, + "loss": 0.1299, + "step": 11081 + }, + { + "epoch": 38.213793103448275, + "grad_norm": 0.5571796298027039, + "learning_rate": 2.8240459770114945e-05, + "loss": 0.1636, + "step": 11082 + }, + { + "epoch": 38.217241379310344, + "grad_norm": 0.548718273639679, + "learning_rate": 2.824e-05, + "loss": 0.1412, + "step": 11083 + }, + { + "epoch": 38.220689655172414, + "grad_norm": 0.5747441053390503, + "learning_rate": 2.823954022988506e-05, + "loss": 0.1289, + "step": 11084 + }, + { + "epoch": 38.224137931034484, + "grad_norm": 1.056795358657837, + "learning_rate": 2.8239080459770115e-05, + "loss": 0.1525, + "step": 11085 + }, + { + "epoch": 38.227586206896554, + "grad_norm": 0.6217753887176514, + "learning_rate": 2.8238620689655173e-05, + "loss": 0.1095, + "step": 11086 + }, + { + "epoch": 38.23103448275862, + "grad_norm": 0.5540239214897156, + "learning_rate": 2.8238160919540232e-05, + "loss": 0.1128, + "step": 11087 + }, + { + "epoch": 38.234482758620686, + "grad_norm": 0.7455006837844849, + "learning_rate": 2.8237701149425287e-05, + "loss": 0.1238, + "step": 11088 + }, + { + "epoch": 38.237931034482756, + "grad_norm": 0.701106071472168, + "learning_rate": 2.8237241379310346e-05, + "loss": 0.141, + "step": 11089 + }, + { + "epoch": 38.241379310344826, + "grad_norm": 0.6624748706817627, + "learning_rate": 2.8236781609195405e-05, + "loss": 0.1147, + "step": 11090 + }, + { + "epoch": 38.244827586206895, + "grad_norm": 1.6331150531768799, + "learning_rate": 2.823632183908046e-05, + "loss": 0.1235, + "step": 11091 + }, + { + "epoch": 38.248275862068965, + "grad_norm": 0.73601895570755, + "learning_rate": 2.823586206896552e-05, + "loss": 0.133, + "step": 11092 + }, + { + "epoch": 38.251724137931035, + "grad_norm": 0.6321110725402832, + "learning_rate": 2.8235402298850574e-05, + "loss": 0.1053, + "step": 11093 + }, + { + "epoch": 38.255172413793105, + "grad_norm": 0.8897535800933838, + "learning_rate": 2.8234942528735636e-05, + "loss": 0.1272, + "step": 11094 + }, + { + "epoch": 38.258620689655174, + "grad_norm": 1.3054646253585815, + "learning_rate": 2.823448275862069e-05, + "loss": 0.1575, + "step": 11095 + }, + { + "epoch": 38.262068965517244, + "grad_norm": 0.8358314037322998, + "learning_rate": 2.8234022988505747e-05, + "loss": 0.2264, + "step": 11096 + }, + { + "epoch": 38.265517241379314, + "grad_norm": 0.785050630569458, + "learning_rate": 2.8233563218390805e-05, + "loss": 0.1888, + "step": 11097 + }, + { + "epoch": 38.26896551724138, + "grad_norm": 0.5590908527374268, + "learning_rate": 2.8233103448275864e-05, + "loss": 0.1773, + "step": 11098 + }, + { + "epoch": 38.272413793103446, + "grad_norm": 0.7016497850418091, + "learning_rate": 2.823264367816092e-05, + "loss": 0.1844, + "step": 11099 + }, + { + "epoch": 38.275862068965516, + "grad_norm": 0.698447048664093, + "learning_rate": 2.8232183908045978e-05, + "loss": 0.174, + "step": 11100 + }, + { + "epoch": 38.279310344827586, + "grad_norm": 0.6120927929878235, + "learning_rate": 2.8231724137931033e-05, + "loss": 0.1828, + "step": 11101 + }, + { + "epoch": 38.282758620689656, + "grad_norm": 0.5784707069396973, + "learning_rate": 2.8231264367816095e-05, + "loss": 0.1687, + "step": 11102 + }, + { + "epoch": 38.286206896551725, + "grad_norm": 0.9990240931510925, + "learning_rate": 2.823080459770115e-05, + "loss": 0.1852, + "step": 11103 + }, + { + "epoch": 38.289655172413795, + "grad_norm": 0.6674383878707886, + "learning_rate": 2.8230344827586206e-05, + "loss": 0.1368, + "step": 11104 + }, + { + "epoch": 38.293103448275865, + "grad_norm": 0.5530248880386353, + "learning_rate": 2.8229885057471265e-05, + "loss": 0.1575, + "step": 11105 + }, + { + "epoch": 38.296551724137935, + "grad_norm": 0.9319071769714355, + "learning_rate": 2.8229425287356323e-05, + "loss": 0.1595, + "step": 11106 + }, + { + "epoch": 38.3, + "grad_norm": 1.5229263305664062, + "learning_rate": 2.8228965517241382e-05, + "loss": 0.1718, + "step": 11107 + }, + { + "epoch": 38.30344827586207, + "grad_norm": 0.8065203428268433, + "learning_rate": 2.8228505747126437e-05, + "loss": 0.1456, + "step": 11108 + }, + { + "epoch": 38.30689655172414, + "grad_norm": 0.522131621837616, + "learning_rate": 2.8228045977011493e-05, + "loss": 0.1281, + "step": 11109 + }, + { + "epoch": 38.310344827586206, + "grad_norm": 2.7977569103240967, + "learning_rate": 2.8227586206896555e-05, + "loss": 0.1345, + "step": 11110 + }, + { + "epoch": 38.313793103448276, + "grad_norm": 0.5845361351966858, + "learning_rate": 2.822712643678161e-05, + "loss": 0.1277, + "step": 11111 + }, + { + "epoch": 38.317241379310346, + "grad_norm": 1.2738466262817383, + "learning_rate": 2.822666666666667e-05, + "loss": 0.1467, + "step": 11112 + }, + { + "epoch": 38.320689655172416, + "grad_norm": 0.6461541056632996, + "learning_rate": 2.8226206896551724e-05, + "loss": 0.1256, + "step": 11113 + }, + { + "epoch": 38.324137931034485, + "grad_norm": 1.1052881479263306, + "learning_rate": 2.8225747126436783e-05, + "loss": 0.1244, + "step": 11114 + }, + { + "epoch": 38.327586206896555, + "grad_norm": 1.7344257831573486, + "learning_rate": 2.822528735632184e-05, + "loss": 0.1138, + "step": 11115 + }, + { + "epoch": 38.33103448275862, + "grad_norm": 1.1886646747589111, + "learning_rate": 2.8224827586206897e-05, + "loss": 0.1311, + "step": 11116 + }, + { + "epoch": 38.33448275862069, + "grad_norm": 1.0006709098815918, + "learning_rate": 2.8224367816091955e-05, + "loss": 0.1282, + "step": 11117 + }, + { + "epoch": 38.33793103448276, + "grad_norm": 1.5593358278274536, + "learning_rate": 2.8223908045977014e-05, + "loss": 0.1271, + "step": 11118 + }, + { + "epoch": 38.34137931034483, + "grad_norm": 5.1732940673828125, + "learning_rate": 2.822344827586207e-05, + "loss": 0.1236, + "step": 11119 + }, + { + "epoch": 38.3448275862069, + "grad_norm": 3.5342190265655518, + "learning_rate": 2.8222988505747128e-05, + "loss": 0.1547, + "step": 11120 + }, + { + "epoch": 38.34827586206897, + "grad_norm": 0.6172340512275696, + "learning_rate": 2.8222528735632183e-05, + "loss": 0.2441, + "step": 11121 + }, + { + "epoch": 38.351724137931036, + "grad_norm": 1.1009495258331299, + "learning_rate": 2.8222068965517245e-05, + "loss": 0.1905, + "step": 11122 + }, + { + "epoch": 38.355172413793106, + "grad_norm": 0.8726471662521362, + "learning_rate": 2.82216091954023e-05, + "loss": 0.1886, + "step": 11123 + }, + { + "epoch": 38.358620689655176, + "grad_norm": 0.8554416298866272, + "learning_rate": 2.8221149425287356e-05, + "loss": 0.2054, + "step": 11124 + }, + { + "epoch": 38.36206896551724, + "grad_norm": 1.0626298189163208, + "learning_rate": 2.8220689655172415e-05, + "loss": 0.1911, + "step": 11125 + }, + { + "epoch": 38.36551724137931, + "grad_norm": 0.6079199910163879, + "learning_rate": 2.8220229885057473e-05, + "loss": 0.1654, + "step": 11126 + }, + { + "epoch": 38.36896551724138, + "grad_norm": 0.6212504506111145, + "learning_rate": 2.8219770114942532e-05, + "loss": 0.1557, + "step": 11127 + }, + { + "epoch": 38.37241379310345, + "grad_norm": 0.496440052986145, + "learning_rate": 2.8219310344827587e-05, + "loss": 0.1584, + "step": 11128 + }, + { + "epoch": 38.37586206896552, + "grad_norm": 0.5732938647270203, + "learning_rate": 2.8218850574712642e-05, + "loss": 0.1365, + "step": 11129 + }, + { + "epoch": 38.37931034482759, + "grad_norm": 1.3416622877120972, + "learning_rate": 2.8218390804597705e-05, + "loss": 0.1582, + "step": 11130 + }, + { + "epoch": 38.38275862068966, + "grad_norm": 1.3807644844055176, + "learning_rate": 2.821793103448276e-05, + "loss": 0.1625, + "step": 11131 + }, + { + "epoch": 38.38620689655173, + "grad_norm": 0.9106972813606262, + "learning_rate": 2.8217471264367815e-05, + "loss": 0.147, + "step": 11132 + }, + { + "epoch": 38.389655172413796, + "grad_norm": 1.0389920473098755, + "learning_rate": 2.8217011494252874e-05, + "loss": 0.1461, + "step": 11133 + }, + { + "epoch": 38.39310344827586, + "grad_norm": 0.7202304005622864, + "learning_rate": 2.8216551724137933e-05, + "loss": 0.144, + "step": 11134 + }, + { + "epoch": 38.39655172413793, + "grad_norm": 0.555806577205658, + "learning_rate": 2.821609195402299e-05, + "loss": 0.1248, + "step": 11135 + }, + { + "epoch": 38.4, + "grad_norm": 0.7730570435523987, + "learning_rate": 2.8215632183908046e-05, + "loss": 0.1607, + "step": 11136 + }, + { + "epoch": 38.40344827586207, + "grad_norm": 0.7074033617973328, + "learning_rate": 2.8215172413793102e-05, + "loss": 0.127, + "step": 11137 + }, + { + "epoch": 38.40689655172414, + "grad_norm": 0.6953191161155701, + "learning_rate": 2.8214712643678164e-05, + "loss": 0.1298, + "step": 11138 + }, + { + "epoch": 38.41034482758621, + "grad_norm": 0.9239925742149353, + "learning_rate": 2.821425287356322e-05, + "loss": 0.1452, + "step": 11139 + }, + { + "epoch": 38.41379310344828, + "grad_norm": 1.2424966096878052, + "learning_rate": 2.8213793103448278e-05, + "loss": 0.1271, + "step": 11140 + }, + { + "epoch": 38.41724137931035, + "grad_norm": 2.8544068336486816, + "learning_rate": 2.8213333333333333e-05, + "loss": 0.1272, + "step": 11141 + }, + { + "epoch": 38.42068965517242, + "grad_norm": 0.7662988305091858, + "learning_rate": 2.8212873563218392e-05, + "loss": 0.1282, + "step": 11142 + }, + { + "epoch": 38.42413793103448, + "grad_norm": 0.9247673153877258, + "learning_rate": 2.821241379310345e-05, + "loss": 0.1239, + "step": 11143 + }, + { + "epoch": 38.42758620689655, + "grad_norm": 0.7490770816802979, + "learning_rate": 2.8211954022988506e-05, + "loss": 0.1217, + "step": 11144 + }, + { + "epoch": 38.43103448275862, + "grad_norm": 1.0376331806182861, + "learning_rate": 2.8211494252873564e-05, + "loss": 0.2088, + "step": 11145 + }, + { + "epoch": 38.43448275862069, + "grad_norm": 1.3446253538131714, + "learning_rate": 2.8211034482758623e-05, + "loss": 0.2585, + "step": 11146 + }, + { + "epoch": 38.43793103448276, + "grad_norm": 0.7463351488113403, + "learning_rate": 2.821057471264368e-05, + "loss": 0.1908, + "step": 11147 + }, + { + "epoch": 38.44137931034483, + "grad_norm": 0.9018734693527222, + "learning_rate": 2.8210114942528737e-05, + "loss": 0.2099, + "step": 11148 + }, + { + "epoch": 38.4448275862069, + "grad_norm": 0.5361537337303162, + "learning_rate": 2.8209655172413792e-05, + "loss": 0.1638, + "step": 11149 + }, + { + "epoch": 38.44827586206897, + "grad_norm": 0.693289577960968, + "learning_rate": 2.820919540229885e-05, + "loss": 0.1834, + "step": 11150 + }, + { + "epoch": 38.45172413793104, + "grad_norm": 0.7946658730506897, + "learning_rate": 2.820873563218391e-05, + "loss": 0.1651, + "step": 11151 + }, + { + "epoch": 38.4551724137931, + "grad_norm": 0.5924472808837891, + "learning_rate": 2.8208275862068965e-05, + "loss": 0.1621, + "step": 11152 + }, + { + "epoch": 38.45862068965517, + "grad_norm": 0.5958293676376343, + "learning_rate": 2.8207816091954024e-05, + "loss": 0.1666, + "step": 11153 + }, + { + "epoch": 38.46206896551724, + "grad_norm": 0.5247872471809387, + "learning_rate": 2.820735632183908e-05, + "loss": 0.157, + "step": 11154 + }, + { + "epoch": 38.46551724137931, + "grad_norm": 0.541880190372467, + "learning_rate": 2.820689655172414e-05, + "loss": 0.1612, + "step": 11155 + }, + { + "epoch": 38.46896551724138, + "grad_norm": 0.5855715870857239, + "learning_rate": 2.8206436781609196e-05, + "loss": 0.1473, + "step": 11156 + }, + { + "epoch": 38.47241379310345, + "grad_norm": 2.371551513671875, + "learning_rate": 2.8205977011494252e-05, + "loss": 0.1635, + "step": 11157 + }, + { + "epoch": 38.47586206896552, + "grad_norm": 0.6856994032859802, + "learning_rate": 2.820551724137931e-05, + "loss": 0.1512, + "step": 11158 + }, + { + "epoch": 38.47931034482759, + "grad_norm": 0.8492767810821533, + "learning_rate": 2.820505747126437e-05, + "loss": 0.148, + "step": 11159 + }, + { + "epoch": 38.48275862068966, + "grad_norm": 1.0750398635864258, + "learning_rate": 2.8204597701149424e-05, + "loss": 0.1501, + "step": 11160 + }, + { + "epoch": 38.48620689655172, + "grad_norm": 0.5936921238899231, + "learning_rate": 2.8204137931034483e-05, + "loss": 0.1534, + "step": 11161 + }, + { + "epoch": 38.48965517241379, + "grad_norm": 0.817501425743103, + "learning_rate": 2.820367816091954e-05, + "loss": 0.1196, + "step": 11162 + }, + { + "epoch": 38.49310344827586, + "grad_norm": 0.9979875087738037, + "learning_rate": 2.82032183908046e-05, + "loss": 0.1327, + "step": 11163 + }, + { + "epoch": 38.49655172413793, + "grad_norm": 0.7728829979896545, + "learning_rate": 2.8202758620689656e-05, + "loss": 0.1424, + "step": 11164 + }, + { + "epoch": 38.5, + "grad_norm": 0.8059385418891907, + "learning_rate": 2.820229885057471e-05, + "loss": 0.1282, + "step": 11165 + }, + { + "epoch": 38.50344827586207, + "grad_norm": 0.7044061422348022, + "learning_rate": 2.820183908045977e-05, + "loss": 0.1546, + "step": 11166 + }, + { + "epoch": 38.50689655172414, + "grad_norm": 0.6805763840675354, + "learning_rate": 2.820137931034483e-05, + "loss": 0.1217, + "step": 11167 + }, + { + "epoch": 38.51034482758621, + "grad_norm": 1.2128427028656006, + "learning_rate": 2.8200919540229887e-05, + "loss": 0.1011, + "step": 11168 + }, + { + "epoch": 38.51379310344828, + "grad_norm": 0.8982587456703186, + "learning_rate": 2.8200459770114942e-05, + "loss": 0.1269, + "step": 11169 + }, + { + "epoch": 38.51724137931034, + "grad_norm": 1.0519921779632568, + "learning_rate": 2.8199999999999998e-05, + "loss": 0.1741, + "step": 11170 + }, + { + "epoch": 38.52068965517241, + "grad_norm": 0.712051510810852, + "learning_rate": 2.819954022988506e-05, + "loss": 0.2228, + "step": 11171 + }, + { + "epoch": 38.52413793103448, + "grad_norm": 1.0443302392959595, + "learning_rate": 2.8199080459770115e-05, + "loss": 0.1965, + "step": 11172 + }, + { + "epoch": 38.52758620689655, + "grad_norm": 0.8447859883308411, + "learning_rate": 2.8198620689655174e-05, + "loss": 0.1917, + "step": 11173 + }, + { + "epoch": 38.53103448275862, + "grad_norm": 1.6274302005767822, + "learning_rate": 2.819816091954023e-05, + "loss": 0.1663, + "step": 11174 + }, + { + "epoch": 38.53448275862069, + "grad_norm": 0.6539332866668701, + "learning_rate": 2.8197701149425288e-05, + "loss": 0.184, + "step": 11175 + }, + { + "epoch": 38.53793103448276, + "grad_norm": 2.3592281341552734, + "learning_rate": 2.8197241379310346e-05, + "loss": 0.1568, + "step": 11176 + }, + { + "epoch": 38.54137931034483, + "grad_norm": 0.778899610042572, + "learning_rate": 2.81967816091954e-05, + "loss": 0.1901, + "step": 11177 + }, + { + "epoch": 38.5448275862069, + "grad_norm": 0.66343092918396, + "learning_rate": 2.819632183908046e-05, + "loss": 0.1642, + "step": 11178 + }, + { + "epoch": 38.54827586206896, + "grad_norm": 1.4064241647720337, + "learning_rate": 2.819586206896552e-05, + "loss": 0.1544, + "step": 11179 + }, + { + "epoch": 38.55172413793103, + "grad_norm": 0.7369663715362549, + "learning_rate": 2.8195402298850574e-05, + "loss": 0.138, + "step": 11180 + }, + { + "epoch": 38.5551724137931, + "grad_norm": 0.7397128343582153, + "learning_rate": 2.8194942528735633e-05, + "loss": 0.1493, + "step": 11181 + }, + { + "epoch": 38.55862068965517, + "grad_norm": 0.9013453722000122, + "learning_rate": 2.819448275862069e-05, + "loss": 0.1754, + "step": 11182 + }, + { + "epoch": 38.56206896551724, + "grad_norm": 0.7493063807487488, + "learning_rate": 2.819402298850575e-05, + "loss": 0.1474, + "step": 11183 + }, + { + "epoch": 38.56551724137931, + "grad_norm": 0.9977419972419739, + "learning_rate": 2.8193563218390806e-05, + "loss": 0.1724, + "step": 11184 + }, + { + "epoch": 38.56896551724138, + "grad_norm": 1.036911964416504, + "learning_rate": 2.819310344827586e-05, + "loss": 0.1644, + "step": 11185 + }, + { + "epoch": 38.57241379310345, + "grad_norm": 1.0633352994918823, + "learning_rate": 2.819264367816092e-05, + "loss": 0.1588, + "step": 11186 + }, + { + "epoch": 38.57586206896552, + "grad_norm": 1.0141501426696777, + "learning_rate": 2.819218390804598e-05, + "loss": 0.1336, + "step": 11187 + }, + { + "epoch": 38.57931034482758, + "grad_norm": 0.8761879205703735, + "learning_rate": 2.8191724137931034e-05, + "loss": 0.1414, + "step": 11188 + }, + { + "epoch": 38.58275862068965, + "grad_norm": 0.6292900443077087, + "learning_rate": 2.8191264367816092e-05, + "loss": 0.1258, + "step": 11189 + }, + { + "epoch": 38.58620689655172, + "grad_norm": 1.840221881866455, + "learning_rate": 2.8190804597701148e-05, + "loss": 0.1381, + "step": 11190 + }, + { + "epoch": 38.58965517241379, + "grad_norm": 0.7856280207633972, + "learning_rate": 2.819034482758621e-05, + "loss": 0.1404, + "step": 11191 + }, + { + "epoch": 38.59310344827586, + "grad_norm": 0.9368109107017517, + "learning_rate": 2.8189885057471265e-05, + "loss": 0.1211, + "step": 11192 + }, + { + "epoch": 38.59655172413793, + "grad_norm": 0.7232136130332947, + "learning_rate": 2.818942528735632e-05, + "loss": 0.1263, + "step": 11193 + }, + { + "epoch": 38.6, + "grad_norm": 0.7170366048812866, + "learning_rate": 2.818896551724138e-05, + "loss": 0.1318, + "step": 11194 + }, + { + "epoch": 38.60344827586207, + "grad_norm": 1.0634135007858276, + "learning_rate": 2.8188505747126438e-05, + "loss": 0.1605, + "step": 11195 + }, + { + "epoch": 38.60689655172414, + "grad_norm": 0.5701091289520264, + "learning_rate": 2.8188045977011496e-05, + "loss": 0.235, + "step": 11196 + }, + { + "epoch": 38.610344827586204, + "grad_norm": 0.7595037221908569, + "learning_rate": 2.818758620689655e-05, + "loss": 0.2031, + "step": 11197 + }, + { + "epoch": 38.61379310344827, + "grad_norm": 0.7609796524047852, + "learning_rate": 2.8187126436781607e-05, + "loss": 0.1746, + "step": 11198 + }, + { + "epoch": 38.61724137931034, + "grad_norm": 0.741205632686615, + "learning_rate": 2.818666666666667e-05, + "loss": 0.1755, + "step": 11199 + }, + { + "epoch": 38.62068965517241, + "grad_norm": 0.5766764283180237, + "learning_rate": 2.8186206896551724e-05, + "loss": 0.169, + "step": 11200 + }, + { + "epoch": 38.62413793103448, + "grad_norm": 0.8556082844734192, + "learning_rate": 2.8185747126436783e-05, + "loss": 0.1857, + "step": 11201 + }, + { + "epoch": 38.62758620689655, + "grad_norm": 0.8908397555351257, + "learning_rate": 2.8185287356321838e-05, + "loss": 0.1601, + "step": 11202 + }, + { + "epoch": 38.63103448275862, + "grad_norm": 1.4185616970062256, + "learning_rate": 2.8184827586206897e-05, + "loss": 0.1557, + "step": 11203 + }, + { + "epoch": 38.63448275862069, + "grad_norm": 0.6099755167961121, + "learning_rate": 2.8184367816091956e-05, + "loss": 0.1468, + "step": 11204 + }, + { + "epoch": 38.63793103448276, + "grad_norm": 0.7966406345367432, + "learning_rate": 2.818390804597701e-05, + "loss": 0.1514, + "step": 11205 + }, + { + "epoch": 38.641379310344824, + "grad_norm": 1.1357074975967407, + "learning_rate": 2.818344827586207e-05, + "loss": 0.1456, + "step": 11206 + }, + { + "epoch": 38.644827586206894, + "grad_norm": 1.0193994045257568, + "learning_rate": 2.8182988505747128e-05, + "loss": 0.16, + "step": 11207 + }, + { + "epoch": 38.648275862068964, + "grad_norm": 1.0271649360656738, + "learning_rate": 2.8182528735632184e-05, + "loss": 0.1416, + "step": 11208 + }, + { + "epoch": 38.65172413793103, + "grad_norm": 0.6540797352790833, + "learning_rate": 2.8182068965517242e-05, + "loss": 0.1504, + "step": 11209 + }, + { + "epoch": 38.6551724137931, + "grad_norm": 0.669283926486969, + "learning_rate": 2.8181609195402298e-05, + "loss": 0.1587, + "step": 11210 + }, + { + "epoch": 38.65862068965517, + "grad_norm": 3.3099746704101562, + "learning_rate": 2.818114942528736e-05, + "loss": 0.138, + "step": 11211 + }, + { + "epoch": 38.66206896551724, + "grad_norm": 0.7048425078392029, + "learning_rate": 2.8180689655172415e-05, + "loss": 0.1377, + "step": 11212 + }, + { + "epoch": 38.66551724137931, + "grad_norm": 3.8509955406188965, + "learning_rate": 2.818022988505747e-05, + "loss": 0.1572, + "step": 11213 + }, + { + "epoch": 38.66896551724138, + "grad_norm": 0.6190562844276428, + "learning_rate": 2.817977011494253e-05, + "loss": 0.117, + "step": 11214 + }, + { + "epoch": 38.672413793103445, + "grad_norm": 0.7845790386199951, + "learning_rate": 2.8179310344827588e-05, + "loss": 0.1339, + "step": 11215 + }, + { + "epoch": 38.675862068965515, + "grad_norm": 1.3034923076629639, + "learning_rate": 2.8178850574712646e-05, + "loss": 0.1236, + "step": 11216 + }, + { + "epoch": 38.679310344827584, + "grad_norm": 0.6990228295326233, + "learning_rate": 2.81783908045977e-05, + "loss": 0.1069, + "step": 11217 + }, + { + "epoch": 38.682758620689654, + "grad_norm": 0.9575676321983337, + "learning_rate": 2.8177931034482757e-05, + "loss": 0.1258, + "step": 11218 + }, + { + "epoch": 38.686206896551724, + "grad_norm": 0.7768901586532593, + "learning_rate": 2.817747126436782e-05, + "loss": 0.1392, + "step": 11219 + }, + { + "epoch": 38.689655172413794, + "grad_norm": 1.1386855840682983, + "learning_rate": 2.8177011494252874e-05, + "loss": 0.1685, + "step": 11220 + }, + { + "epoch": 38.69310344827586, + "grad_norm": 0.674602746963501, + "learning_rate": 2.817655172413793e-05, + "loss": 0.2232, + "step": 11221 + }, + { + "epoch": 38.69655172413793, + "grad_norm": 0.7140346765518188, + "learning_rate": 2.8176091954022988e-05, + "loss": 0.2039, + "step": 11222 + }, + { + "epoch": 38.7, + "grad_norm": 0.6936749219894409, + "learning_rate": 2.8175632183908047e-05, + "loss": 0.1843, + "step": 11223 + }, + { + "epoch": 38.703448275862065, + "grad_norm": 0.7373558282852173, + "learning_rate": 2.8175172413793106e-05, + "loss": 0.1581, + "step": 11224 + }, + { + "epoch": 38.706896551724135, + "grad_norm": 1.226414442062378, + "learning_rate": 2.817471264367816e-05, + "loss": 0.1753, + "step": 11225 + }, + { + "epoch": 38.710344827586205, + "grad_norm": 1.0694701671600342, + "learning_rate": 2.8174252873563216e-05, + "loss": 0.1643, + "step": 11226 + }, + { + "epoch": 38.713793103448275, + "grad_norm": 0.8881990909576416, + "learning_rate": 2.8173793103448278e-05, + "loss": 0.1766, + "step": 11227 + }, + { + "epoch": 38.717241379310344, + "grad_norm": 0.8632447719573975, + "learning_rate": 2.8173333333333334e-05, + "loss": 0.1886, + "step": 11228 + }, + { + "epoch": 38.720689655172414, + "grad_norm": 0.9795783758163452, + "learning_rate": 2.8172873563218392e-05, + "loss": 0.1837, + "step": 11229 + }, + { + "epoch": 38.724137931034484, + "grad_norm": 1.17487633228302, + "learning_rate": 2.8172413793103447e-05, + "loss": 0.145, + "step": 11230 + }, + { + "epoch": 38.727586206896554, + "grad_norm": 0.6265134811401367, + "learning_rate": 2.8171954022988506e-05, + "loss": 0.1517, + "step": 11231 + }, + { + "epoch": 38.73103448275862, + "grad_norm": 1.9006118774414062, + "learning_rate": 2.8171494252873565e-05, + "loss": 0.1642, + "step": 11232 + }, + { + "epoch": 38.734482758620686, + "grad_norm": 0.6617832779884338, + "learning_rate": 2.817103448275862e-05, + "loss": 0.1569, + "step": 11233 + }, + { + "epoch": 38.737931034482756, + "grad_norm": 0.9698827862739563, + "learning_rate": 2.817057471264368e-05, + "loss": 0.1408, + "step": 11234 + }, + { + "epoch": 38.741379310344826, + "grad_norm": 0.6907277703285217, + "learning_rate": 2.8170114942528738e-05, + "loss": 0.1459, + "step": 11235 + }, + { + "epoch": 38.744827586206895, + "grad_norm": 0.6659308075904846, + "learning_rate": 2.8169655172413793e-05, + "loss": 0.1455, + "step": 11236 + }, + { + "epoch": 38.748275862068965, + "grad_norm": 0.9962143898010254, + "learning_rate": 2.816919540229885e-05, + "loss": 0.1424, + "step": 11237 + }, + { + "epoch": 38.751724137931035, + "grad_norm": 0.5742040872573853, + "learning_rate": 2.8168735632183907e-05, + "loss": 0.1398, + "step": 11238 + }, + { + "epoch": 38.755172413793105, + "grad_norm": 0.9889978170394897, + "learning_rate": 2.816827586206897e-05, + "loss": 0.1347, + "step": 11239 + }, + { + "epoch": 38.758620689655174, + "grad_norm": 0.7384815812110901, + "learning_rate": 2.8167816091954024e-05, + "loss": 0.1177, + "step": 11240 + }, + { + "epoch": 38.762068965517244, + "grad_norm": 0.8404116034507751, + "learning_rate": 2.816735632183908e-05, + "loss": 0.129, + "step": 11241 + }, + { + "epoch": 38.765517241379314, + "grad_norm": 0.6753665804862976, + "learning_rate": 2.8166896551724138e-05, + "loss": 0.1138, + "step": 11242 + }, + { + "epoch": 38.76896551724138, + "grad_norm": 0.9615057706832886, + "learning_rate": 2.8166436781609197e-05, + "loss": 0.1261, + "step": 11243 + }, + { + "epoch": 38.772413793103446, + "grad_norm": 1.154229998588562, + "learning_rate": 2.8165977011494255e-05, + "loss": 0.1195, + "step": 11244 + }, + { + "epoch": 38.775862068965516, + "grad_norm": 1.661726474761963, + "learning_rate": 2.816551724137931e-05, + "loss": 0.1417, + "step": 11245 + }, + { + "epoch": 38.779310344827586, + "grad_norm": 2.558492660522461, + "learning_rate": 2.8165057471264366e-05, + "loss": 0.2247, + "step": 11246 + }, + { + "epoch": 38.782758620689656, + "grad_norm": 0.5422631502151489, + "learning_rate": 2.8164597701149428e-05, + "loss": 0.1866, + "step": 11247 + }, + { + "epoch": 38.786206896551725, + "grad_norm": 0.5166971683502197, + "learning_rate": 2.8164137931034483e-05, + "loss": 0.201, + "step": 11248 + }, + { + "epoch": 38.789655172413795, + "grad_norm": 0.8265708088874817, + "learning_rate": 2.816367816091954e-05, + "loss": 0.1776, + "step": 11249 + }, + { + "epoch": 38.793103448275865, + "grad_norm": 1.0693845748901367, + "learning_rate": 2.8163218390804597e-05, + "loss": 0.1796, + "step": 11250 + }, + { + "epoch": 38.796551724137935, + "grad_norm": 0.6405310034751892, + "learning_rate": 2.8162758620689656e-05, + "loss": 0.1773, + "step": 11251 + }, + { + "epoch": 38.8, + "grad_norm": 0.7177334427833557, + "learning_rate": 2.8162298850574715e-05, + "loss": 0.1751, + "step": 11252 + }, + { + "epoch": 38.80344827586207, + "grad_norm": 0.7087022662162781, + "learning_rate": 2.816183908045977e-05, + "loss": 0.1711, + "step": 11253 + }, + { + "epoch": 38.80689655172414, + "grad_norm": 0.6847899556159973, + "learning_rate": 2.8161379310344825e-05, + "loss": 0.1366, + "step": 11254 + }, + { + "epoch": 38.810344827586206, + "grad_norm": 0.6740031838417053, + "learning_rate": 2.8160919540229887e-05, + "loss": 0.1679, + "step": 11255 + }, + { + "epoch": 38.813793103448276, + "grad_norm": 0.8846389651298523, + "learning_rate": 2.8160459770114943e-05, + "loss": 0.1506, + "step": 11256 + }, + { + "epoch": 38.817241379310346, + "grad_norm": 1.050904393196106, + "learning_rate": 2.816e-05, + "loss": 0.1412, + "step": 11257 + }, + { + "epoch": 38.820689655172416, + "grad_norm": 0.5958207845687866, + "learning_rate": 2.8159540229885057e-05, + "loss": 0.146, + "step": 11258 + }, + { + "epoch": 38.824137931034485, + "grad_norm": 1.9387481212615967, + "learning_rate": 2.8159080459770115e-05, + "loss": 0.1524, + "step": 11259 + }, + { + "epoch": 38.827586206896555, + "grad_norm": 0.675316572189331, + "learning_rate": 2.8158620689655174e-05, + "loss": 0.1672, + "step": 11260 + }, + { + "epoch": 38.83103448275862, + "grad_norm": 0.8230834007263184, + "learning_rate": 2.815816091954023e-05, + "loss": 0.1504, + "step": 11261 + }, + { + "epoch": 38.83448275862069, + "grad_norm": 0.6209582090377808, + "learning_rate": 2.8157701149425288e-05, + "loss": 0.1349, + "step": 11262 + }, + { + "epoch": 38.83793103448276, + "grad_norm": 1.9919519424438477, + "learning_rate": 2.8157241379310347e-05, + "loss": 0.1511, + "step": 11263 + }, + { + "epoch": 38.84137931034483, + "grad_norm": 0.7193061113357544, + "learning_rate": 2.8156781609195402e-05, + "loss": 0.1433, + "step": 11264 + }, + { + "epoch": 38.8448275862069, + "grad_norm": 1.1764130592346191, + "learning_rate": 2.815632183908046e-05, + "loss": 0.1382, + "step": 11265 + }, + { + "epoch": 38.84827586206897, + "grad_norm": 2.513911485671997, + "learning_rate": 2.8155862068965516e-05, + "loss": 0.1382, + "step": 11266 + }, + { + "epoch": 38.851724137931036, + "grad_norm": 0.906965970993042, + "learning_rate": 2.8155402298850578e-05, + "loss": 0.1118, + "step": 11267 + }, + { + "epoch": 38.855172413793106, + "grad_norm": 0.846393883228302, + "learning_rate": 2.8154942528735633e-05, + "loss": 0.121, + "step": 11268 + }, + { + "epoch": 38.858620689655176, + "grad_norm": 1.06280517578125, + "learning_rate": 2.815448275862069e-05, + "loss": 0.1765, + "step": 11269 + }, + { + "epoch": 38.86206896551724, + "grad_norm": 1.3685685396194458, + "learning_rate": 2.8154022988505747e-05, + "loss": 0.1882, + "step": 11270 + }, + { + "epoch": 38.86551724137931, + "grad_norm": 0.7835462689399719, + "learning_rate": 2.8153563218390806e-05, + "loss": 0.2189, + "step": 11271 + }, + { + "epoch": 38.86896551724138, + "grad_norm": 0.5669861435890198, + "learning_rate": 2.8153103448275865e-05, + "loss": 0.1614, + "step": 11272 + }, + { + "epoch": 38.87241379310345, + "grad_norm": 0.8797566294670105, + "learning_rate": 2.815264367816092e-05, + "loss": 0.1926, + "step": 11273 + }, + { + "epoch": 38.87586206896552, + "grad_norm": 0.6453063488006592, + "learning_rate": 2.8152183908045975e-05, + "loss": 0.1725, + "step": 11274 + }, + { + "epoch": 38.87931034482759, + "grad_norm": 0.6178017258644104, + "learning_rate": 2.8151724137931037e-05, + "loss": 0.1767, + "step": 11275 + }, + { + "epoch": 38.88275862068966, + "grad_norm": 0.8502981662750244, + "learning_rate": 2.8151264367816093e-05, + "loss": 0.1924, + "step": 11276 + }, + { + "epoch": 38.88620689655173, + "grad_norm": 0.5046601295471191, + "learning_rate": 2.8150804597701148e-05, + "loss": 0.1677, + "step": 11277 + }, + { + "epoch": 38.889655172413796, + "grad_norm": 0.8824790716171265, + "learning_rate": 2.8150344827586207e-05, + "loss": 0.1649, + "step": 11278 + }, + { + "epoch": 38.89310344827586, + "grad_norm": 0.547552764415741, + "learning_rate": 2.8149885057471265e-05, + "loss": 0.169, + "step": 11279 + }, + { + "epoch": 38.89655172413793, + "grad_norm": 0.5297574400901794, + "learning_rate": 2.8149425287356324e-05, + "loss": 0.1518, + "step": 11280 + }, + { + "epoch": 38.9, + "grad_norm": 0.6053862571716309, + "learning_rate": 2.814896551724138e-05, + "loss": 0.1612, + "step": 11281 + }, + { + "epoch": 38.90344827586207, + "grad_norm": 0.7442946434020996, + "learning_rate": 2.8148505747126435e-05, + "loss": 0.1471, + "step": 11282 + }, + { + "epoch": 38.90689655172414, + "grad_norm": 0.6418216228485107, + "learning_rate": 2.8148045977011497e-05, + "loss": 0.1492, + "step": 11283 + }, + { + "epoch": 38.91034482758621, + "grad_norm": 0.5186421275138855, + "learning_rate": 2.8147586206896552e-05, + "loss": 0.1468, + "step": 11284 + }, + { + "epoch": 38.91379310344828, + "grad_norm": 0.6584075093269348, + "learning_rate": 2.814712643678161e-05, + "loss": 0.1599, + "step": 11285 + }, + { + "epoch": 38.91724137931035, + "grad_norm": 0.5549549460411072, + "learning_rate": 2.8146666666666666e-05, + "loss": 0.1358, + "step": 11286 + }, + { + "epoch": 38.92068965517242, + "grad_norm": 0.79872065782547, + "learning_rate": 2.8146206896551725e-05, + "loss": 0.1275, + "step": 11287 + }, + { + "epoch": 38.92413793103448, + "grad_norm": 0.7759601473808289, + "learning_rate": 2.8145747126436783e-05, + "loss": 0.1221, + "step": 11288 + }, + { + "epoch": 38.92758620689655, + "grad_norm": 1.0268313884735107, + "learning_rate": 2.814528735632184e-05, + "loss": 0.1338, + "step": 11289 + }, + { + "epoch": 38.93103448275862, + "grad_norm": 0.8918996453285217, + "learning_rate": 2.8144827586206897e-05, + "loss": 0.1302, + "step": 11290 + }, + { + "epoch": 38.93448275862069, + "grad_norm": 0.6283668279647827, + "learning_rate": 2.8144367816091956e-05, + "loss": 0.1472, + "step": 11291 + }, + { + "epoch": 38.93793103448276, + "grad_norm": 0.7148279547691345, + "learning_rate": 2.814390804597701e-05, + "loss": 0.1096, + "step": 11292 + }, + { + "epoch": 38.94137931034483, + "grad_norm": 0.8609217405319214, + "learning_rate": 2.814344827586207e-05, + "loss": 0.1181, + "step": 11293 + }, + { + "epoch": 38.9448275862069, + "grad_norm": 2.1705758571624756, + "learning_rate": 2.8142988505747125e-05, + "loss": 0.1173, + "step": 11294 + }, + { + "epoch": 38.94827586206897, + "grad_norm": 1.1649090051651, + "learning_rate": 2.8142528735632187e-05, + "loss": 0.1902, + "step": 11295 + }, + { + "epoch": 38.95172413793104, + "grad_norm": 0.7025328874588013, + "learning_rate": 2.8142068965517243e-05, + "loss": 0.1935, + "step": 11296 + }, + { + "epoch": 38.9551724137931, + "grad_norm": 0.7072032690048218, + "learning_rate": 2.8141609195402298e-05, + "loss": 0.17, + "step": 11297 + }, + { + "epoch": 38.95862068965517, + "grad_norm": 0.5120487809181213, + "learning_rate": 2.8141149425287357e-05, + "loss": 0.1778, + "step": 11298 + }, + { + "epoch": 38.96206896551724, + "grad_norm": 0.7953117489814758, + "learning_rate": 2.8140689655172415e-05, + "loss": 0.1643, + "step": 11299 + }, + { + "epoch": 38.96551724137931, + "grad_norm": 1.2818915843963623, + "learning_rate": 2.8140229885057474e-05, + "loss": 0.1648, + "step": 11300 + }, + { + "epoch": 38.96896551724138, + "grad_norm": 0.9333861470222473, + "learning_rate": 2.813977011494253e-05, + "loss": 0.1882, + "step": 11301 + }, + { + "epoch": 38.97241379310345, + "grad_norm": 0.685040295124054, + "learning_rate": 2.8139310344827585e-05, + "loss": 0.1538, + "step": 11302 + }, + { + "epoch": 38.97586206896552, + "grad_norm": 0.8104397058486938, + "learning_rate": 2.8138850574712647e-05, + "loss": 0.1505, + "step": 11303 + }, + { + "epoch": 38.97931034482759, + "grad_norm": 0.7120717763900757, + "learning_rate": 2.8138390804597702e-05, + "loss": 0.1573, + "step": 11304 + }, + { + "epoch": 38.98275862068966, + "grad_norm": 0.8509507179260254, + "learning_rate": 2.813793103448276e-05, + "loss": 0.1634, + "step": 11305 + }, + { + "epoch": 38.98620689655172, + "grad_norm": 1.565614104270935, + "learning_rate": 2.8137471264367816e-05, + "loss": 0.1192, + "step": 11306 + }, + { + "epoch": 38.98965517241379, + "grad_norm": 1.0979952812194824, + "learning_rate": 2.8137011494252875e-05, + "loss": 0.1307, + "step": 11307 + }, + { + "epoch": 38.99310344827586, + "grad_norm": 0.9961676001548767, + "learning_rate": 2.8136551724137933e-05, + "loss": 0.1287, + "step": 11308 + }, + { + "epoch": 38.99655172413793, + "grad_norm": 1.2895593643188477, + "learning_rate": 2.813609195402299e-05, + "loss": 0.1135, + "step": 11309 + }, + { + "epoch": 39.0, + "grad_norm": 1.1720943450927734, + "learning_rate": 2.8135632183908044e-05, + "loss": 0.1684, + "step": 11310 + }, + { + "epoch": 39.00344827586207, + "grad_norm": 1.0253268480300903, + "learning_rate": 2.8135172413793106e-05, + "loss": 0.2464, + "step": 11311 + }, + { + "epoch": 39.00689655172414, + "grad_norm": 0.639373242855072, + "learning_rate": 2.813471264367816e-05, + "loss": 0.1996, + "step": 11312 + }, + { + "epoch": 39.01034482758621, + "grad_norm": 1.7809005975723267, + "learning_rate": 2.813425287356322e-05, + "loss": 0.1662, + "step": 11313 + }, + { + "epoch": 39.01379310344828, + "grad_norm": 0.6196741461753845, + "learning_rate": 2.8133793103448275e-05, + "loss": 0.1673, + "step": 11314 + }, + { + "epoch": 39.01724137931034, + "grad_norm": 0.8031917214393616, + "learning_rate": 2.8133333333333334e-05, + "loss": 0.1571, + "step": 11315 + }, + { + "epoch": 39.02068965517241, + "grad_norm": 0.6652750968933105, + "learning_rate": 2.8132873563218393e-05, + "loss": 0.1812, + "step": 11316 + }, + { + "epoch": 39.02413793103448, + "grad_norm": 0.6250159740447998, + "learning_rate": 2.8132413793103448e-05, + "loss": 0.1665, + "step": 11317 + }, + { + "epoch": 39.02758620689655, + "grad_norm": 0.5972087383270264, + "learning_rate": 2.8131954022988507e-05, + "loss": 0.1731, + "step": 11318 + }, + { + "epoch": 39.03103448275862, + "grad_norm": 1.7323660850524902, + "learning_rate": 2.8131494252873565e-05, + "loss": 0.1429, + "step": 11319 + }, + { + "epoch": 39.03448275862069, + "grad_norm": 0.7930275797843933, + "learning_rate": 2.813103448275862e-05, + "loss": 0.1575, + "step": 11320 + }, + { + "epoch": 39.03793103448276, + "grad_norm": 0.691792905330658, + "learning_rate": 2.813057471264368e-05, + "loss": 0.1434, + "step": 11321 + }, + { + "epoch": 39.04137931034483, + "grad_norm": 0.5114172101020813, + "learning_rate": 2.8130114942528735e-05, + "loss": 0.1478, + "step": 11322 + }, + { + "epoch": 39.0448275862069, + "grad_norm": 0.8113369941711426, + "learning_rate": 2.8129655172413797e-05, + "loss": 0.1446, + "step": 11323 + }, + { + "epoch": 39.04827586206896, + "grad_norm": 0.5466988682746887, + "learning_rate": 2.8129195402298852e-05, + "loss": 0.1445, + "step": 11324 + }, + { + "epoch": 39.05172413793103, + "grad_norm": 0.8558183312416077, + "learning_rate": 2.8128735632183907e-05, + "loss": 0.1358, + "step": 11325 + }, + { + "epoch": 39.0551724137931, + "grad_norm": 0.6201282739639282, + "learning_rate": 2.8128275862068966e-05, + "loss": 0.1212, + "step": 11326 + }, + { + "epoch": 39.05862068965517, + "grad_norm": 0.5900123119354248, + "learning_rate": 2.8127816091954025e-05, + "loss": 0.1322, + "step": 11327 + }, + { + "epoch": 39.06206896551724, + "grad_norm": 0.5589483976364136, + "learning_rate": 2.8127356321839083e-05, + "loss": 0.1284, + "step": 11328 + }, + { + "epoch": 39.06551724137931, + "grad_norm": 1.0415207147598267, + "learning_rate": 2.812689655172414e-05, + "loss": 0.1082, + "step": 11329 + }, + { + "epoch": 39.06896551724138, + "grad_norm": 1.2627681493759155, + "learning_rate": 2.8126436781609194e-05, + "loss": 0.1178, + "step": 11330 + }, + { + "epoch": 39.07241379310345, + "grad_norm": 0.8590753078460693, + "learning_rate": 2.8125977011494256e-05, + "loss": 0.1139, + "step": 11331 + }, + { + "epoch": 39.07586206896552, + "grad_norm": 0.6881011128425598, + "learning_rate": 2.812551724137931e-05, + "loss": 0.0885, + "step": 11332 + }, + { + "epoch": 39.07931034482758, + "grad_norm": 0.6686098575592041, + "learning_rate": 2.812505747126437e-05, + "loss": 0.0989, + "step": 11333 + }, + { + "epoch": 39.08275862068965, + "grad_norm": 1.1820886135101318, + "learning_rate": 2.8124597701149425e-05, + "loss": 0.1161, + "step": 11334 + }, + { + "epoch": 39.08620689655172, + "grad_norm": 1.5159265995025635, + "learning_rate": 2.8124137931034484e-05, + "loss": 0.1628, + "step": 11335 + }, + { + "epoch": 39.08965517241379, + "grad_norm": 0.8943869471549988, + "learning_rate": 2.8123678160919543e-05, + "loss": 0.2254, + "step": 11336 + }, + { + "epoch": 39.09310344827586, + "grad_norm": 0.6968327760696411, + "learning_rate": 2.8123218390804598e-05, + "loss": 0.2095, + "step": 11337 + }, + { + "epoch": 39.09655172413793, + "grad_norm": 1.3079099655151367, + "learning_rate": 2.8122758620689653e-05, + "loss": 0.1567, + "step": 11338 + }, + { + "epoch": 39.1, + "grad_norm": 1.5072187185287476, + "learning_rate": 2.8122298850574715e-05, + "loss": 0.1649, + "step": 11339 + }, + { + "epoch": 39.10344827586207, + "grad_norm": 0.6269375085830688, + "learning_rate": 2.812183908045977e-05, + "loss": 0.1536, + "step": 11340 + }, + { + "epoch": 39.10689655172414, + "grad_norm": 0.9921271204948425, + "learning_rate": 2.812137931034483e-05, + "loss": 0.175, + "step": 11341 + }, + { + "epoch": 39.110344827586204, + "grad_norm": 1.8879188299179077, + "learning_rate": 2.8120919540229884e-05, + "loss": 0.167, + "step": 11342 + }, + { + "epoch": 39.11379310344827, + "grad_norm": 0.7134392261505127, + "learning_rate": 2.8120459770114943e-05, + "loss": 0.1619, + "step": 11343 + }, + { + "epoch": 39.11724137931034, + "grad_norm": 0.651421844959259, + "learning_rate": 2.8120000000000002e-05, + "loss": 0.1367, + "step": 11344 + }, + { + "epoch": 39.12068965517241, + "grad_norm": 0.7347873449325562, + "learning_rate": 2.8119540229885057e-05, + "loss": 0.1416, + "step": 11345 + }, + { + "epoch": 39.12413793103448, + "grad_norm": 0.6486946940422058, + "learning_rate": 2.8119080459770116e-05, + "loss": 0.1481, + "step": 11346 + }, + { + "epoch": 39.12758620689655, + "grad_norm": 1.0482981204986572, + "learning_rate": 2.8118620689655174e-05, + "loss": 0.1384, + "step": 11347 + }, + { + "epoch": 39.13103448275862, + "grad_norm": 1.5896856784820557, + "learning_rate": 2.811816091954023e-05, + "loss": 0.148, + "step": 11348 + }, + { + "epoch": 39.13448275862069, + "grad_norm": 0.5301230549812317, + "learning_rate": 2.811770114942529e-05, + "loss": 0.1105, + "step": 11349 + }, + { + "epoch": 39.13793103448276, + "grad_norm": 0.8420941233634949, + "learning_rate": 2.8117241379310344e-05, + "loss": 0.1358, + "step": 11350 + }, + { + "epoch": 39.141379310344824, + "grad_norm": 1.6486687660217285, + "learning_rate": 2.8116781609195406e-05, + "loss": 0.1194, + "step": 11351 + }, + { + "epoch": 39.144827586206894, + "grad_norm": 0.5849422812461853, + "learning_rate": 2.811632183908046e-05, + "loss": 0.1264, + "step": 11352 + }, + { + "epoch": 39.148275862068964, + "grad_norm": 0.7122141718864441, + "learning_rate": 2.8115862068965516e-05, + "loss": 0.1453, + "step": 11353 + }, + { + "epoch": 39.15172413793103, + "grad_norm": 1.420987844467163, + "learning_rate": 2.8115402298850575e-05, + "loss": 0.1343, + "step": 11354 + }, + { + "epoch": 39.1551724137931, + "grad_norm": 2.1680684089660645, + "learning_rate": 2.8114942528735634e-05, + "loss": 0.1274, + "step": 11355 + }, + { + "epoch": 39.15862068965517, + "grad_norm": 0.5910596251487732, + "learning_rate": 2.8114482758620692e-05, + "loss": 0.1037, + "step": 11356 + }, + { + "epoch": 39.16206896551724, + "grad_norm": 1.0430805683135986, + "learning_rate": 2.8114022988505748e-05, + "loss": 0.1164, + "step": 11357 + }, + { + "epoch": 39.16551724137931, + "grad_norm": 1.087209939956665, + "learning_rate": 2.8113563218390803e-05, + "loss": 0.092, + "step": 11358 + }, + { + "epoch": 39.16896551724138, + "grad_norm": 0.7241266965866089, + "learning_rate": 2.8113103448275865e-05, + "loss": 0.1064, + "step": 11359 + }, + { + "epoch": 39.172413793103445, + "grad_norm": 2.1909291744232178, + "learning_rate": 2.811264367816092e-05, + "loss": 0.1136, + "step": 11360 + }, + { + "epoch": 39.175862068965515, + "grad_norm": 0.7654843926429749, + "learning_rate": 2.811218390804598e-05, + "loss": 0.2248, + "step": 11361 + }, + { + "epoch": 39.179310344827584, + "grad_norm": 1.2009152173995972, + "learning_rate": 2.8111724137931034e-05, + "loss": 0.1706, + "step": 11362 + }, + { + "epoch": 39.182758620689654, + "grad_norm": 0.9743526577949524, + "learning_rate": 2.8111264367816093e-05, + "loss": 0.1729, + "step": 11363 + }, + { + "epoch": 39.186206896551724, + "grad_norm": 0.7971171140670776, + "learning_rate": 2.8110804597701152e-05, + "loss": 0.154, + "step": 11364 + }, + { + "epoch": 39.189655172413794, + "grad_norm": 0.5320194363594055, + "learning_rate": 2.8110344827586207e-05, + "loss": 0.1562, + "step": 11365 + }, + { + "epoch": 39.19310344827586, + "grad_norm": 1.0084235668182373, + "learning_rate": 2.8109885057471262e-05, + "loss": 0.1559, + "step": 11366 + }, + { + "epoch": 39.19655172413793, + "grad_norm": 0.5684787034988403, + "learning_rate": 2.8109425287356324e-05, + "loss": 0.1559, + "step": 11367 + }, + { + "epoch": 39.2, + "grad_norm": 0.8705762624740601, + "learning_rate": 2.810896551724138e-05, + "loss": 0.1604, + "step": 11368 + }, + { + "epoch": 39.203448275862065, + "grad_norm": 1.0085281133651733, + "learning_rate": 2.810850574712644e-05, + "loss": 0.1464, + "step": 11369 + }, + { + "epoch": 39.206896551724135, + "grad_norm": 0.5042669773101807, + "learning_rate": 2.8108045977011494e-05, + "loss": 0.1336, + "step": 11370 + }, + { + "epoch": 39.210344827586205, + "grad_norm": 0.614955484867096, + "learning_rate": 2.8107586206896552e-05, + "loss": 0.1266, + "step": 11371 + }, + { + "epoch": 39.213793103448275, + "grad_norm": 1.8639941215515137, + "learning_rate": 2.810712643678161e-05, + "loss": 0.176, + "step": 11372 + }, + { + "epoch": 39.217241379310344, + "grad_norm": 0.504321277141571, + "learning_rate": 2.8106666666666666e-05, + "loss": 0.1206, + "step": 11373 + }, + { + "epoch": 39.220689655172414, + "grad_norm": 0.8796669840812683, + "learning_rate": 2.8106206896551725e-05, + "loss": 0.1282, + "step": 11374 + }, + { + "epoch": 39.224137931034484, + "grad_norm": 0.7915198802947998, + "learning_rate": 2.8105747126436784e-05, + "loss": 0.1222, + "step": 11375 + }, + { + "epoch": 39.227586206896554, + "grad_norm": 0.6976243257522583, + "learning_rate": 2.810528735632184e-05, + "loss": 0.1445, + "step": 11376 + }, + { + "epoch": 39.23103448275862, + "grad_norm": 0.7113633751869202, + "learning_rate": 2.8104827586206898e-05, + "loss": 0.1292, + "step": 11377 + }, + { + "epoch": 39.234482758620686, + "grad_norm": 1.2345213890075684, + "learning_rate": 2.8104367816091953e-05, + "loss": 0.1336, + "step": 11378 + }, + { + "epoch": 39.237931034482756, + "grad_norm": 0.7949514389038086, + "learning_rate": 2.8103908045977015e-05, + "loss": 0.1245, + "step": 11379 + }, + { + "epoch": 39.241379310344826, + "grad_norm": 0.6540402173995972, + "learning_rate": 2.810344827586207e-05, + "loss": 0.1139, + "step": 11380 + }, + { + "epoch": 39.244827586206895, + "grad_norm": 0.9986185431480408, + "learning_rate": 2.8102988505747126e-05, + "loss": 0.124, + "step": 11381 + }, + { + "epoch": 39.248275862068965, + "grad_norm": 0.6179977059364319, + "learning_rate": 2.8102528735632184e-05, + "loss": 0.1214, + "step": 11382 + }, + { + "epoch": 39.251724137931035, + "grad_norm": 2.956439733505249, + "learning_rate": 2.8102068965517243e-05, + "loss": 0.1219, + "step": 11383 + }, + { + "epoch": 39.255172413793105, + "grad_norm": 2.9646363258361816, + "learning_rate": 2.8101609195402302e-05, + "loss": 0.108, + "step": 11384 + }, + { + "epoch": 39.258620689655174, + "grad_norm": 1.1277996301651, + "learning_rate": 2.8101149425287357e-05, + "loss": 0.1663, + "step": 11385 + }, + { + "epoch": 39.262068965517244, + "grad_norm": 0.8954567313194275, + "learning_rate": 2.8100689655172412e-05, + "loss": 0.2158, + "step": 11386 + }, + { + "epoch": 39.265517241379314, + "grad_norm": 0.5649714469909668, + "learning_rate": 2.8100229885057474e-05, + "loss": 0.1986, + "step": 11387 + }, + { + "epoch": 39.26896551724138, + "grad_norm": 0.4658292233943939, + "learning_rate": 2.809977011494253e-05, + "loss": 0.176, + "step": 11388 + }, + { + "epoch": 39.272413793103446, + "grad_norm": 0.6606435179710388, + "learning_rate": 2.809931034482759e-05, + "loss": 0.157, + "step": 11389 + }, + { + "epoch": 39.275862068965516, + "grad_norm": 0.6109241247177124, + "learning_rate": 2.8098850574712644e-05, + "loss": 0.1645, + "step": 11390 + }, + { + "epoch": 39.279310344827586, + "grad_norm": 0.71871417760849, + "learning_rate": 2.8098390804597702e-05, + "loss": 0.1765, + "step": 11391 + }, + { + "epoch": 39.282758620689656, + "grad_norm": 0.9430921077728271, + "learning_rate": 2.809793103448276e-05, + "loss": 0.1626, + "step": 11392 + }, + { + "epoch": 39.286206896551725, + "grad_norm": 1.2927865982055664, + "learning_rate": 2.8097471264367816e-05, + "loss": 0.1439, + "step": 11393 + }, + { + "epoch": 39.289655172413795, + "grad_norm": 0.7112196683883667, + "learning_rate": 2.8097011494252875e-05, + "loss": 0.131, + "step": 11394 + }, + { + "epoch": 39.293103448275865, + "grad_norm": 0.6491581201553345, + "learning_rate": 2.8096551724137934e-05, + "loss": 0.1389, + "step": 11395 + }, + { + "epoch": 39.296551724137935, + "grad_norm": 0.8715078830718994, + "learning_rate": 2.809609195402299e-05, + "loss": 0.1482, + "step": 11396 + }, + { + "epoch": 39.3, + "grad_norm": 1.048891544342041, + "learning_rate": 2.8095632183908048e-05, + "loss": 0.1618, + "step": 11397 + }, + { + "epoch": 39.30344827586207, + "grad_norm": 1.1776371002197266, + "learning_rate": 2.8095172413793103e-05, + "loss": 0.1521, + "step": 11398 + }, + { + "epoch": 39.30689655172414, + "grad_norm": 1.1293679475784302, + "learning_rate": 2.809471264367816e-05, + "loss": 0.143, + "step": 11399 + }, + { + "epoch": 39.310344827586206, + "grad_norm": 0.6758058071136475, + "learning_rate": 2.809425287356322e-05, + "loss": 0.1583, + "step": 11400 + }, + { + "epoch": 39.313793103448276, + "grad_norm": 2.2403564453125, + "learning_rate": 2.8093793103448276e-05, + "loss": 0.1384, + "step": 11401 + }, + { + "epoch": 39.317241379310346, + "grad_norm": 0.687269926071167, + "learning_rate": 2.8093333333333334e-05, + "loss": 0.1387, + "step": 11402 + }, + { + "epoch": 39.320689655172416, + "grad_norm": 1.706756830215454, + "learning_rate": 2.8092873563218393e-05, + "loss": 0.1237, + "step": 11403 + }, + { + "epoch": 39.324137931034485, + "grad_norm": 1.2068983316421509, + "learning_rate": 2.8092413793103448e-05, + "loss": 0.1263, + "step": 11404 + }, + { + "epoch": 39.327586206896555, + "grad_norm": 0.5400568842887878, + "learning_rate": 2.8091954022988507e-05, + "loss": 0.1114, + "step": 11405 + }, + { + "epoch": 39.33103448275862, + "grad_norm": 1.2803703546524048, + "learning_rate": 2.8091494252873562e-05, + "loss": 0.1056, + "step": 11406 + }, + { + "epoch": 39.33448275862069, + "grad_norm": 0.6489802002906799, + "learning_rate": 2.8091034482758624e-05, + "loss": 0.1099, + "step": 11407 + }, + { + "epoch": 39.33793103448276, + "grad_norm": 0.6319431066513062, + "learning_rate": 2.809057471264368e-05, + "loss": 0.1008, + "step": 11408 + }, + { + "epoch": 39.34137931034483, + "grad_norm": 0.856367826461792, + "learning_rate": 2.8090114942528735e-05, + "loss": 0.1146, + "step": 11409 + }, + { + "epoch": 39.3448275862069, + "grad_norm": 1.344766616821289, + "learning_rate": 2.8089655172413794e-05, + "loss": 0.1619, + "step": 11410 + }, + { + "epoch": 39.34827586206897, + "grad_norm": 0.6225053668022156, + "learning_rate": 2.8089195402298852e-05, + "loss": 0.2241, + "step": 11411 + }, + { + "epoch": 39.351724137931036, + "grad_norm": 0.4927051067352295, + "learning_rate": 2.808873563218391e-05, + "loss": 0.1713, + "step": 11412 + }, + { + "epoch": 39.355172413793106, + "grad_norm": 0.4878547191619873, + "learning_rate": 2.8088275862068966e-05, + "loss": 0.1898, + "step": 11413 + }, + { + "epoch": 39.358620689655176, + "grad_norm": 0.453934907913208, + "learning_rate": 2.808781609195402e-05, + "loss": 0.1788, + "step": 11414 + }, + { + "epoch": 39.36206896551724, + "grad_norm": 0.5316328406333923, + "learning_rate": 2.8087356321839084e-05, + "loss": 0.1842, + "step": 11415 + }, + { + "epoch": 39.36551724137931, + "grad_norm": 1.2046620845794678, + "learning_rate": 2.808689655172414e-05, + "loss": 0.139, + "step": 11416 + }, + { + "epoch": 39.36896551724138, + "grad_norm": 0.4694715142250061, + "learning_rate": 2.8086436781609198e-05, + "loss": 0.1581, + "step": 11417 + }, + { + "epoch": 39.37241379310345, + "grad_norm": 0.6577185392379761, + "learning_rate": 2.8085977011494253e-05, + "loss": 0.1464, + "step": 11418 + }, + { + "epoch": 39.37586206896552, + "grad_norm": 1.0235077142715454, + "learning_rate": 2.808551724137931e-05, + "loss": 0.1544, + "step": 11419 + }, + { + "epoch": 39.37931034482759, + "grad_norm": 1.173883318901062, + "learning_rate": 2.808505747126437e-05, + "loss": 0.1544, + "step": 11420 + }, + { + "epoch": 39.38275862068966, + "grad_norm": 0.7688548564910889, + "learning_rate": 2.8084597701149426e-05, + "loss": 0.1423, + "step": 11421 + }, + { + "epoch": 39.38620689655173, + "grad_norm": 0.7255688309669495, + "learning_rate": 2.8084137931034484e-05, + "loss": 0.1305, + "step": 11422 + }, + { + "epoch": 39.389655172413796, + "grad_norm": 1.0433151721954346, + "learning_rate": 2.8083678160919543e-05, + "loss": 0.1377, + "step": 11423 + }, + { + "epoch": 39.39310344827586, + "grad_norm": 0.4874139726161957, + "learning_rate": 2.8083218390804598e-05, + "loss": 0.1235, + "step": 11424 + }, + { + "epoch": 39.39655172413793, + "grad_norm": 0.7005029320716858, + "learning_rate": 2.8082758620689657e-05, + "loss": 0.1524, + "step": 11425 + }, + { + "epoch": 39.4, + "grad_norm": 0.920814573764801, + "learning_rate": 2.8082298850574712e-05, + "loss": 0.1193, + "step": 11426 + }, + { + "epoch": 39.40344827586207, + "grad_norm": 0.5916476249694824, + "learning_rate": 2.808183908045977e-05, + "loss": 0.1169, + "step": 11427 + }, + { + "epoch": 39.40689655172414, + "grad_norm": 1.0757548809051514, + "learning_rate": 2.808137931034483e-05, + "loss": 0.1067, + "step": 11428 + }, + { + "epoch": 39.41034482758621, + "grad_norm": 0.9174897074699402, + "learning_rate": 2.8080919540229885e-05, + "loss": 0.104, + "step": 11429 + }, + { + "epoch": 39.41379310344828, + "grad_norm": 0.9467368721961975, + "learning_rate": 2.8080459770114944e-05, + "loss": 0.1335, + "step": 11430 + }, + { + "epoch": 39.41724137931035, + "grad_norm": 0.710720419883728, + "learning_rate": 2.8080000000000002e-05, + "loss": 0.1208, + "step": 11431 + }, + { + "epoch": 39.42068965517242, + "grad_norm": 0.6420710682868958, + "learning_rate": 2.8079540229885057e-05, + "loss": 0.1098, + "step": 11432 + }, + { + "epoch": 39.42413793103448, + "grad_norm": 1.1723577976226807, + "learning_rate": 2.8079080459770116e-05, + "loss": 0.1107, + "step": 11433 + }, + { + "epoch": 39.42758620689655, + "grad_norm": 1.0184645652770996, + "learning_rate": 2.807862068965517e-05, + "loss": 0.1288, + "step": 11434 + }, + { + "epoch": 39.43103448275862, + "grad_norm": 2.0558419227600098, + "learning_rate": 2.8078160919540234e-05, + "loss": 0.1802, + "step": 11435 + }, + { + "epoch": 39.43448275862069, + "grad_norm": 0.6640236973762512, + "learning_rate": 2.807770114942529e-05, + "loss": 0.2107, + "step": 11436 + }, + { + "epoch": 39.43793103448276, + "grad_norm": 0.520627498626709, + "learning_rate": 2.8077241379310344e-05, + "loss": 0.1906, + "step": 11437 + }, + { + "epoch": 39.44137931034483, + "grad_norm": 0.627945065498352, + "learning_rate": 2.8076781609195403e-05, + "loss": 0.1929, + "step": 11438 + }, + { + "epoch": 39.4448275862069, + "grad_norm": 0.6437147855758667, + "learning_rate": 2.807632183908046e-05, + "loss": 0.1545, + "step": 11439 + }, + { + "epoch": 39.44827586206897, + "grad_norm": 0.6573736667633057, + "learning_rate": 2.807586206896552e-05, + "loss": 0.1771, + "step": 11440 + }, + { + "epoch": 39.45172413793104, + "grad_norm": 0.9056336283683777, + "learning_rate": 2.8075402298850575e-05, + "loss": 0.1801, + "step": 11441 + }, + { + "epoch": 39.4551724137931, + "grad_norm": 0.6939224600791931, + "learning_rate": 2.807494252873563e-05, + "loss": 0.1693, + "step": 11442 + }, + { + "epoch": 39.45862068965517, + "grad_norm": 0.6820400357246399, + "learning_rate": 2.8074482758620693e-05, + "loss": 0.1579, + "step": 11443 + }, + { + "epoch": 39.46206896551724, + "grad_norm": 0.7561465501785278, + "learning_rate": 2.8074022988505748e-05, + "loss": 0.1601, + "step": 11444 + }, + { + "epoch": 39.46551724137931, + "grad_norm": 0.6759411692619324, + "learning_rate": 2.8073563218390807e-05, + "loss": 0.1404, + "step": 11445 + }, + { + "epoch": 39.46896551724138, + "grad_norm": 0.7630171775817871, + "learning_rate": 2.8073103448275862e-05, + "loss": 0.1649, + "step": 11446 + }, + { + "epoch": 39.47241379310345, + "grad_norm": 0.9129201769828796, + "learning_rate": 2.8072643678160917e-05, + "loss": 0.151, + "step": 11447 + }, + { + "epoch": 39.47586206896552, + "grad_norm": 1.050868034362793, + "learning_rate": 2.807218390804598e-05, + "loss": 0.1401, + "step": 11448 + }, + { + "epoch": 39.47931034482759, + "grad_norm": 0.9194563031196594, + "learning_rate": 2.8071724137931035e-05, + "loss": 0.1595, + "step": 11449 + }, + { + "epoch": 39.48275862068966, + "grad_norm": 0.6321210861206055, + "learning_rate": 2.8071264367816093e-05, + "loss": 0.134, + "step": 11450 + }, + { + "epoch": 39.48620689655172, + "grad_norm": 5.3997721672058105, + "learning_rate": 2.807080459770115e-05, + "loss": 0.1179, + "step": 11451 + }, + { + "epoch": 39.48965517241379, + "grad_norm": 0.8643282651901245, + "learning_rate": 2.8070344827586207e-05, + "loss": 0.1468, + "step": 11452 + }, + { + "epoch": 39.49310344827586, + "grad_norm": 0.9580731391906738, + "learning_rate": 2.8069885057471266e-05, + "loss": 0.1253, + "step": 11453 + }, + { + "epoch": 39.49655172413793, + "grad_norm": 0.7964051961898804, + "learning_rate": 2.806942528735632e-05, + "loss": 0.1304, + "step": 11454 + }, + { + "epoch": 39.5, + "grad_norm": 1.4830408096313477, + "learning_rate": 2.8068965517241377e-05, + "loss": 0.1212, + "step": 11455 + }, + { + "epoch": 39.50344827586207, + "grad_norm": 0.704849362373352, + "learning_rate": 2.806850574712644e-05, + "loss": 0.1382, + "step": 11456 + }, + { + "epoch": 39.50689655172414, + "grad_norm": 0.8388257622718811, + "learning_rate": 2.8068045977011494e-05, + "loss": 0.1024, + "step": 11457 + }, + { + "epoch": 39.51034482758621, + "grad_norm": 1.078876256942749, + "learning_rate": 2.8067586206896553e-05, + "loss": 0.1232, + "step": 11458 + }, + { + "epoch": 39.51379310344828, + "grad_norm": 0.810330331325531, + "learning_rate": 2.8067126436781608e-05, + "loss": 0.134, + "step": 11459 + }, + { + "epoch": 39.51724137931034, + "grad_norm": 1.557500958442688, + "learning_rate": 2.8066666666666667e-05, + "loss": 0.1544, + "step": 11460 + }, + { + "epoch": 39.52068965517241, + "grad_norm": 0.7689520716667175, + "learning_rate": 2.8066206896551725e-05, + "loss": 0.2147, + "step": 11461 + }, + { + "epoch": 39.52413793103448, + "grad_norm": 0.766836404800415, + "learning_rate": 2.806574712643678e-05, + "loss": 0.1814, + "step": 11462 + }, + { + "epoch": 39.52758620689655, + "grad_norm": 0.7409343123435974, + "learning_rate": 2.806528735632184e-05, + "loss": 0.1986, + "step": 11463 + }, + { + "epoch": 39.53103448275862, + "grad_norm": 0.6706486940383911, + "learning_rate": 2.8064827586206898e-05, + "loss": 0.1728, + "step": 11464 + }, + { + "epoch": 39.53448275862069, + "grad_norm": 0.7322885394096375, + "learning_rate": 2.8064367816091953e-05, + "loss": 0.1596, + "step": 11465 + }, + { + "epoch": 39.53793103448276, + "grad_norm": 0.8368232846260071, + "learning_rate": 2.8063908045977012e-05, + "loss": 0.1491, + "step": 11466 + }, + { + "epoch": 39.54137931034483, + "grad_norm": 0.537189245223999, + "learning_rate": 2.8063448275862067e-05, + "loss": 0.1282, + "step": 11467 + }, + { + "epoch": 39.5448275862069, + "grad_norm": 0.7506726980209351, + "learning_rate": 2.806298850574713e-05, + "loss": 0.1538, + "step": 11468 + }, + { + "epoch": 39.54827586206896, + "grad_norm": 1.7588565349578857, + "learning_rate": 2.8062528735632185e-05, + "loss": 0.1761, + "step": 11469 + }, + { + "epoch": 39.55172413793103, + "grad_norm": 0.7439810633659363, + "learning_rate": 2.806206896551724e-05, + "loss": 0.1501, + "step": 11470 + }, + { + "epoch": 39.5551724137931, + "grad_norm": 1.13484787940979, + "learning_rate": 2.80616091954023e-05, + "loss": 0.1675, + "step": 11471 + }, + { + "epoch": 39.55862068965517, + "grad_norm": 0.8381078243255615, + "learning_rate": 2.8061149425287357e-05, + "loss": 0.1375, + "step": 11472 + }, + { + "epoch": 39.56206896551724, + "grad_norm": 0.8601353764533997, + "learning_rate": 2.8060689655172416e-05, + "loss": 0.127, + "step": 11473 + }, + { + "epoch": 39.56551724137931, + "grad_norm": 0.8690395951271057, + "learning_rate": 2.806022988505747e-05, + "loss": 0.1342, + "step": 11474 + }, + { + "epoch": 39.56896551724138, + "grad_norm": 0.6036515235900879, + "learning_rate": 2.8059770114942527e-05, + "loss": 0.1372, + "step": 11475 + }, + { + "epoch": 39.57241379310345, + "grad_norm": 0.6451120972633362, + "learning_rate": 2.805931034482759e-05, + "loss": 0.1363, + "step": 11476 + }, + { + "epoch": 39.57586206896552, + "grad_norm": 0.5364387631416321, + "learning_rate": 2.8058850574712644e-05, + "loss": 0.1125, + "step": 11477 + }, + { + "epoch": 39.57931034482758, + "grad_norm": 0.6387764811515808, + "learning_rate": 2.8058390804597703e-05, + "loss": 0.1368, + "step": 11478 + }, + { + "epoch": 39.58275862068965, + "grad_norm": 0.7233579158782959, + "learning_rate": 2.8057931034482758e-05, + "loss": 0.1279, + "step": 11479 + }, + { + "epoch": 39.58620689655172, + "grad_norm": 0.6326417326927185, + "learning_rate": 2.8057471264367817e-05, + "loss": 0.1115, + "step": 11480 + }, + { + "epoch": 39.58965517241379, + "grad_norm": 0.8463090062141418, + "learning_rate": 2.8057011494252875e-05, + "loss": 0.1636, + "step": 11481 + }, + { + "epoch": 39.59310344827586, + "grad_norm": 0.6153095364570618, + "learning_rate": 2.805655172413793e-05, + "loss": 0.1024, + "step": 11482 + }, + { + "epoch": 39.59655172413793, + "grad_norm": 0.5719016790390015, + "learning_rate": 2.805609195402299e-05, + "loss": 0.1013, + "step": 11483 + }, + { + "epoch": 39.6, + "grad_norm": 0.774377703666687, + "learning_rate": 2.8055632183908048e-05, + "loss": 0.1067, + "step": 11484 + }, + { + "epoch": 39.60344827586207, + "grad_norm": 1.4463481903076172, + "learning_rate": 2.8055172413793103e-05, + "loss": 0.1607, + "step": 11485 + }, + { + "epoch": 39.60689655172414, + "grad_norm": 1.2848585844039917, + "learning_rate": 2.8054712643678162e-05, + "loss": 0.2361, + "step": 11486 + }, + { + "epoch": 39.610344827586204, + "grad_norm": 0.6632060408592224, + "learning_rate": 2.8054252873563217e-05, + "loss": 0.189, + "step": 11487 + }, + { + "epoch": 39.61379310344827, + "grad_norm": 0.523815929889679, + "learning_rate": 2.8053793103448276e-05, + "loss": 0.1843, + "step": 11488 + }, + { + "epoch": 39.61724137931034, + "grad_norm": 0.5185151100158691, + "learning_rate": 2.8053333333333335e-05, + "loss": 0.1866, + "step": 11489 + }, + { + "epoch": 39.62068965517241, + "grad_norm": 0.7114824056625366, + "learning_rate": 2.805287356321839e-05, + "loss": 0.164, + "step": 11490 + }, + { + "epoch": 39.62413793103448, + "grad_norm": 0.8949483633041382, + "learning_rate": 2.805241379310345e-05, + "loss": 0.1449, + "step": 11491 + }, + { + "epoch": 39.62758620689655, + "grad_norm": 0.8068279027938843, + "learning_rate": 2.8051954022988507e-05, + "loss": 0.1588, + "step": 11492 + }, + { + "epoch": 39.63103448275862, + "grad_norm": 0.6915461421012878, + "learning_rate": 2.8051494252873563e-05, + "loss": 0.1476, + "step": 11493 + }, + { + "epoch": 39.63448275862069, + "grad_norm": 1.9260233640670776, + "learning_rate": 2.805103448275862e-05, + "loss": 0.133, + "step": 11494 + }, + { + "epoch": 39.63793103448276, + "grad_norm": 0.8284426331520081, + "learning_rate": 2.8050574712643677e-05, + "loss": 0.1522, + "step": 11495 + }, + { + "epoch": 39.641379310344824, + "grad_norm": 0.6056115031242371, + "learning_rate": 2.805011494252874e-05, + "loss": 0.1417, + "step": 11496 + }, + { + "epoch": 39.644827586206894, + "grad_norm": 0.6201438903808594, + "learning_rate": 2.8049655172413794e-05, + "loss": 0.1842, + "step": 11497 + }, + { + "epoch": 39.648275862068964, + "grad_norm": 0.7673490643501282, + "learning_rate": 2.804919540229885e-05, + "loss": 0.1357, + "step": 11498 + }, + { + "epoch": 39.65172413793103, + "grad_norm": 0.7154965996742249, + "learning_rate": 2.8048735632183908e-05, + "loss": 0.1425, + "step": 11499 + }, + { + "epoch": 39.6551724137931, + "grad_norm": 1.130608320236206, + "learning_rate": 2.8048275862068967e-05, + "loss": 0.1604, + "step": 11500 + }, + { + "epoch": 39.65862068965517, + "grad_norm": 1.0151304006576538, + "learning_rate": 2.8047816091954025e-05, + "loss": 0.142, + "step": 11501 + }, + { + "epoch": 39.66206896551724, + "grad_norm": 0.6416845917701721, + "learning_rate": 2.804735632183908e-05, + "loss": 0.1101, + "step": 11502 + }, + { + "epoch": 39.66551724137931, + "grad_norm": 0.7699695825576782, + "learning_rate": 2.8046896551724136e-05, + "loss": 0.1133, + "step": 11503 + }, + { + "epoch": 39.66896551724138, + "grad_norm": 0.8104886412620544, + "learning_rate": 2.8046436781609198e-05, + "loss": 0.1232, + "step": 11504 + }, + { + "epoch": 39.672413793103445, + "grad_norm": 0.553966760635376, + "learning_rate": 2.8045977011494253e-05, + "loss": 0.1005, + "step": 11505 + }, + { + "epoch": 39.675862068965515, + "grad_norm": 1.0513066053390503, + "learning_rate": 2.8045517241379312e-05, + "loss": 0.1107, + "step": 11506 + }, + { + "epoch": 39.679310344827584, + "grad_norm": 0.8021120429039001, + "learning_rate": 2.8045057471264367e-05, + "loss": 0.1073, + "step": 11507 + }, + { + "epoch": 39.682758620689654, + "grad_norm": 0.7450590133666992, + "learning_rate": 2.8044597701149426e-05, + "loss": 0.109, + "step": 11508 + }, + { + "epoch": 39.686206896551724, + "grad_norm": 1.2773017883300781, + "learning_rate": 2.8044137931034485e-05, + "loss": 0.1083, + "step": 11509 + }, + { + "epoch": 39.689655172413794, + "grad_norm": 1.550302267074585, + "learning_rate": 2.804367816091954e-05, + "loss": 0.1349, + "step": 11510 + }, + { + "epoch": 39.69310344827586, + "grad_norm": 0.984873354434967, + "learning_rate": 2.80432183908046e-05, + "loss": 0.2145, + "step": 11511 + }, + { + "epoch": 39.69655172413793, + "grad_norm": 0.6027946472167969, + "learning_rate": 2.8042758620689657e-05, + "loss": 0.1714, + "step": 11512 + }, + { + "epoch": 39.7, + "grad_norm": 0.580236554145813, + "learning_rate": 2.8042298850574713e-05, + "loss": 0.1769, + "step": 11513 + }, + { + "epoch": 39.703448275862065, + "grad_norm": 0.5701853036880493, + "learning_rate": 2.804183908045977e-05, + "loss": 0.1754, + "step": 11514 + }, + { + "epoch": 39.706896551724135, + "grad_norm": 0.6304711699485779, + "learning_rate": 2.8041379310344827e-05, + "loss": 0.2023, + "step": 11515 + }, + { + "epoch": 39.710344827586205, + "grad_norm": 0.8770312666893005, + "learning_rate": 2.8040919540229885e-05, + "loss": 0.1734, + "step": 11516 + }, + { + "epoch": 39.713793103448275, + "grad_norm": 1.2147611379623413, + "learning_rate": 2.8040459770114944e-05, + "loss": 0.1856, + "step": 11517 + }, + { + "epoch": 39.717241379310344, + "grad_norm": 2.212428331375122, + "learning_rate": 2.804e-05, + "loss": 0.1781, + "step": 11518 + }, + { + "epoch": 39.720689655172414, + "grad_norm": 0.8960394859313965, + "learning_rate": 2.8039540229885058e-05, + "loss": 0.1572, + "step": 11519 + }, + { + "epoch": 39.724137931034484, + "grad_norm": 0.5783913731575012, + "learning_rate": 2.8039080459770117e-05, + "loss": 0.1532, + "step": 11520 + }, + { + "epoch": 39.727586206896554, + "grad_norm": 0.7316312193870544, + "learning_rate": 2.8038620689655172e-05, + "loss": 0.1415, + "step": 11521 + }, + { + "epoch": 39.73103448275862, + "grad_norm": 0.8523666858673096, + "learning_rate": 2.803816091954023e-05, + "loss": 0.1455, + "step": 11522 + }, + { + "epoch": 39.734482758620686, + "grad_norm": 0.8479415774345398, + "learning_rate": 2.8037701149425286e-05, + "loss": 0.1312, + "step": 11523 + }, + { + "epoch": 39.737931034482756, + "grad_norm": 2.0010015964508057, + "learning_rate": 2.8037241379310348e-05, + "loss": 0.1433, + "step": 11524 + }, + { + "epoch": 39.741379310344826, + "grad_norm": 0.8007112145423889, + "learning_rate": 2.8036781609195403e-05, + "loss": 0.1605, + "step": 11525 + }, + { + "epoch": 39.744827586206895, + "grad_norm": 0.9121559858322144, + "learning_rate": 2.803632183908046e-05, + "loss": 0.1324, + "step": 11526 + }, + { + "epoch": 39.748275862068965, + "grad_norm": 0.8926279544830322, + "learning_rate": 2.8035862068965517e-05, + "loss": 0.125, + "step": 11527 + }, + { + "epoch": 39.751724137931035, + "grad_norm": 0.7816924452781677, + "learning_rate": 2.8035402298850576e-05, + "loss": 0.1186, + "step": 11528 + }, + { + "epoch": 39.755172413793105, + "grad_norm": 1.2233219146728516, + "learning_rate": 2.8034942528735635e-05, + "loss": 0.1261, + "step": 11529 + }, + { + "epoch": 39.758620689655174, + "grad_norm": 0.7805418968200684, + "learning_rate": 2.803448275862069e-05, + "loss": 0.1014, + "step": 11530 + }, + { + "epoch": 39.762068965517244, + "grad_norm": 1.45162034034729, + "learning_rate": 2.8034022988505745e-05, + "loss": 0.1391, + "step": 11531 + }, + { + "epoch": 39.765517241379314, + "grad_norm": 1.2744754552841187, + "learning_rate": 2.8033563218390807e-05, + "loss": 0.1247, + "step": 11532 + }, + { + "epoch": 39.76896551724138, + "grad_norm": 1.2398245334625244, + "learning_rate": 2.8033103448275863e-05, + "loss": 0.132, + "step": 11533 + }, + { + "epoch": 39.772413793103446, + "grad_norm": 1.2867166996002197, + "learning_rate": 2.803264367816092e-05, + "loss": 0.1537, + "step": 11534 + }, + { + "epoch": 39.775862068965516, + "grad_norm": 0.988305926322937, + "learning_rate": 2.8032183908045976e-05, + "loss": 0.1809, + "step": 11535 + }, + { + "epoch": 39.779310344827586, + "grad_norm": 2.060601234436035, + "learning_rate": 2.8031724137931035e-05, + "loss": 0.2203, + "step": 11536 + }, + { + "epoch": 39.782758620689656, + "grad_norm": 0.9540982246398926, + "learning_rate": 2.8031264367816094e-05, + "loss": 0.1739, + "step": 11537 + }, + { + "epoch": 39.786206896551725, + "grad_norm": 0.5264396667480469, + "learning_rate": 2.803080459770115e-05, + "loss": 0.2009, + "step": 11538 + }, + { + "epoch": 39.789655172413795, + "grad_norm": 0.5899782776832581, + "learning_rate": 2.8030344827586208e-05, + "loss": 0.1785, + "step": 11539 + }, + { + "epoch": 39.793103448275865, + "grad_norm": 1.5404542684555054, + "learning_rate": 2.8029885057471267e-05, + "loss": 0.1523, + "step": 11540 + }, + { + "epoch": 39.796551724137935, + "grad_norm": 0.8499155044555664, + "learning_rate": 2.8029425287356322e-05, + "loss": 0.1857, + "step": 11541 + }, + { + "epoch": 39.8, + "grad_norm": 0.8888062238693237, + "learning_rate": 2.802896551724138e-05, + "loss": 0.1869, + "step": 11542 + }, + { + "epoch": 39.80344827586207, + "grad_norm": 0.5029839277267456, + "learning_rate": 2.8028505747126436e-05, + "loss": 0.1654, + "step": 11543 + }, + { + "epoch": 39.80689655172414, + "grad_norm": 0.7781130075454712, + "learning_rate": 2.8028045977011494e-05, + "loss": 0.1656, + "step": 11544 + }, + { + "epoch": 39.810344827586206, + "grad_norm": 0.8604413866996765, + "learning_rate": 2.8027586206896553e-05, + "loss": 0.1457, + "step": 11545 + }, + { + "epoch": 39.813793103448276, + "grad_norm": 1.0199977159500122, + "learning_rate": 2.802712643678161e-05, + "loss": 0.1512, + "step": 11546 + }, + { + "epoch": 39.817241379310346, + "grad_norm": 0.7425589561462402, + "learning_rate": 2.8026666666666667e-05, + "loss": 0.1566, + "step": 11547 + }, + { + "epoch": 39.820689655172416, + "grad_norm": 0.622673511505127, + "learning_rate": 2.8026206896551726e-05, + "loss": 0.1635, + "step": 11548 + }, + { + "epoch": 39.824137931034485, + "grad_norm": 0.5950734615325928, + "learning_rate": 2.802574712643678e-05, + "loss": 0.1699, + "step": 11549 + }, + { + "epoch": 39.827586206896555, + "grad_norm": 0.6048804521560669, + "learning_rate": 2.802528735632184e-05, + "loss": 0.1232, + "step": 11550 + }, + { + "epoch": 39.83103448275862, + "grad_norm": 0.9689356088638306, + "learning_rate": 2.8024827586206895e-05, + "loss": 0.1377, + "step": 11551 + }, + { + "epoch": 39.83448275862069, + "grad_norm": 0.9682617783546448, + "learning_rate": 2.8024367816091957e-05, + "loss": 0.1387, + "step": 11552 + }, + { + "epoch": 39.83793103448276, + "grad_norm": 1.2814329862594604, + "learning_rate": 2.8023908045977012e-05, + "loss": 0.1251, + "step": 11553 + }, + { + "epoch": 39.84137931034483, + "grad_norm": 0.9389997124671936, + "learning_rate": 2.8023448275862068e-05, + "loss": 0.1181, + "step": 11554 + }, + { + "epoch": 39.8448275862069, + "grad_norm": 0.8210214376449585, + "learning_rate": 2.8022988505747126e-05, + "loss": 0.1261, + "step": 11555 + }, + { + "epoch": 39.84827586206897, + "grad_norm": 0.7662495970726013, + "learning_rate": 2.8022528735632185e-05, + "loss": 0.1318, + "step": 11556 + }, + { + "epoch": 39.851724137931036, + "grad_norm": 0.8982017636299133, + "learning_rate": 2.8022068965517244e-05, + "loss": 0.0962, + "step": 11557 + }, + { + "epoch": 39.855172413793106, + "grad_norm": 1.0896496772766113, + "learning_rate": 2.80216091954023e-05, + "loss": 0.1334, + "step": 11558 + }, + { + "epoch": 39.858620689655176, + "grad_norm": 0.8507948517799377, + "learning_rate": 2.8021149425287354e-05, + "loss": 0.1229, + "step": 11559 + }, + { + "epoch": 39.86206896551724, + "grad_norm": 1.1775119304656982, + "learning_rate": 2.8020689655172416e-05, + "loss": 0.15, + "step": 11560 + }, + { + "epoch": 39.86551724137931, + "grad_norm": 1.5011897087097168, + "learning_rate": 2.8020229885057472e-05, + "loss": 0.2115, + "step": 11561 + }, + { + "epoch": 39.86896551724138, + "grad_norm": 0.7803739309310913, + "learning_rate": 2.801977011494253e-05, + "loss": 0.201, + "step": 11562 + }, + { + "epoch": 39.87241379310345, + "grad_norm": 0.5539829730987549, + "learning_rate": 2.8019310344827586e-05, + "loss": 0.1819, + "step": 11563 + }, + { + "epoch": 39.87586206896552, + "grad_norm": 0.5139386653900146, + "learning_rate": 2.8018850574712644e-05, + "loss": 0.1932, + "step": 11564 + }, + { + "epoch": 39.87931034482759, + "grad_norm": 1.0221246480941772, + "learning_rate": 2.8018390804597703e-05, + "loss": 0.162, + "step": 11565 + }, + { + "epoch": 39.88275862068966, + "grad_norm": 0.8142642974853516, + "learning_rate": 2.801793103448276e-05, + "loss": 0.1623, + "step": 11566 + }, + { + "epoch": 39.88620689655173, + "grad_norm": 1.0034339427947998, + "learning_rate": 2.8017471264367817e-05, + "loss": 0.1818, + "step": 11567 + }, + { + "epoch": 39.889655172413796, + "grad_norm": 0.7470972537994385, + "learning_rate": 2.8017011494252876e-05, + "loss": 0.1666, + "step": 11568 + }, + { + "epoch": 39.89310344827586, + "grad_norm": 0.6089482307434082, + "learning_rate": 2.801655172413793e-05, + "loss": 0.1771, + "step": 11569 + }, + { + "epoch": 39.89655172413793, + "grad_norm": 0.8579623699188232, + "learning_rate": 2.801609195402299e-05, + "loss": 0.1618, + "step": 11570 + }, + { + "epoch": 39.9, + "grad_norm": 0.6752385497093201, + "learning_rate": 2.8015632183908045e-05, + "loss": 0.1445, + "step": 11571 + }, + { + "epoch": 39.90344827586207, + "grad_norm": 0.662358283996582, + "learning_rate": 2.8015172413793104e-05, + "loss": 0.1567, + "step": 11572 + }, + { + "epoch": 39.90689655172414, + "grad_norm": 0.7682713866233826, + "learning_rate": 2.8014712643678162e-05, + "loss": 0.1512, + "step": 11573 + }, + { + "epoch": 39.91034482758621, + "grad_norm": 0.8414244651794434, + "learning_rate": 2.8014252873563218e-05, + "loss": 0.1561, + "step": 11574 + }, + { + "epoch": 39.91379310344828, + "grad_norm": 0.9113763570785522, + "learning_rate": 2.8013793103448276e-05, + "loss": 0.1399, + "step": 11575 + }, + { + "epoch": 39.91724137931035, + "grad_norm": 0.8288007378578186, + "learning_rate": 2.8013333333333335e-05, + "loss": 0.1484, + "step": 11576 + }, + { + "epoch": 39.92068965517242, + "grad_norm": 1.0269458293914795, + "learning_rate": 2.801287356321839e-05, + "loss": 0.1322, + "step": 11577 + }, + { + "epoch": 39.92413793103448, + "grad_norm": 0.6786368489265442, + "learning_rate": 2.801241379310345e-05, + "loss": 0.1202, + "step": 11578 + }, + { + "epoch": 39.92758620689655, + "grad_norm": 1.5795526504516602, + "learning_rate": 2.8011954022988504e-05, + "loss": 0.1332, + "step": 11579 + }, + { + "epoch": 39.93103448275862, + "grad_norm": 0.8821081519126892, + "learning_rate": 2.8011494252873566e-05, + "loss": 0.1117, + "step": 11580 + }, + { + "epoch": 39.93448275862069, + "grad_norm": 0.7510960102081299, + "learning_rate": 2.801103448275862e-05, + "loss": 0.121, + "step": 11581 + }, + { + "epoch": 39.93793103448276, + "grad_norm": 0.9274259209632874, + "learning_rate": 2.8010574712643677e-05, + "loss": 0.1234, + "step": 11582 + }, + { + "epoch": 39.94137931034483, + "grad_norm": 0.8250483274459839, + "learning_rate": 2.8010114942528736e-05, + "loss": 0.114, + "step": 11583 + }, + { + "epoch": 39.9448275862069, + "grad_norm": 0.865753173828125, + "learning_rate": 2.8009655172413794e-05, + "loss": 0.1243, + "step": 11584 + }, + { + "epoch": 39.94827586206897, + "grad_norm": 1.1547582149505615, + "learning_rate": 2.8009195402298853e-05, + "loss": 0.1898, + "step": 11585 + }, + { + "epoch": 39.95172413793104, + "grad_norm": 1.0319417715072632, + "learning_rate": 2.800873563218391e-05, + "loss": 0.2102, + "step": 11586 + }, + { + "epoch": 39.9551724137931, + "grad_norm": 0.5368159413337708, + "learning_rate": 2.8008275862068964e-05, + "loss": 0.1818, + "step": 11587 + }, + { + "epoch": 39.95862068965517, + "grad_norm": 0.5381379723548889, + "learning_rate": 2.8007816091954026e-05, + "loss": 0.1835, + "step": 11588 + }, + { + "epoch": 39.96206896551724, + "grad_norm": 1.0764422416687012, + "learning_rate": 2.800735632183908e-05, + "loss": 0.172, + "step": 11589 + }, + { + "epoch": 39.96551724137931, + "grad_norm": 0.5636780858039856, + "learning_rate": 2.800689655172414e-05, + "loss": 0.1434, + "step": 11590 + }, + { + "epoch": 39.96896551724138, + "grad_norm": 1.5343387126922607, + "learning_rate": 2.8006436781609195e-05, + "loss": 0.1636, + "step": 11591 + }, + { + "epoch": 39.97241379310345, + "grad_norm": 0.7999375462532043, + "learning_rate": 2.8005977011494254e-05, + "loss": 0.1614, + "step": 11592 + }, + { + "epoch": 39.97586206896552, + "grad_norm": 1.0915523767471313, + "learning_rate": 2.8005517241379312e-05, + "loss": 0.1361, + "step": 11593 + }, + { + "epoch": 39.97931034482759, + "grad_norm": 0.6097292304039001, + "learning_rate": 2.8005057471264368e-05, + "loss": 0.1432, + "step": 11594 + }, + { + "epoch": 39.98275862068966, + "grad_norm": 0.9179913997650146, + "learning_rate": 2.8004597701149426e-05, + "loss": 0.1488, + "step": 11595 + }, + { + "epoch": 39.98620689655172, + "grad_norm": 0.8879362940788269, + "learning_rate": 2.8004137931034485e-05, + "loss": 0.1519, + "step": 11596 + }, + { + "epoch": 39.98965517241379, + "grad_norm": 0.8876436948776245, + "learning_rate": 2.800367816091954e-05, + "loss": 0.1216, + "step": 11597 + }, + { + "epoch": 39.99310344827586, + "grad_norm": 1.6118890047073364, + "learning_rate": 2.80032183908046e-05, + "loss": 0.116, + "step": 11598 + }, + { + "epoch": 39.99655172413793, + "grad_norm": 1.871911644935608, + "learning_rate": 2.8002758620689654e-05, + "loss": 0.1281, + "step": 11599 + }, + { + "epoch": 40.0, + "grad_norm": 0.9541428089141846, + "learning_rate": 2.8002298850574713e-05, + "loss": 0.2223, + "step": 11600 + }, + { + "epoch": 40.00344827586207, + "grad_norm": 0.5181897282600403, + "learning_rate": 2.800183908045977e-05, + "loss": 0.2095, + "step": 11601 + }, + { + "epoch": 40.00689655172414, + "grad_norm": 0.5921854972839355, + "learning_rate": 2.8001379310344827e-05, + "loss": 0.1867, + "step": 11602 + }, + { + "epoch": 40.01034482758621, + "grad_norm": 0.48063069581985474, + "learning_rate": 2.8000919540229886e-05, + "loss": 0.1516, + "step": 11603 + }, + { + "epoch": 40.01379310344828, + "grad_norm": 0.5444591641426086, + "learning_rate": 2.8000459770114944e-05, + "loss": 0.1567, + "step": 11604 + }, + { + "epoch": 40.01724137931034, + "grad_norm": 1.1245945692062378, + "learning_rate": 2.8e-05, + "loss": 0.1698, + "step": 11605 + }, + { + "epoch": 40.02068965517241, + "grad_norm": 0.5311099886894226, + "learning_rate": 2.7999540229885058e-05, + "loss": 0.1673, + "step": 11606 + }, + { + "epoch": 40.02413793103448, + "grad_norm": 0.8394598960876465, + "learning_rate": 2.7999080459770114e-05, + "loss": 0.1573, + "step": 11607 + }, + { + "epoch": 40.02758620689655, + "grad_norm": 0.6501821279525757, + "learning_rate": 2.7998620689655176e-05, + "loss": 0.1354, + "step": 11608 + }, + { + "epoch": 40.03103448275862, + "grad_norm": 0.5301545858383179, + "learning_rate": 2.799816091954023e-05, + "loss": 0.1545, + "step": 11609 + }, + { + "epoch": 40.03448275862069, + "grad_norm": 0.547879695892334, + "learning_rate": 2.7997701149425286e-05, + "loss": 0.1451, + "step": 11610 + }, + { + "epoch": 40.03793103448276, + "grad_norm": 0.5859290361404419, + "learning_rate": 2.7997241379310345e-05, + "loss": 0.135, + "step": 11611 + }, + { + "epoch": 40.04137931034483, + "grad_norm": 0.8631305694580078, + "learning_rate": 2.7996781609195404e-05, + "loss": 0.1417, + "step": 11612 + }, + { + "epoch": 40.0448275862069, + "grad_norm": 0.6690996885299683, + "learning_rate": 2.7996321839080462e-05, + "loss": 0.1219, + "step": 11613 + }, + { + "epoch": 40.04827586206896, + "grad_norm": 0.5393000841140747, + "learning_rate": 2.7995862068965518e-05, + "loss": 0.1277, + "step": 11614 + }, + { + "epoch": 40.05172413793103, + "grad_norm": 0.7986444234848022, + "learning_rate": 2.7995402298850573e-05, + "loss": 0.121, + "step": 11615 + }, + { + "epoch": 40.0551724137931, + "grad_norm": 0.9650202989578247, + "learning_rate": 2.7994942528735635e-05, + "loss": 0.1264, + "step": 11616 + }, + { + "epoch": 40.05862068965517, + "grad_norm": 1.3113383054733276, + "learning_rate": 2.799448275862069e-05, + "loss": 0.1108, + "step": 11617 + }, + { + "epoch": 40.06206896551724, + "grad_norm": 1.6393826007843018, + "learning_rate": 2.799402298850575e-05, + "loss": 0.1163, + "step": 11618 + }, + { + "epoch": 40.06551724137931, + "grad_norm": 0.690750002861023, + "learning_rate": 2.7993563218390804e-05, + "loss": 0.1079, + "step": 11619 + }, + { + "epoch": 40.06896551724138, + "grad_norm": 0.5626112222671509, + "learning_rate": 2.7993103448275863e-05, + "loss": 0.1003, + "step": 11620 + }, + { + "epoch": 40.07241379310345, + "grad_norm": 0.7423434257507324, + "learning_rate": 2.799264367816092e-05, + "loss": 0.117, + "step": 11621 + }, + { + "epoch": 40.07586206896552, + "grad_norm": 0.6133210062980652, + "learning_rate": 2.7992183908045977e-05, + "loss": 0.0907, + "step": 11622 + }, + { + "epoch": 40.07931034482758, + "grad_norm": 0.8096251487731934, + "learning_rate": 2.7991724137931036e-05, + "loss": 0.0993, + "step": 11623 + }, + { + "epoch": 40.08275862068965, + "grad_norm": 1.315021276473999, + "learning_rate": 2.7991264367816094e-05, + "loss": 0.1343, + "step": 11624 + }, + { + "epoch": 40.08620689655172, + "grad_norm": 1.6839079856872559, + "learning_rate": 2.799080459770115e-05, + "loss": 0.1622, + "step": 11625 + }, + { + "epoch": 40.08965517241379, + "grad_norm": 0.5504493713378906, + "learning_rate": 2.7990344827586208e-05, + "loss": 0.1995, + "step": 11626 + }, + { + "epoch": 40.09310344827586, + "grad_norm": 0.6462870836257935, + "learning_rate": 2.7989885057471263e-05, + "loss": 0.1762, + "step": 11627 + }, + { + "epoch": 40.09655172413793, + "grad_norm": 0.6583945751190186, + "learning_rate": 2.7989425287356326e-05, + "loss": 0.1676, + "step": 11628 + }, + { + "epoch": 40.1, + "grad_norm": 0.4450218975543976, + "learning_rate": 2.798896551724138e-05, + "loss": 0.1519, + "step": 11629 + }, + { + "epoch": 40.10344827586207, + "grad_norm": 0.5877785682678223, + "learning_rate": 2.7988505747126436e-05, + "loss": 0.1409, + "step": 11630 + }, + { + "epoch": 40.10689655172414, + "grad_norm": 0.7537900805473328, + "learning_rate": 2.7988045977011495e-05, + "loss": 0.1532, + "step": 11631 + }, + { + "epoch": 40.110344827586204, + "grad_norm": 0.9696903228759766, + "learning_rate": 2.7987586206896554e-05, + "loss": 0.1627, + "step": 11632 + }, + { + "epoch": 40.11379310344827, + "grad_norm": 0.7010102868080139, + "learning_rate": 2.798712643678161e-05, + "loss": 0.1424, + "step": 11633 + }, + { + "epoch": 40.11724137931034, + "grad_norm": 0.8179371356964111, + "learning_rate": 2.7986666666666668e-05, + "loss": 0.1398, + "step": 11634 + }, + { + "epoch": 40.12068965517241, + "grad_norm": 0.5714012980461121, + "learning_rate": 2.7986206896551723e-05, + "loss": 0.1534, + "step": 11635 + }, + { + "epoch": 40.12413793103448, + "grad_norm": 0.6157788038253784, + "learning_rate": 2.7985747126436785e-05, + "loss": 0.1157, + "step": 11636 + }, + { + "epoch": 40.12758620689655, + "grad_norm": 0.6693355441093445, + "learning_rate": 2.798528735632184e-05, + "loss": 0.1279, + "step": 11637 + }, + { + "epoch": 40.13103448275862, + "grad_norm": 0.7183413505554199, + "learning_rate": 2.7984827586206895e-05, + "loss": 0.1077, + "step": 11638 + }, + { + "epoch": 40.13448275862069, + "grad_norm": 0.5054282546043396, + "learning_rate": 2.7984367816091954e-05, + "loss": 0.1306, + "step": 11639 + }, + { + "epoch": 40.13793103448276, + "grad_norm": 0.5936710834503174, + "learning_rate": 2.7983908045977013e-05, + "loss": 0.1341, + "step": 11640 + }, + { + "epoch": 40.141379310344824, + "grad_norm": 0.6644858121871948, + "learning_rate": 2.798344827586207e-05, + "loss": 0.1316, + "step": 11641 + }, + { + "epoch": 40.144827586206894, + "grad_norm": 0.8058156967163086, + "learning_rate": 2.7982988505747127e-05, + "loss": 0.1149, + "step": 11642 + }, + { + "epoch": 40.148275862068964, + "grad_norm": 1.2202357053756714, + "learning_rate": 2.7982528735632182e-05, + "loss": 0.1192, + "step": 11643 + }, + { + "epoch": 40.15172413793103, + "grad_norm": 0.7190808653831482, + "learning_rate": 2.7982068965517244e-05, + "loss": 0.1127, + "step": 11644 + }, + { + "epoch": 40.1551724137931, + "grad_norm": 1.0018106698989868, + "learning_rate": 2.79816091954023e-05, + "loss": 0.0967, + "step": 11645 + }, + { + "epoch": 40.15862068965517, + "grad_norm": 0.6785632967948914, + "learning_rate": 2.7981149425287358e-05, + "loss": 0.1221, + "step": 11646 + }, + { + "epoch": 40.16206896551724, + "grad_norm": 0.8175433278083801, + "learning_rate": 2.7980689655172413e-05, + "loss": 0.1115, + "step": 11647 + }, + { + "epoch": 40.16551724137931, + "grad_norm": 1.1953258514404297, + "learning_rate": 2.7980229885057472e-05, + "loss": 0.0901, + "step": 11648 + }, + { + "epoch": 40.16896551724138, + "grad_norm": 0.7493051290512085, + "learning_rate": 2.797977011494253e-05, + "loss": 0.1049, + "step": 11649 + }, + { + "epoch": 40.172413793103445, + "grad_norm": 0.9021804332733154, + "learning_rate": 2.7979310344827586e-05, + "loss": 0.1565, + "step": 11650 + }, + { + "epoch": 40.175862068965515, + "grad_norm": 0.6075196266174316, + "learning_rate": 2.7978850574712645e-05, + "loss": 0.221, + "step": 11651 + }, + { + "epoch": 40.179310344827584, + "grad_norm": 0.6591607332229614, + "learning_rate": 2.7978390804597703e-05, + "loss": 0.1739, + "step": 11652 + }, + { + "epoch": 40.182758620689654, + "grad_norm": 0.555267333984375, + "learning_rate": 2.797793103448276e-05, + "loss": 0.1842, + "step": 11653 + }, + { + "epoch": 40.186206896551724, + "grad_norm": 0.9013941884040833, + "learning_rate": 2.7977471264367817e-05, + "loss": 0.1607, + "step": 11654 + }, + { + "epoch": 40.189655172413794, + "grad_norm": 0.6267364025115967, + "learning_rate": 2.7977011494252873e-05, + "loss": 0.1621, + "step": 11655 + }, + { + "epoch": 40.19310344827586, + "grad_norm": 0.5677948594093323, + "learning_rate": 2.7976551724137935e-05, + "loss": 0.1452, + "step": 11656 + }, + { + "epoch": 40.19655172413793, + "grad_norm": 0.502986490726471, + "learning_rate": 2.797609195402299e-05, + "loss": 0.1669, + "step": 11657 + }, + { + "epoch": 40.2, + "grad_norm": 0.6021277904510498, + "learning_rate": 2.7975632183908045e-05, + "loss": 0.1313, + "step": 11658 + }, + { + "epoch": 40.203448275862065, + "grad_norm": 0.6180835962295532, + "learning_rate": 2.7975172413793104e-05, + "loss": 0.1562, + "step": 11659 + }, + { + "epoch": 40.206896551724135, + "grad_norm": 1.2324230670928955, + "learning_rate": 2.7974712643678163e-05, + "loss": 0.1376, + "step": 11660 + }, + { + "epoch": 40.210344827586205, + "grad_norm": 0.6263248920440674, + "learning_rate": 2.7974252873563218e-05, + "loss": 0.1569, + "step": 11661 + }, + { + "epoch": 40.213793103448275, + "grad_norm": 0.7797892093658447, + "learning_rate": 2.7973793103448277e-05, + "loss": 0.1315, + "step": 11662 + }, + { + "epoch": 40.217241379310344, + "grad_norm": 0.5494678616523743, + "learning_rate": 2.7973333333333332e-05, + "loss": 0.1314, + "step": 11663 + }, + { + "epoch": 40.220689655172414, + "grad_norm": 0.5808693766593933, + "learning_rate": 2.7972873563218394e-05, + "loss": 0.1028, + "step": 11664 + }, + { + "epoch": 40.224137931034484, + "grad_norm": 0.5643265247344971, + "learning_rate": 2.797241379310345e-05, + "loss": 0.1202, + "step": 11665 + }, + { + "epoch": 40.227586206896554, + "grad_norm": 0.7029027342796326, + "learning_rate": 2.7971954022988505e-05, + "loss": 0.1092, + "step": 11666 + }, + { + "epoch": 40.23103448275862, + "grad_norm": 0.7036227583885193, + "learning_rate": 2.7971494252873563e-05, + "loss": 0.1246, + "step": 11667 + }, + { + "epoch": 40.234482758620686, + "grad_norm": 0.5897785425186157, + "learning_rate": 2.7971034482758622e-05, + "loss": 0.1088, + "step": 11668 + }, + { + "epoch": 40.237931034482756, + "grad_norm": 0.7039844393730164, + "learning_rate": 2.797057471264368e-05, + "loss": 0.1092, + "step": 11669 + }, + { + "epoch": 40.241379310344826, + "grad_norm": 1.0365760326385498, + "learning_rate": 2.7970114942528736e-05, + "loss": 0.111, + "step": 11670 + }, + { + "epoch": 40.244827586206895, + "grad_norm": 1.865059733390808, + "learning_rate": 2.796965517241379e-05, + "loss": 0.1308, + "step": 11671 + }, + { + "epoch": 40.248275862068965, + "grad_norm": 0.6405381560325623, + "learning_rate": 2.7969195402298853e-05, + "loss": 0.0902, + "step": 11672 + }, + { + "epoch": 40.251724137931035, + "grad_norm": 2.853517532348633, + "learning_rate": 2.796873563218391e-05, + "loss": 0.092, + "step": 11673 + }, + { + "epoch": 40.255172413793105, + "grad_norm": 1.1927621364593506, + "learning_rate": 2.7968275862068967e-05, + "loss": 0.1035, + "step": 11674 + }, + { + "epoch": 40.258620689655174, + "grad_norm": 1.0837582349777222, + "learning_rate": 2.7967816091954023e-05, + "loss": 0.1199, + "step": 11675 + }, + { + "epoch": 40.262068965517244, + "grad_norm": 0.8633678555488586, + "learning_rate": 2.796735632183908e-05, + "loss": 0.227, + "step": 11676 + }, + { + "epoch": 40.265517241379314, + "grad_norm": 0.7727140784263611, + "learning_rate": 2.796689655172414e-05, + "loss": 0.165, + "step": 11677 + }, + { + "epoch": 40.26896551724138, + "grad_norm": 0.794918954372406, + "learning_rate": 2.7966436781609195e-05, + "loss": 0.1749, + "step": 11678 + }, + { + "epoch": 40.272413793103446, + "grad_norm": 0.49571534991264343, + "learning_rate": 2.7965977011494254e-05, + "loss": 0.1662, + "step": 11679 + }, + { + "epoch": 40.275862068965516, + "grad_norm": 0.8848534822463989, + "learning_rate": 2.7965517241379313e-05, + "loss": 0.148, + "step": 11680 + }, + { + "epoch": 40.279310344827586, + "grad_norm": 0.9959067702293396, + "learning_rate": 2.7965057471264368e-05, + "loss": 0.1543, + "step": 11681 + }, + { + "epoch": 40.282758620689656, + "grad_norm": 2.4409101009368896, + "learning_rate": 2.7964597701149427e-05, + "loss": 0.1506, + "step": 11682 + }, + { + "epoch": 40.286206896551725, + "grad_norm": 0.6493564248085022, + "learning_rate": 2.7964137931034482e-05, + "loss": 0.1648, + "step": 11683 + }, + { + "epoch": 40.289655172413795, + "grad_norm": 0.7148909568786621, + "learning_rate": 2.7963678160919544e-05, + "loss": 0.1441, + "step": 11684 + }, + { + "epoch": 40.293103448275865, + "grad_norm": 0.6141997575759888, + "learning_rate": 2.79632183908046e-05, + "loss": 0.1395, + "step": 11685 + }, + { + "epoch": 40.296551724137935, + "grad_norm": 0.6709399819374084, + "learning_rate": 2.7962758620689655e-05, + "loss": 0.15, + "step": 11686 + }, + { + "epoch": 40.3, + "grad_norm": 1.1454461812973022, + "learning_rate": 2.7962298850574713e-05, + "loss": 0.1721, + "step": 11687 + }, + { + "epoch": 40.30344827586207, + "grad_norm": 1.5469913482666016, + "learning_rate": 2.7961839080459772e-05, + "loss": 0.1346, + "step": 11688 + }, + { + "epoch": 40.30689655172414, + "grad_norm": 0.8761661648750305, + "learning_rate": 2.7961379310344827e-05, + "loss": 0.1239, + "step": 11689 + }, + { + "epoch": 40.310344827586206, + "grad_norm": 0.8666542172431946, + "learning_rate": 2.7960919540229886e-05, + "loss": 0.1427, + "step": 11690 + }, + { + "epoch": 40.313793103448276, + "grad_norm": 0.6497369408607483, + "learning_rate": 2.796045977011494e-05, + "loss": 0.1076, + "step": 11691 + }, + { + "epoch": 40.317241379310346, + "grad_norm": 0.7148862481117249, + "learning_rate": 2.7960000000000003e-05, + "loss": 0.1267, + "step": 11692 + }, + { + "epoch": 40.320689655172416, + "grad_norm": 0.9966643452644348, + "learning_rate": 2.795954022988506e-05, + "loss": 0.1176, + "step": 11693 + }, + { + "epoch": 40.324137931034485, + "grad_norm": 0.8974887132644653, + "learning_rate": 2.7959080459770114e-05, + "loss": 0.1207, + "step": 11694 + }, + { + "epoch": 40.327586206896555, + "grad_norm": 0.629341185092926, + "learning_rate": 2.7958620689655173e-05, + "loss": 0.1185, + "step": 11695 + }, + { + "epoch": 40.33103448275862, + "grad_norm": 0.8130760788917542, + "learning_rate": 2.795816091954023e-05, + "loss": 0.1262, + "step": 11696 + }, + { + "epoch": 40.33448275862069, + "grad_norm": 0.7954714894294739, + "learning_rate": 2.795770114942529e-05, + "loss": 0.0981, + "step": 11697 + }, + { + "epoch": 40.33793103448276, + "grad_norm": 0.5662274956703186, + "learning_rate": 2.7957241379310345e-05, + "loss": 0.1151, + "step": 11698 + }, + { + "epoch": 40.34137931034483, + "grad_norm": 0.8570734858512878, + "learning_rate": 2.79567816091954e-05, + "loss": 0.124, + "step": 11699 + }, + { + "epoch": 40.3448275862069, + "grad_norm": 1.1559773683547974, + "learning_rate": 2.7956321839080463e-05, + "loss": 0.1374, + "step": 11700 + }, + { + "epoch": 40.34827586206897, + "grad_norm": 0.6159823536872864, + "learning_rate": 2.7955862068965518e-05, + "loss": 0.2043, + "step": 11701 + }, + { + "epoch": 40.351724137931036, + "grad_norm": 0.5950391292572021, + "learning_rate": 2.7955402298850577e-05, + "loss": 0.1681, + "step": 11702 + }, + { + "epoch": 40.355172413793106, + "grad_norm": 0.4927203059196472, + "learning_rate": 2.7954942528735632e-05, + "loss": 0.167, + "step": 11703 + }, + { + "epoch": 40.358620689655176, + "grad_norm": 1.321279764175415, + "learning_rate": 2.795448275862069e-05, + "loss": 0.1613, + "step": 11704 + }, + { + "epoch": 40.36206896551724, + "grad_norm": 0.8601357936859131, + "learning_rate": 2.795402298850575e-05, + "loss": 0.1641, + "step": 11705 + }, + { + "epoch": 40.36551724137931, + "grad_norm": 1.159642219543457, + "learning_rate": 2.7953563218390805e-05, + "loss": 0.1678, + "step": 11706 + }, + { + "epoch": 40.36896551724138, + "grad_norm": 0.8981372714042664, + "learning_rate": 2.7953103448275863e-05, + "loss": 0.1828, + "step": 11707 + }, + { + "epoch": 40.37241379310345, + "grad_norm": 0.6486225724220276, + "learning_rate": 2.7952643678160922e-05, + "loss": 0.178, + "step": 11708 + }, + { + "epoch": 40.37586206896552, + "grad_norm": 0.7072604298591614, + "learning_rate": 2.7952183908045977e-05, + "loss": 0.1514, + "step": 11709 + }, + { + "epoch": 40.37931034482759, + "grad_norm": 0.5991432070732117, + "learning_rate": 2.7951724137931036e-05, + "loss": 0.1413, + "step": 11710 + }, + { + "epoch": 40.38275862068966, + "grad_norm": 0.9521961212158203, + "learning_rate": 2.795126436781609e-05, + "loss": 0.1206, + "step": 11711 + }, + { + "epoch": 40.38620689655173, + "grad_norm": 1.9931718111038208, + "learning_rate": 2.7950804597701153e-05, + "loss": 0.177, + "step": 11712 + }, + { + "epoch": 40.389655172413796, + "grad_norm": 1.1548880338668823, + "learning_rate": 2.795034482758621e-05, + "loss": 0.1368, + "step": 11713 + }, + { + "epoch": 40.39310344827586, + "grad_norm": 0.6284099817276001, + "learning_rate": 2.7949885057471264e-05, + "loss": 0.1328, + "step": 11714 + }, + { + "epoch": 40.39655172413793, + "grad_norm": 0.8119350671768188, + "learning_rate": 2.7949425287356323e-05, + "loss": 0.1034, + "step": 11715 + }, + { + "epoch": 40.4, + "grad_norm": 0.7650372982025146, + "learning_rate": 2.794896551724138e-05, + "loss": 0.1515, + "step": 11716 + }, + { + "epoch": 40.40344827586207, + "grad_norm": 0.6360500454902649, + "learning_rate": 2.794850574712644e-05, + "loss": 0.1262, + "step": 11717 + }, + { + "epoch": 40.40689655172414, + "grad_norm": 0.731512725353241, + "learning_rate": 2.7948045977011495e-05, + "loss": 0.1173, + "step": 11718 + }, + { + "epoch": 40.41034482758621, + "grad_norm": 0.5277930498123169, + "learning_rate": 2.794758620689655e-05, + "loss": 0.1159, + "step": 11719 + }, + { + "epoch": 40.41379310344828, + "grad_norm": 0.7175533771514893, + "learning_rate": 2.7947126436781613e-05, + "loss": 0.1156, + "step": 11720 + }, + { + "epoch": 40.41724137931035, + "grad_norm": 0.6196317076683044, + "learning_rate": 2.7946666666666668e-05, + "loss": 0.1016, + "step": 11721 + }, + { + "epoch": 40.42068965517242, + "grad_norm": 0.7266805768013, + "learning_rate": 2.7946206896551723e-05, + "loss": 0.1029, + "step": 11722 + }, + { + "epoch": 40.42413793103448, + "grad_norm": 2.7246503829956055, + "learning_rate": 2.7945747126436782e-05, + "loss": 0.1045, + "step": 11723 + }, + { + "epoch": 40.42758620689655, + "grad_norm": 0.7863669395446777, + "learning_rate": 2.794528735632184e-05, + "loss": 0.099, + "step": 11724 + }, + { + "epoch": 40.43103448275862, + "grad_norm": 1.2347368001937866, + "learning_rate": 2.79448275862069e-05, + "loss": 0.1559, + "step": 11725 + }, + { + "epoch": 40.43448275862069, + "grad_norm": 1.5440723896026611, + "learning_rate": 2.7944367816091955e-05, + "loss": 0.2245, + "step": 11726 + }, + { + "epoch": 40.43793103448276, + "grad_norm": 0.5833970308303833, + "learning_rate": 2.794390804597701e-05, + "loss": 0.1639, + "step": 11727 + }, + { + "epoch": 40.44137931034483, + "grad_norm": 1.124950885772705, + "learning_rate": 2.7943448275862072e-05, + "loss": 0.1765, + "step": 11728 + }, + { + "epoch": 40.4448275862069, + "grad_norm": 0.7326246500015259, + "learning_rate": 2.7942988505747127e-05, + "loss": 0.1669, + "step": 11729 + }, + { + "epoch": 40.44827586206897, + "grad_norm": 0.7290887832641602, + "learning_rate": 2.7942528735632186e-05, + "loss": 0.1556, + "step": 11730 + }, + { + "epoch": 40.45172413793104, + "grad_norm": 1.0433979034423828, + "learning_rate": 2.794206896551724e-05, + "loss": 0.1513, + "step": 11731 + }, + { + "epoch": 40.4551724137931, + "grad_norm": 0.5453125834465027, + "learning_rate": 2.79416091954023e-05, + "loss": 0.1647, + "step": 11732 + }, + { + "epoch": 40.45862068965517, + "grad_norm": 0.9280747771263123, + "learning_rate": 2.794114942528736e-05, + "loss": 0.1586, + "step": 11733 + }, + { + "epoch": 40.46206896551724, + "grad_norm": 1.8095715045928955, + "learning_rate": 2.7940689655172414e-05, + "loss": 0.134, + "step": 11734 + }, + { + "epoch": 40.46551724137931, + "grad_norm": 0.9350835084915161, + "learning_rate": 2.7940229885057473e-05, + "loss": 0.1379, + "step": 11735 + }, + { + "epoch": 40.46896551724138, + "grad_norm": 1.448866605758667, + "learning_rate": 2.793977011494253e-05, + "loss": 0.1383, + "step": 11736 + }, + { + "epoch": 40.47241379310345, + "grad_norm": 0.825294017791748, + "learning_rate": 2.7939310344827586e-05, + "loss": 0.1382, + "step": 11737 + }, + { + "epoch": 40.47586206896552, + "grad_norm": 1.2684051990509033, + "learning_rate": 2.7938850574712645e-05, + "loss": 0.1382, + "step": 11738 + }, + { + "epoch": 40.47931034482759, + "grad_norm": 0.5286739468574524, + "learning_rate": 2.79383908045977e-05, + "loss": 0.1308, + "step": 11739 + }, + { + "epoch": 40.48275862068966, + "grad_norm": 0.7574636340141296, + "learning_rate": 2.7937931034482763e-05, + "loss": 0.141, + "step": 11740 + }, + { + "epoch": 40.48620689655172, + "grad_norm": 0.7143364548683167, + "learning_rate": 2.7937471264367818e-05, + "loss": 0.1255, + "step": 11741 + }, + { + "epoch": 40.48965517241379, + "grad_norm": 0.687987208366394, + "learning_rate": 2.7937011494252873e-05, + "loss": 0.1132, + "step": 11742 + }, + { + "epoch": 40.49310344827586, + "grad_norm": 0.7487812638282776, + "learning_rate": 2.7936551724137932e-05, + "loss": 0.1155, + "step": 11743 + }, + { + "epoch": 40.49655172413793, + "grad_norm": 1.6839125156402588, + "learning_rate": 2.793609195402299e-05, + "loss": 0.11, + "step": 11744 + }, + { + "epoch": 40.5, + "grad_norm": 0.7727195024490356, + "learning_rate": 2.793563218390805e-05, + "loss": 0.1211, + "step": 11745 + }, + { + "epoch": 40.50344827586207, + "grad_norm": 0.6054051518440247, + "learning_rate": 2.7935172413793104e-05, + "loss": 0.0913, + "step": 11746 + }, + { + "epoch": 40.50689655172414, + "grad_norm": 0.6255416870117188, + "learning_rate": 2.793471264367816e-05, + "loss": 0.0832, + "step": 11747 + }, + { + "epoch": 40.51034482758621, + "grad_norm": 1.1705973148345947, + "learning_rate": 2.793425287356322e-05, + "loss": 0.112, + "step": 11748 + }, + { + "epoch": 40.51379310344828, + "grad_norm": 1.0925915241241455, + "learning_rate": 2.7933793103448277e-05, + "loss": 0.1019, + "step": 11749 + }, + { + "epoch": 40.51724137931034, + "grad_norm": 0.877522349357605, + "learning_rate": 2.7933333333333332e-05, + "loss": 0.1196, + "step": 11750 + }, + { + "epoch": 40.52068965517241, + "grad_norm": 0.7351245880126953, + "learning_rate": 2.793287356321839e-05, + "loss": 0.2012, + "step": 11751 + }, + { + "epoch": 40.52413793103448, + "grad_norm": 0.7945444583892822, + "learning_rate": 2.7932413793103446e-05, + "loss": 0.1697, + "step": 11752 + }, + { + "epoch": 40.52758620689655, + "grad_norm": 0.6955210566520691, + "learning_rate": 2.793195402298851e-05, + "loss": 0.1793, + "step": 11753 + }, + { + "epoch": 40.53103448275862, + "grad_norm": 0.8090256452560425, + "learning_rate": 2.7931494252873564e-05, + "loss": 0.1758, + "step": 11754 + }, + { + "epoch": 40.53448275862069, + "grad_norm": 0.7219394445419312, + "learning_rate": 2.793103448275862e-05, + "loss": 0.18, + "step": 11755 + }, + { + "epoch": 40.53793103448276, + "grad_norm": 1.8151274919509888, + "learning_rate": 2.7930574712643678e-05, + "loss": 0.1468, + "step": 11756 + }, + { + "epoch": 40.54137931034483, + "grad_norm": 1.0861194133758545, + "learning_rate": 2.7930114942528736e-05, + "loss": 0.1517, + "step": 11757 + }, + { + "epoch": 40.5448275862069, + "grad_norm": 0.5736615061759949, + "learning_rate": 2.7929655172413795e-05, + "loss": 0.1491, + "step": 11758 + }, + { + "epoch": 40.54827586206896, + "grad_norm": 0.6556744575500488, + "learning_rate": 2.792919540229885e-05, + "loss": 0.1436, + "step": 11759 + }, + { + "epoch": 40.55172413793103, + "grad_norm": 0.641986072063446, + "learning_rate": 2.7928735632183906e-05, + "loss": 0.137, + "step": 11760 + }, + { + "epoch": 40.5551724137931, + "grad_norm": 1.0249788761138916, + "learning_rate": 2.7928275862068968e-05, + "loss": 0.1429, + "step": 11761 + }, + { + "epoch": 40.55862068965517, + "grad_norm": 0.5966471433639526, + "learning_rate": 2.7927816091954023e-05, + "loss": 0.1416, + "step": 11762 + }, + { + "epoch": 40.56206896551724, + "grad_norm": 0.5168145895004272, + "learning_rate": 2.7927356321839082e-05, + "loss": 0.1242, + "step": 11763 + }, + { + "epoch": 40.56551724137931, + "grad_norm": 0.6422237157821655, + "learning_rate": 2.7926896551724137e-05, + "loss": 0.1205, + "step": 11764 + }, + { + "epoch": 40.56896551724138, + "grad_norm": 0.8198093175888062, + "learning_rate": 2.7926436781609196e-05, + "loss": 0.1216, + "step": 11765 + }, + { + "epoch": 40.57241379310345, + "grad_norm": 0.7240015268325806, + "learning_rate": 2.7925977011494254e-05, + "loss": 0.1243, + "step": 11766 + }, + { + "epoch": 40.57586206896552, + "grad_norm": 0.6955036520957947, + "learning_rate": 2.792551724137931e-05, + "loss": 0.1144, + "step": 11767 + }, + { + "epoch": 40.57931034482758, + "grad_norm": 0.9881756901741028, + "learning_rate": 2.792505747126437e-05, + "loss": 0.1179, + "step": 11768 + }, + { + "epoch": 40.58275862068965, + "grad_norm": 0.5950742959976196, + "learning_rate": 2.7924597701149427e-05, + "loss": 0.1188, + "step": 11769 + }, + { + "epoch": 40.58620689655172, + "grad_norm": 0.7534858584403992, + "learning_rate": 2.7924137931034482e-05, + "loss": 0.1104, + "step": 11770 + }, + { + "epoch": 40.58965517241379, + "grad_norm": 1.7280205488204956, + "learning_rate": 2.792367816091954e-05, + "loss": 0.1291, + "step": 11771 + }, + { + "epoch": 40.59310344827586, + "grad_norm": 2.9699957370758057, + "learning_rate": 2.7923218390804596e-05, + "loss": 0.1021, + "step": 11772 + }, + { + "epoch": 40.59655172413793, + "grad_norm": 2.6579055786132812, + "learning_rate": 2.792275862068966e-05, + "loss": 0.1267, + "step": 11773 + }, + { + "epoch": 40.6, + "grad_norm": 0.9990864992141724, + "learning_rate": 2.7922298850574714e-05, + "loss": 0.1184, + "step": 11774 + }, + { + "epoch": 40.60344827586207, + "grad_norm": 1.852492094039917, + "learning_rate": 2.792183908045977e-05, + "loss": 0.1501, + "step": 11775 + }, + { + "epoch": 40.60689655172414, + "grad_norm": 0.7869579792022705, + "learning_rate": 2.7921379310344828e-05, + "loss": 0.2018, + "step": 11776 + }, + { + "epoch": 40.610344827586204, + "grad_norm": 1.1157070398330688, + "learning_rate": 2.7920919540229886e-05, + "loss": 0.1974, + "step": 11777 + }, + { + "epoch": 40.61379310344827, + "grad_norm": 1.480584740638733, + "learning_rate": 2.792045977011494e-05, + "loss": 0.1659, + "step": 11778 + }, + { + "epoch": 40.61724137931034, + "grad_norm": 1.0910156965255737, + "learning_rate": 2.792e-05, + "loss": 0.1818, + "step": 11779 + }, + { + "epoch": 40.62068965517241, + "grad_norm": 0.8567787408828735, + "learning_rate": 2.7919540229885056e-05, + "loss": 0.1565, + "step": 11780 + }, + { + "epoch": 40.62413793103448, + "grad_norm": 2.324859619140625, + "learning_rate": 2.7919080459770118e-05, + "loss": 0.1597, + "step": 11781 + }, + { + "epoch": 40.62758620689655, + "grad_norm": 0.73204106092453, + "learning_rate": 2.7918620689655173e-05, + "loss": 0.1511, + "step": 11782 + }, + { + "epoch": 40.63103448275862, + "grad_norm": 1.0159928798675537, + "learning_rate": 2.7918160919540228e-05, + "loss": 0.1498, + "step": 11783 + }, + { + "epoch": 40.63448275862069, + "grad_norm": 0.4772260785102844, + "learning_rate": 2.7917701149425287e-05, + "loss": 0.1266, + "step": 11784 + }, + { + "epoch": 40.63793103448276, + "grad_norm": 0.6090050339698792, + "learning_rate": 2.7917241379310346e-05, + "loss": 0.1433, + "step": 11785 + }, + { + "epoch": 40.641379310344824, + "grad_norm": 0.9100318551063538, + "learning_rate": 2.7916781609195404e-05, + "loss": 0.1569, + "step": 11786 + }, + { + "epoch": 40.644827586206894, + "grad_norm": 1.195310354232788, + "learning_rate": 2.791632183908046e-05, + "loss": 0.1397, + "step": 11787 + }, + { + "epoch": 40.648275862068964, + "grad_norm": 0.9153019189834595, + "learning_rate": 2.7915862068965515e-05, + "loss": 0.124, + "step": 11788 + }, + { + "epoch": 40.65172413793103, + "grad_norm": 0.8968881368637085, + "learning_rate": 2.7915402298850577e-05, + "loss": 0.1596, + "step": 11789 + }, + { + "epoch": 40.6551724137931, + "grad_norm": 0.8315351605415344, + "learning_rate": 2.7914942528735632e-05, + "loss": 0.151, + "step": 11790 + }, + { + "epoch": 40.65862068965517, + "grad_norm": 0.9347983002662659, + "learning_rate": 2.791448275862069e-05, + "loss": 0.1404, + "step": 11791 + }, + { + "epoch": 40.66206896551724, + "grad_norm": 0.9890848398208618, + "learning_rate": 2.7914022988505746e-05, + "loss": 0.1286, + "step": 11792 + }, + { + "epoch": 40.66551724137931, + "grad_norm": 0.6763122081756592, + "learning_rate": 2.7913563218390805e-05, + "loss": 0.1172, + "step": 11793 + }, + { + "epoch": 40.66896551724138, + "grad_norm": 0.9448462724685669, + "learning_rate": 2.7913103448275864e-05, + "loss": 0.1068, + "step": 11794 + }, + { + "epoch": 40.672413793103445, + "grad_norm": 1.370289921760559, + "learning_rate": 2.791264367816092e-05, + "loss": 0.1139, + "step": 11795 + }, + { + "epoch": 40.675862068965515, + "grad_norm": 0.844764232635498, + "learning_rate": 2.7912183908045978e-05, + "loss": 0.115, + "step": 11796 + }, + { + "epoch": 40.679310344827584, + "grad_norm": 0.6647254824638367, + "learning_rate": 2.7911724137931036e-05, + "loss": 0.1034, + "step": 11797 + }, + { + "epoch": 40.682758620689654, + "grad_norm": 3.673616409301758, + "learning_rate": 2.791126436781609e-05, + "loss": 0.1151, + "step": 11798 + }, + { + "epoch": 40.686206896551724, + "grad_norm": 0.7570379376411438, + "learning_rate": 2.791080459770115e-05, + "loss": 0.1299, + "step": 11799 + }, + { + "epoch": 40.689655172413794, + "grad_norm": 1.274048089981079, + "learning_rate": 2.7910344827586206e-05, + "loss": 0.1861, + "step": 11800 + }, + { + "epoch": 40.69310344827586, + "grad_norm": 0.7875194549560547, + "learning_rate": 2.7909885057471268e-05, + "loss": 0.2268, + "step": 11801 + }, + { + "epoch": 40.69655172413793, + "grad_norm": 0.8849730491638184, + "learning_rate": 2.7909425287356323e-05, + "loss": 0.1776, + "step": 11802 + }, + { + "epoch": 40.7, + "grad_norm": 1.3349769115447998, + "learning_rate": 2.7908965517241378e-05, + "loss": 0.184, + "step": 11803 + }, + { + "epoch": 40.703448275862065, + "grad_norm": 0.6739205718040466, + "learning_rate": 2.7908505747126437e-05, + "loss": 0.1648, + "step": 11804 + }, + { + "epoch": 40.706896551724135, + "grad_norm": 0.7420412302017212, + "learning_rate": 2.7908045977011496e-05, + "loss": 0.1723, + "step": 11805 + }, + { + "epoch": 40.710344827586205, + "grad_norm": 0.8694465160369873, + "learning_rate": 2.7907586206896554e-05, + "loss": 0.1563, + "step": 11806 + }, + { + "epoch": 40.713793103448275, + "grad_norm": 0.6973437070846558, + "learning_rate": 2.790712643678161e-05, + "loss": 0.1582, + "step": 11807 + }, + { + "epoch": 40.717241379310344, + "grad_norm": 0.7746396064758301, + "learning_rate": 2.7906666666666665e-05, + "loss": 0.1501, + "step": 11808 + }, + { + "epoch": 40.720689655172414, + "grad_norm": 1.4606819152832031, + "learning_rate": 2.7906206896551727e-05, + "loss": 0.1417, + "step": 11809 + }, + { + "epoch": 40.724137931034484, + "grad_norm": 0.6521008610725403, + "learning_rate": 2.7905747126436782e-05, + "loss": 0.1438, + "step": 11810 + }, + { + "epoch": 40.727586206896554, + "grad_norm": 0.7490999698638916, + "learning_rate": 2.7905287356321838e-05, + "loss": 0.1611, + "step": 11811 + }, + { + "epoch": 40.73103448275862, + "grad_norm": 0.7601701617240906, + "learning_rate": 2.7904827586206896e-05, + "loss": 0.1282, + "step": 11812 + }, + { + "epoch": 40.734482758620686, + "grad_norm": 0.6846688389778137, + "learning_rate": 2.7904367816091955e-05, + "loss": 0.1358, + "step": 11813 + }, + { + "epoch": 40.737931034482756, + "grad_norm": 0.7778067588806152, + "learning_rate": 2.7903908045977014e-05, + "loss": 0.1286, + "step": 11814 + }, + { + "epoch": 40.741379310344826, + "grad_norm": 1.0610971450805664, + "learning_rate": 2.790344827586207e-05, + "loss": 0.1493, + "step": 11815 + }, + { + "epoch": 40.744827586206895, + "grad_norm": 1.0356992483139038, + "learning_rate": 2.7902988505747124e-05, + "loss": 0.1177, + "step": 11816 + }, + { + "epoch": 40.748275862068965, + "grad_norm": 1.1441752910614014, + "learning_rate": 2.7902528735632186e-05, + "loss": 0.116, + "step": 11817 + }, + { + "epoch": 40.751724137931035, + "grad_norm": 0.7899928092956543, + "learning_rate": 2.790206896551724e-05, + "loss": 0.1192, + "step": 11818 + }, + { + "epoch": 40.755172413793105, + "grad_norm": 0.6208645105361938, + "learning_rate": 2.79016091954023e-05, + "loss": 0.122, + "step": 11819 + }, + { + "epoch": 40.758620689655174, + "grad_norm": 1.048755168914795, + "learning_rate": 2.7901149425287356e-05, + "loss": 0.1028, + "step": 11820 + }, + { + "epoch": 40.762068965517244, + "grad_norm": 1.3539106845855713, + "learning_rate": 2.7900689655172414e-05, + "loss": 0.1193, + "step": 11821 + }, + { + "epoch": 40.765517241379314, + "grad_norm": 0.6610104441642761, + "learning_rate": 2.7900229885057473e-05, + "loss": 0.1115, + "step": 11822 + }, + { + "epoch": 40.76896551724138, + "grad_norm": 0.8307573199272156, + "learning_rate": 2.7899770114942528e-05, + "loss": 0.108, + "step": 11823 + }, + { + "epoch": 40.772413793103446, + "grad_norm": 0.88006192445755, + "learning_rate": 2.7899310344827587e-05, + "loss": 0.1279, + "step": 11824 + }, + { + "epoch": 40.775862068965516, + "grad_norm": 1.4205797910690308, + "learning_rate": 2.7898850574712646e-05, + "loss": 0.1211, + "step": 11825 + }, + { + "epoch": 40.779310344827586, + "grad_norm": 1.053330659866333, + "learning_rate": 2.78983908045977e-05, + "loss": 0.2244, + "step": 11826 + }, + { + "epoch": 40.782758620689656, + "grad_norm": 0.5796409249305725, + "learning_rate": 2.789793103448276e-05, + "loss": 0.1967, + "step": 11827 + }, + { + "epoch": 40.786206896551725, + "grad_norm": 0.5129582285881042, + "learning_rate": 2.7897471264367815e-05, + "loss": 0.172, + "step": 11828 + }, + { + "epoch": 40.789655172413795, + "grad_norm": 0.5083187818527222, + "learning_rate": 2.7897011494252877e-05, + "loss": 0.1812, + "step": 11829 + }, + { + "epoch": 40.793103448275865, + "grad_norm": 0.7066628336906433, + "learning_rate": 2.7896551724137932e-05, + "loss": 0.1873, + "step": 11830 + }, + { + "epoch": 40.796551724137935, + "grad_norm": 0.5763672590255737, + "learning_rate": 2.7896091954022987e-05, + "loss": 0.1519, + "step": 11831 + }, + { + "epoch": 40.8, + "grad_norm": 2.805004119873047, + "learning_rate": 2.7895632183908046e-05, + "loss": 0.1396, + "step": 11832 + }, + { + "epoch": 40.80344827586207, + "grad_norm": 0.6230461001396179, + "learning_rate": 2.7895172413793105e-05, + "loss": 0.1459, + "step": 11833 + }, + { + "epoch": 40.80689655172414, + "grad_norm": 0.4898602366447449, + "learning_rate": 2.7894712643678164e-05, + "loss": 0.1503, + "step": 11834 + }, + { + "epoch": 40.810344827586206, + "grad_norm": 0.6101446151733398, + "learning_rate": 2.789425287356322e-05, + "loss": 0.1436, + "step": 11835 + }, + { + "epoch": 40.813793103448276, + "grad_norm": 2.803314208984375, + "learning_rate": 2.7893793103448274e-05, + "loss": 0.1412, + "step": 11836 + }, + { + "epoch": 40.817241379310346, + "grad_norm": 1.1435198783874512, + "learning_rate": 2.7893333333333336e-05, + "loss": 0.1156, + "step": 11837 + }, + { + "epoch": 40.820689655172416, + "grad_norm": 0.7559360265731812, + "learning_rate": 2.789287356321839e-05, + "loss": 0.1354, + "step": 11838 + }, + { + "epoch": 40.824137931034485, + "grad_norm": 1.0477510690689087, + "learning_rate": 2.7892413793103447e-05, + "loss": 0.121, + "step": 11839 + }, + { + "epoch": 40.827586206896555, + "grad_norm": 0.6943573355674744, + "learning_rate": 2.7891954022988505e-05, + "loss": 0.1142, + "step": 11840 + }, + { + "epoch": 40.83103448275862, + "grad_norm": 0.6271378993988037, + "learning_rate": 2.7891494252873564e-05, + "loss": 0.12, + "step": 11841 + }, + { + "epoch": 40.83448275862069, + "grad_norm": 0.7589528560638428, + "learning_rate": 2.7891034482758623e-05, + "loss": 0.123, + "step": 11842 + }, + { + "epoch": 40.83793103448276, + "grad_norm": 1.1358497142791748, + "learning_rate": 2.7890574712643678e-05, + "loss": 0.1257, + "step": 11843 + }, + { + "epoch": 40.84137931034483, + "grad_norm": 1.1042550802230835, + "learning_rate": 2.7890114942528733e-05, + "loss": 0.1318, + "step": 11844 + }, + { + "epoch": 40.8448275862069, + "grad_norm": 0.6960496306419373, + "learning_rate": 2.7889655172413795e-05, + "loss": 0.1177, + "step": 11845 + }, + { + "epoch": 40.84827586206897, + "grad_norm": 0.6922822594642639, + "learning_rate": 2.788919540229885e-05, + "loss": 0.0955, + "step": 11846 + }, + { + "epoch": 40.851724137931036, + "grad_norm": 1.0474412441253662, + "learning_rate": 2.788873563218391e-05, + "loss": 0.102, + "step": 11847 + }, + { + "epoch": 40.855172413793106, + "grad_norm": 1.7826316356658936, + "learning_rate": 2.7888275862068965e-05, + "loss": 0.114, + "step": 11848 + }, + { + "epoch": 40.858620689655176, + "grad_norm": 0.9580329060554504, + "learning_rate": 2.7887816091954023e-05, + "loss": 0.1125, + "step": 11849 + }, + { + "epoch": 40.86206896551724, + "grad_norm": 0.9531774520874023, + "learning_rate": 2.7887356321839082e-05, + "loss": 0.1423, + "step": 11850 + }, + { + "epoch": 40.86551724137931, + "grad_norm": 0.659142792224884, + "learning_rate": 2.7886896551724137e-05, + "loss": 0.2409, + "step": 11851 + }, + { + "epoch": 40.86896551724138, + "grad_norm": 0.6826728582382202, + "learning_rate": 2.7886436781609196e-05, + "loss": 0.1995, + "step": 11852 + }, + { + "epoch": 40.87241379310345, + "grad_norm": 0.547243058681488, + "learning_rate": 2.7885977011494255e-05, + "loss": 0.1736, + "step": 11853 + }, + { + "epoch": 40.87586206896552, + "grad_norm": 1.1701080799102783, + "learning_rate": 2.788551724137931e-05, + "loss": 0.1799, + "step": 11854 + }, + { + "epoch": 40.87931034482759, + "grad_norm": 1.5189658403396606, + "learning_rate": 2.788505747126437e-05, + "loss": 0.1613, + "step": 11855 + }, + { + "epoch": 40.88275862068966, + "grad_norm": 1.4116151332855225, + "learning_rate": 2.7884597701149424e-05, + "loss": 0.1483, + "step": 11856 + }, + { + "epoch": 40.88620689655173, + "grad_norm": 0.8265933394432068, + "learning_rate": 2.7884137931034486e-05, + "loss": 0.1326, + "step": 11857 + }, + { + "epoch": 40.889655172413796, + "grad_norm": 2.1229608058929443, + "learning_rate": 2.788367816091954e-05, + "loss": 0.1705, + "step": 11858 + }, + { + "epoch": 40.89310344827586, + "grad_norm": 0.9507710933685303, + "learning_rate": 2.7883218390804597e-05, + "loss": 0.1535, + "step": 11859 + }, + { + "epoch": 40.89655172413793, + "grad_norm": 1.0392764806747437, + "learning_rate": 2.7882758620689655e-05, + "loss": 0.1404, + "step": 11860 + }, + { + "epoch": 40.9, + "grad_norm": 0.7244572639465332, + "learning_rate": 2.7882298850574714e-05, + "loss": 0.1268, + "step": 11861 + }, + { + "epoch": 40.90344827586207, + "grad_norm": 0.5686535239219666, + "learning_rate": 2.7881839080459773e-05, + "loss": 0.1426, + "step": 11862 + }, + { + "epoch": 40.90689655172414, + "grad_norm": 0.5321837067604065, + "learning_rate": 2.7881379310344828e-05, + "loss": 0.1473, + "step": 11863 + }, + { + "epoch": 40.91034482758621, + "grad_norm": 0.9425690770149231, + "learning_rate": 2.7880919540229883e-05, + "loss": 0.1515, + "step": 11864 + }, + { + "epoch": 40.91379310344828, + "grad_norm": 2.6231701374053955, + "learning_rate": 2.7880459770114945e-05, + "loss": 0.1443, + "step": 11865 + }, + { + "epoch": 40.91724137931035, + "grad_norm": 0.6337547302246094, + "learning_rate": 2.788e-05, + "loss": 0.1374, + "step": 11866 + }, + { + "epoch": 40.92068965517242, + "grad_norm": 0.9334180951118469, + "learning_rate": 2.7879540229885056e-05, + "loss": 0.1141, + "step": 11867 + }, + { + "epoch": 40.92413793103448, + "grad_norm": 0.5588310360908508, + "learning_rate": 2.7879080459770115e-05, + "loss": 0.1245, + "step": 11868 + }, + { + "epoch": 40.92758620689655, + "grad_norm": 0.7732442617416382, + "learning_rate": 2.7878620689655173e-05, + "loss": 0.1248, + "step": 11869 + }, + { + "epoch": 40.93103448275862, + "grad_norm": 0.9320520758628845, + "learning_rate": 2.7878160919540232e-05, + "loss": 0.1183, + "step": 11870 + }, + { + "epoch": 40.93448275862069, + "grad_norm": 4.59244966506958, + "learning_rate": 2.7877701149425287e-05, + "loss": 0.1158, + "step": 11871 + }, + { + "epoch": 40.93793103448276, + "grad_norm": 0.7224825024604797, + "learning_rate": 2.7877241379310343e-05, + "loss": 0.1088, + "step": 11872 + }, + { + "epoch": 40.94137931034483, + "grad_norm": 0.9052416086196899, + "learning_rate": 2.7876781609195405e-05, + "loss": 0.0993, + "step": 11873 + }, + { + "epoch": 40.9448275862069, + "grad_norm": 0.7622255086898804, + "learning_rate": 2.787632183908046e-05, + "loss": 0.1053, + "step": 11874 + }, + { + "epoch": 40.94827586206897, + "grad_norm": 1.82595694065094, + "learning_rate": 2.787586206896552e-05, + "loss": 0.1727, + "step": 11875 + }, + { + "epoch": 40.95172413793104, + "grad_norm": 0.8928139805793762, + "learning_rate": 2.7875402298850574e-05, + "loss": 0.1884, + "step": 11876 + }, + { + "epoch": 40.9551724137931, + "grad_norm": 0.6927770972251892, + "learning_rate": 2.7874942528735633e-05, + "loss": 0.1922, + "step": 11877 + }, + { + "epoch": 40.95862068965517, + "grad_norm": 0.9186509251594543, + "learning_rate": 2.787448275862069e-05, + "loss": 0.1554, + "step": 11878 + }, + { + "epoch": 40.96206896551724, + "grad_norm": 1.2442209720611572, + "learning_rate": 2.7874022988505747e-05, + "loss": 0.1656, + "step": 11879 + }, + { + "epoch": 40.96551724137931, + "grad_norm": 0.7952768206596375, + "learning_rate": 2.7873563218390805e-05, + "loss": 0.1712, + "step": 11880 + }, + { + "epoch": 40.96896551724138, + "grad_norm": 4.886902809143066, + "learning_rate": 2.7873103448275864e-05, + "loss": 0.1549, + "step": 11881 + }, + { + "epoch": 40.97241379310345, + "grad_norm": 0.9248790144920349, + "learning_rate": 2.787264367816092e-05, + "loss": 0.1605, + "step": 11882 + }, + { + "epoch": 40.97586206896552, + "grad_norm": 0.5161663293838501, + "learning_rate": 2.7872183908045978e-05, + "loss": 0.1385, + "step": 11883 + }, + { + "epoch": 40.97931034482759, + "grad_norm": 1.8625320196151733, + "learning_rate": 2.7871724137931033e-05, + "loss": 0.1542, + "step": 11884 + }, + { + "epoch": 40.98275862068966, + "grad_norm": 0.5619131326675415, + "learning_rate": 2.7871264367816095e-05, + "loss": 0.1164, + "step": 11885 + }, + { + "epoch": 40.98620689655172, + "grad_norm": 1.0469545125961304, + "learning_rate": 2.787080459770115e-05, + "loss": 0.1263, + "step": 11886 + }, + { + "epoch": 40.98965517241379, + "grad_norm": 0.8741365075111389, + "learning_rate": 2.7870344827586206e-05, + "loss": 0.1191, + "step": 11887 + }, + { + "epoch": 40.99310344827586, + "grad_norm": 0.9119535684585571, + "learning_rate": 2.7869885057471265e-05, + "loss": 0.1188, + "step": 11888 + }, + { + "epoch": 40.99655172413793, + "grad_norm": 0.720704972743988, + "learning_rate": 2.7869425287356323e-05, + "loss": 0.0982, + "step": 11889 + }, + { + "epoch": 41.0, + "grad_norm": 6.535160541534424, + "learning_rate": 2.7868965517241382e-05, + "loss": 0.1581, + "step": 11890 + }, + { + "epoch": 41.00344827586207, + "grad_norm": 0.6805986166000366, + "learning_rate": 2.7868505747126437e-05, + "loss": 0.2226, + "step": 11891 + }, + { + "epoch": 41.00689655172414, + "grad_norm": 0.572300910949707, + "learning_rate": 2.7868045977011493e-05, + "loss": 0.1827, + "step": 11892 + }, + { + "epoch": 41.01034482758621, + "grad_norm": 0.7962373495101929, + "learning_rate": 2.7867586206896555e-05, + "loss": 0.1738, + "step": 11893 + }, + { + "epoch": 41.01379310344828, + "grad_norm": 0.5130933523178101, + "learning_rate": 2.786712643678161e-05, + "loss": 0.1571, + "step": 11894 + }, + { + "epoch": 41.01724137931034, + "grad_norm": 1.1399163007736206, + "learning_rate": 2.7866666666666665e-05, + "loss": 0.1429, + "step": 11895 + }, + { + "epoch": 41.02068965517241, + "grad_norm": 0.642897367477417, + "learning_rate": 2.7866206896551724e-05, + "loss": 0.1567, + "step": 11896 + }, + { + "epoch": 41.02413793103448, + "grad_norm": 0.725182294845581, + "learning_rate": 2.7865747126436783e-05, + "loss": 0.1543, + "step": 11897 + }, + { + "epoch": 41.02758620689655, + "grad_norm": 0.6084275841712952, + "learning_rate": 2.786528735632184e-05, + "loss": 0.1556, + "step": 11898 + }, + { + "epoch": 41.03103448275862, + "grad_norm": 0.4611421823501587, + "learning_rate": 2.7864827586206897e-05, + "loss": 0.1344, + "step": 11899 + }, + { + "epoch": 41.03448275862069, + "grad_norm": 2.6376240253448486, + "learning_rate": 2.7864367816091952e-05, + "loss": 0.1328, + "step": 11900 + }, + { + "epoch": 41.03793103448276, + "grad_norm": 0.6717047095298767, + "learning_rate": 2.7863908045977014e-05, + "loss": 0.1345, + "step": 11901 + }, + { + "epoch": 41.04137931034483, + "grad_norm": 0.5581512451171875, + "learning_rate": 2.786344827586207e-05, + "loss": 0.1254, + "step": 11902 + }, + { + "epoch": 41.0448275862069, + "grad_norm": 0.896269679069519, + "learning_rate": 2.7862988505747128e-05, + "loss": 0.1384, + "step": 11903 + }, + { + "epoch": 41.04827586206896, + "grad_norm": 0.5928277969360352, + "learning_rate": 2.7862528735632183e-05, + "loss": 0.1307, + "step": 11904 + }, + { + "epoch": 41.05172413793103, + "grad_norm": 1.040685772895813, + "learning_rate": 2.7862068965517242e-05, + "loss": 0.123, + "step": 11905 + }, + { + "epoch": 41.0551724137931, + "grad_norm": 0.8467941284179688, + "learning_rate": 2.78616091954023e-05, + "loss": 0.1053, + "step": 11906 + }, + { + "epoch": 41.05862068965517, + "grad_norm": 0.6045325994491577, + "learning_rate": 2.7861149425287356e-05, + "loss": 0.1054, + "step": 11907 + }, + { + "epoch": 41.06206896551724, + "grad_norm": 1.0747803449630737, + "learning_rate": 2.7860689655172415e-05, + "loss": 0.1122, + "step": 11908 + }, + { + "epoch": 41.06551724137931, + "grad_norm": 1.2228373289108276, + "learning_rate": 2.7860229885057473e-05, + "loss": 0.121, + "step": 11909 + }, + { + "epoch": 41.06896551724138, + "grad_norm": 1.0814549922943115, + "learning_rate": 2.785977011494253e-05, + "loss": 0.0961, + "step": 11910 + }, + { + "epoch": 41.07241379310345, + "grad_norm": 0.8392794728279114, + "learning_rate": 2.7859310344827587e-05, + "loss": 0.1144, + "step": 11911 + }, + { + "epoch": 41.07586206896552, + "grad_norm": 0.5752587914466858, + "learning_rate": 2.7858850574712643e-05, + "loss": 0.0908, + "step": 11912 + }, + { + "epoch": 41.07931034482758, + "grad_norm": 1.0401684045791626, + "learning_rate": 2.7858390804597705e-05, + "loss": 0.1066, + "step": 11913 + }, + { + "epoch": 41.08275862068965, + "grad_norm": 1.4423011541366577, + "learning_rate": 2.785793103448276e-05, + "loss": 0.0978, + "step": 11914 + }, + { + "epoch": 41.08620689655172, + "grad_norm": 1.0593105554580688, + "learning_rate": 2.7857471264367815e-05, + "loss": 0.1423, + "step": 11915 + }, + { + "epoch": 41.08965517241379, + "grad_norm": 0.804160475730896, + "learning_rate": 2.7857011494252874e-05, + "loss": 0.2111, + "step": 11916 + }, + { + "epoch": 41.09310344827586, + "grad_norm": 0.6861268281936646, + "learning_rate": 2.7856551724137933e-05, + "loss": 0.1742, + "step": 11917 + }, + { + "epoch": 41.09655172413793, + "grad_norm": 0.6653047204017639, + "learning_rate": 2.785609195402299e-05, + "loss": 0.17, + "step": 11918 + }, + { + "epoch": 41.1, + "grad_norm": 0.8181291222572327, + "learning_rate": 2.7855632183908047e-05, + "loss": 0.1613, + "step": 11919 + }, + { + "epoch": 41.10344827586207, + "grad_norm": 0.7577361464500427, + "learning_rate": 2.7855172413793102e-05, + "loss": 0.1474, + "step": 11920 + }, + { + "epoch": 41.10689655172414, + "grad_norm": 0.5354645252227783, + "learning_rate": 2.7854712643678164e-05, + "loss": 0.1288, + "step": 11921 + }, + { + "epoch": 41.110344827586204, + "grad_norm": 0.6103320717811584, + "learning_rate": 2.785425287356322e-05, + "loss": 0.1479, + "step": 11922 + }, + { + "epoch": 41.11379310344827, + "grad_norm": 0.7566019892692566, + "learning_rate": 2.7853793103448278e-05, + "loss": 0.1577, + "step": 11923 + }, + { + "epoch": 41.11724137931034, + "grad_norm": 0.615723192691803, + "learning_rate": 2.7853333333333333e-05, + "loss": 0.1439, + "step": 11924 + }, + { + "epoch": 41.12068965517241, + "grad_norm": 0.5227155685424805, + "learning_rate": 2.7852873563218392e-05, + "loss": 0.1342, + "step": 11925 + }, + { + "epoch": 41.12413793103448, + "grad_norm": 0.954628050327301, + "learning_rate": 2.785241379310345e-05, + "loss": 0.1435, + "step": 11926 + }, + { + "epoch": 41.12758620689655, + "grad_norm": 0.9029730558395386, + "learning_rate": 2.7851954022988506e-05, + "loss": 0.1175, + "step": 11927 + }, + { + "epoch": 41.13103448275862, + "grad_norm": 0.6986355781555176, + "learning_rate": 2.785149425287356e-05, + "loss": 0.1281, + "step": 11928 + }, + { + "epoch": 41.13448275862069, + "grad_norm": 0.48720887303352356, + "learning_rate": 2.7851034482758623e-05, + "loss": 0.1301, + "step": 11929 + }, + { + "epoch": 41.13793103448276, + "grad_norm": 0.5560234785079956, + "learning_rate": 2.785057471264368e-05, + "loss": 0.12, + "step": 11930 + }, + { + "epoch": 41.141379310344824, + "grad_norm": 0.6567030549049377, + "learning_rate": 2.7850114942528737e-05, + "loss": 0.111, + "step": 11931 + }, + { + "epoch": 41.144827586206894, + "grad_norm": 1.221776008605957, + "learning_rate": 2.7849655172413792e-05, + "loss": 0.1256, + "step": 11932 + }, + { + "epoch": 41.148275862068964, + "grad_norm": 1.1641077995300293, + "learning_rate": 2.784919540229885e-05, + "loss": 0.1234, + "step": 11933 + }, + { + "epoch": 41.15172413793103, + "grad_norm": 0.5724689364433289, + "learning_rate": 2.784873563218391e-05, + "loss": 0.0992, + "step": 11934 + }, + { + "epoch": 41.1551724137931, + "grad_norm": 1.264756679534912, + "learning_rate": 2.7848275862068965e-05, + "loss": 0.1176, + "step": 11935 + }, + { + "epoch": 41.15862068965517, + "grad_norm": 1.532914161682129, + "learning_rate": 2.7847816091954024e-05, + "loss": 0.1023, + "step": 11936 + }, + { + "epoch": 41.16206896551724, + "grad_norm": 2.1076996326446533, + "learning_rate": 2.7847356321839083e-05, + "loss": 0.098, + "step": 11937 + }, + { + "epoch": 41.16551724137931, + "grad_norm": 0.7160444855690002, + "learning_rate": 2.7846896551724138e-05, + "loss": 0.0981, + "step": 11938 + }, + { + "epoch": 41.16896551724138, + "grad_norm": 0.6355757117271423, + "learning_rate": 2.7846436781609196e-05, + "loss": 0.0868, + "step": 11939 + }, + { + "epoch": 41.172413793103445, + "grad_norm": 1.03900146484375, + "learning_rate": 2.7845977011494252e-05, + "loss": 0.1194, + "step": 11940 + }, + { + "epoch": 41.175862068965515, + "grad_norm": 0.5686364769935608, + "learning_rate": 2.7845517241379314e-05, + "loss": 0.2129, + "step": 11941 + }, + { + "epoch": 41.179310344827584, + "grad_norm": 0.5388920903205872, + "learning_rate": 2.784505747126437e-05, + "loss": 0.1381, + "step": 11942 + }, + { + "epoch": 41.182758620689654, + "grad_norm": 0.6442224979400635, + "learning_rate": 2.7844597701149424e-05, + "loss": 0.1695, + "step": 11943 + }, + { + "epoch": 41.186206896551724, + "grad_norm": 1.1506837606430054, + "learning_rate": 2.7844137931034483e-05, + "loss": 0.1684, + "step": 11944 + }, + { + "epoch": 41.189655172413794, + "grad_norm": 2.0077507495880127, + "learning_rate": 2.7843678160919542e-05, + "loss": 0.1428, + "step": 11945 + }, + { + "epoch": 41.19310344827586, + "grad_norm": 0.631734311580658, + "learning_rate": 2.78432183908046e-05, + "loss": 0.1513, + "step": 11946 + }, + { + "epoch": 41.19655172413793, + "grad_norm": 0.8695550560951233, + "learning_rate": 2.7842758620689656e-05, + "loss": 0.1467, + "step": 11947 + }, + { + "epoch": 41.2, + "grad_norm": 0.5237994194030762, + "learning_rate": 2.784229885057471e-05, + "loss": 0.1535, + "step": 11948 + }, + { + "epoch": 41.203448275862065, + "grad_norm": 0.706161618232727, + "learning_rate": 2.7841839080459773e-05, + "loss": 0.1251, + "step": 11949 + }, + { + "epoch": 41.206896551724135, + "grad_norm": 0.7336849570274353, + "learning_rate": 2.784137931034483e-05, + "loss": 0.132, + "step": 11950 + }, + { + "epoch": 41.210344827586205, + "grad_norm": 0.5422658324241638, + "learning_rate": 2.7840919540229887e-05, + "loss": 0.121, + "step": 11951 + }, + { + "epoch": 41.213793103448275, + "grad_norm": 0.6456677913665771, + "learning_rate": 2.7840459770114942e-05, + "loss": 0.1297, + "step": 11952 + }, + { + "epoch": 41.217241379310344, + "grad_norm": 0.7811408638954163, + "learning_rate": 2.784e-05, + "loss": 0.1219, + "step": 11953 + }, + { + "epoch": 41.220689655172414, + "grad_norm": 0.8268111348152161, + "learning_rate": 2.783954022988506e-05, + "loss": 0.1165, + "step": 11954 + }, + { + "epoch": 41.224137931034484, + "grad_norm": 0.7123024463653564, + "learning_rate": 2.7839080459770115e-05, + "loss": 0.1258, + "step": 11955 + }, + { + "epoch": 41.227586206896554, + "grad_norm": 0.5659504532814026, + "learning_rate": 2.783862068965517e-05, + "loss": 0.1172, + "step": 11956 + }, + { + "epoch": 41.23103448275862, + "grad_norm": 0.993030309677124, + "learning_rate": 2.7838160919540232e-05, + "loss": 0.1119, + "step": 11957 + }, + { + "epoch": 41.234482758620686, + "grad_norm": 0.6471213698387146, + "learning_rate": 2.7837701149425288e-05, + "loss": 0.1057, + "step": 11958 + }, + { + "epoch": 41.237931034482756, + "grad_norm": 0.6332545876502991, + "learning_rate": 2.7837241379310346e-05, + "loss": 0.1038, + "step": 11959 + }, + { + "epoch": 41.241379310344826, + "grad_norm": 0.6505969762802124, + "learning_rate": 2.7836781609195402e-05, + "loss": 0.1046, + "step": 11960 + }, + { + "epoch": 41.244827586206895, + "grad_norm": 0.7660714387893677, + "learning_rate": 2.783632183908046e-05, + "loss": 0.1012, + "step": 11961 + }, + { + "epoch": 41.248275862068965, + "grad_norm": 0.619316041469574, + "learning_rate": 2.783586206896552e-05, + "loss": 0.079, + "step": 11962 + }, + { + "epoch": 41.251724137931035, + "grad_norm": 0.8008268475532532, + "learning_rate": 2.7835402298850574e-05, + "loss": 0.1011, + "step": 11963 + }, + { + "epoch": 41.255172413793105, + "grad_norm": 0.8644103407859802, + "learning_rate": 2.7834942528735633e-05, + "loss": 0.0878, + "step": 11964 + }, + { + "epoch": 41.258620689655174, + "grad_norm": 1.2630164623260498, + "learning_rate": 2.7834482758620692e-05, + "loss": 0.1432, + "step": 11965 + }, + { + "epoch": 41.262068965517244, + "grad_norm": 0.7756518125534058, + "learning_rate": 2.7834022988505747e-05, + "loss": 0.2246, + "step": 11966 + }, + { + "epoch": 41.265517241379314, + "grad_norm": 0.521003246307373, + "learning_rate": 2.7833563218390806e-05, + "loss": 0.1681, + "step": 11967 + }, + { + "epoch": 41.26896551724138, + "grad_norm": 0.6028465628623962, + "learning_rate": 2.783310344827586e-05, + "loss": 0.1795, + "step": 11968 + }, + { + "epoch": 41.272413793103446, + "grad_norm": 0.5923756957054138, + "learning_rate": 2.7832643678160923e-05, + "loss": 0.1676, + "step": 11969 + }, + { + "epoch": 41.275862068965516, + "grad_norm": 0.8886812925338745, + "learning_rate": 2.783218390804598e-05, + "loss": 0.1344, + "step": 11970 + }, + { + "epoch": 41.279310344827586, + "grad_norm": 1.0262154340744019, + "learning_rate": 2.7831724137931034e-05, + "loss": 0.1437, + "step": 11971 + }, + { + "epoch": 41.282758620689656, + "grad_norm": 1.1608253717422485, + "learning_rate": 2.7831264367816092e-05, + "loss": 0.1671, + "step": 11972 + }, + { + "epoch": 41.286206896551725, + "grad_norm": 0.9611548185348511, + "learning_rate": 2.783080459770115e-05, + "loss": 0.1403, + "step": 11973 + }, + { + "epoch": 41.289655172413795, + "grad_norm": 0.69005286693573, + "learning_rate": 2.783034482758621e-05, + "loss": 0.1386, + "step": 11974 + }, + { + "epoch": 41.293103448275865, + "grad_norm": 0.6114193797111511, + "learning_rate": 2.7829885057471265e-05, + "loss": 0.1308, + "step": 11975 + }, + { + "epoch": 41.296551724137935, + "grad_norm": 0.7856693863868713, + "learning_rate": 2.782942528735632e-05, + "loss": 0.1193, + "step": 11976 + }, + { + "epoch": 41.3, + "grad_norm": 0.793415367603302, + "learning_rate": 2.7828965517241382e-05, + "loss": 0.1382, + "step": 11977 + }, + { + "epoch": 41.30344827586207, + "grad_norm": 0.467740923166275, + "learning_rate": 2.7828505747126438e-05, + "loss": 0.111, + "step": 11978 + }, + { + "epoch": 41.30689655172414, + "grad_norm": 4.855302810668945, + "learning_rate": 2.7828045977011496e-05, + "loss": 0.1012, + "step": 11979 + }, + { + "epoch": 41.310344827586206, + "grad_norm": 0.6347041130065918, + "learning_rate": 2.782758620689655e-05, + "loss": 0.118, + "step": 11980 + }, + { + "epoch": 41.313793103448276, + "grad_norm": 0.6863338351249695, + "learning_rate": 2.782712643678161e-05, + "loss": 0.1285, + "step": 11981 + }, + { + "epoch": 41.317241379310346, + "grad_norm": 0.7415573000907898, + "learning_rate": 2.782666666666667e-05, + "loss": 0.1262, + "step": 11982 + }, + { + "epoch": 41.320689655172416, + "grad_norm": 2.5014076232910156, + "learning_rate": 2.7826206896551724e-05, + "loss": 0.1116, + "step": 11983 + }, + { + "epoch": 41.324137931034485, + "grad_norm": 0.9302916526794434, + "learning_rate": 2.782574712643678e-05, + "loss": 0.0943, + "step": 11984 + }, + { + "epoch": 41.327586206896555, + "grad_norm": 0.982127845287323, + "learning_rate": 2.7825287356321842e-05, + "loss": 0.1, + "step": 11985 + }, + { + "epoch": 41.33103448275862, + "grad_norm": 3.882004976272583, + "learning_rate": 2.7824827586206897e-05, + "loss": 0.1085, + "step": 11986 + }, + { + "epoch": 41.33448275862069, + "grad_norm": 0.8281375169754028, + "learning_rate": 2.7824367816091956e-05, + "loss": 0.0916, + "step": 11987 + }, + { + "epoch": 41.33793103448276, + "grad_norm": 0.8988734483718872, + "learning_rate": 2.782390804597701e-05, + "loss": 0.0929, + "step": 11988 + }, + { + "epoch": 41.34137931034483, + "grad_norm": 0.8919851779937744, + "learning_rate": 2.782344827586207e-05, + "loss": 0.0939, + "step": 11989 + }, + { + "epoch": 41.3448275862069, + "grad_norm": 1.2896007299423218, + "learning_rate": 2.782298850574713e-05, + "loss": 0.1532, + "step": 11990 + }, + { + "epoch": 41.34827586206897, + "grad_norm": 0.6805572509765625, + "learning_rate": 2.7822528735632184e-05, + "loss": 0.2117, + "step": 11991 + }, + { + "epoch": 41.351724137931036, + "grad_norm": 0.5688475370407104, + "learning_rate": 2.7822068965517242e-05, + "loss": 0.1796, + "step": 11992 + }, + { + "epoch": 41.355172413793106, + "grad_norm": 1.2013331651687622, + "learning_rate": 2.78216091954023e-05, + "loss": 0.1571, + "step": 11993 + }, + { + "epoch": 41.358620689655176, + "grad_norm": 1.2206281423568726, + "learning_rate": 2.7821149425287356e-05, + "loss": 0.1494, + "step": 11994 + }, + { + "epoch": 41.36206896551724, + "grad_norm": 0.5915943384170532, + "learning_rate": 2.7820689655172415e-05, + "loss": 0.1678, + "step": 11995 + }, + { + "epoch": 41.36551724137931, + "grad_norm": 0.6025797128677368, + "learning_rate": 2.782022988505747e-05, + "loss": 0.1413, + "step": 11996 + }, + { + "epoch": 41.36896551724138, + "grad_norm": 0.7588165402412415, + "learning_rate": 2.7819770114942532e-05, + "loss": 0.1628, + "step": 11997 + }, + { + "epoch": 41.37241379310345, + "grad_norm": 0.5640226602554321, + "learning_rate": 2.7819310344827588e-05, + "loss": 0.1445, + "step": 11998 + }, + { + "epoch": 41.37586206896552, + "grad_norm": 1.794654369354248, + "learning_rate": 2.7818850574712643e-05, + "loss": 0.1355, + "step": 11999 + }, + { + "epoch": 41.37931034482759, + "grad_norm": 0.6887781620025635, + "learning_rate": 2.78183908045977e-05, + "loss": 0.1455, + "step": 12000 + }, + { + "epoch": 41.37931034482759, + "eval_cer": 0.1347972885899121, + "eval_loss": 0.35223913192749023, + "eval_runtime": 18.6328, + "eval_samples_per_second": 49.751, + "eval_steps_per_second": 0.161, + "eval_wer": 0.3079710144927536, + "step": 12000 } ], "logging_steps": 1.0, @@ -77131,7 +84141,7 @@ "early_stopping_threshold": 0.0 }, "attributes": { - "early_stopping_patience_counter": 3 + "early_stopping_patience_counter": 4 } }, "TrainerControl": { @@ -77145,7 +84155,7 @@ "attributes": {} } }, - "total_flos": 4.1495567357267096e+20, + "total_flos": 4.527891266461646e+20, "train_batch_size": 160, "trial_name": null, "trial_params": null