diff --git "a/WORLD_MODEL_POSTTRAIN/trainer_state.json" "b/WORLD_MODEL_POSTTRAIN/trainer_state.json" new file mode 100644--- /dev/null +++ "b/WORLD_MODEL_POSTTRAIN/trainer_state.json" @@ -0,0 +1,35042 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.139802631578948, + "eval_steps": 500, + "global_step": 50000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0010279605263157894, + "grad_norm": 5.212570460732926, + "learning_rate": 1.6000000000000003e-05, + "loss": 8.4432, + "step": 10 + }, + { + "epoch": 0.002055921052631579, + "grad_norm": 2.8157520566125185, + "learning_rate": 3.2000000000000005e-05, + "loss": 8.2432, + "step": 20 + }, + { + "epoch": 0.0030838815789473685, + "grad_norm": 6.763994192164909, + "learning_rate": 4.8e-05, + "loss": 8.2314, + "step": 30 + }, + { + "epoch": 0.004111842105263158, + "grad_norm": 2.5055066497476974, + "learning_rate": 6.400000000000001e-05, + "loss": 8.1243, + "step": 40 + }, + { + "epoch": 0.005139802631578948, + "grad_norm": 8.057530490629365, + "learning_rate": 8e-05, + "loss": 7.7036, + "step": 50 + }, + { + "epoch": 0.006167763157894737, + "grad_norm": 4.663557424625884, + "learning_rate": 7.999997936961541e-05, + "loss": 7.408, + "step": 60 + }, + { + "epoch": 0.007195723684210526, + "grad_norm": 4.929282774838182, + "learning_rate": 7.999991747848433e-05, + "loss": 6.8576, + "step": 70 + }, + { + "epoch": 0.008223684210526315, + "grad_norm": 5.339411351766599, + "learning_rate": 7.999981432667488e-05, + "loss": 6.4473, + "step": 80 + }, + { + "epoch": 0.009251644736842105, + "grad_norm": 4.260890471111822, + "learning_rate": 7.999966991430051e-05, + "loss": 6.1773, + "step": 90 + }, + { + "epoch": 0.010279605263157895, + "grad_norm": 4.117743148804496, + "learning_rate": 7.999948424152016e-05, + "loss": 5.9633, + "step": 100 + }, + { + "epoch": 0.011307565789473685, + "grad_norm": 4.220875296552004, + "learning_rate": 7.999925730853811e-05, + "loss": 5.8239, + "step": 110 + }, + { + "epoch": 0.012335526315789474, + "grad_norm": 4.624710486346094, + "learning_rate": 7.999898911560404e-05, + "loss": 5.7883, + "step": 120 + }, + { + "epoch": 0.013363486842105263, + "grad_norm": 2.631039619643356, + "learning_rate": 7.999867966301306e-05, + "loss": 5.795, + "step": 130 + }, + { + "epoch": 0.014391447368421052, + "grad_norm": 5.1377922995918315, + "learning_rate": 7.999832895110564e-05, + "loss": 5.661, + "step": 140 + }, + { + "epoch": 0.015419407894736841, + "grad_norm": 3.30012706165185, + "learning_rate": 7.999793698026768e-05, + "loss": 5.6009, + "step": 150 + }, + { + "epoch": 0.01644736842105263, + "grad_norm": 6.934663209775065, + "learning_rate": 7.999750375093043e-05, + "loss": 5.4214, + "step": 160 + }, + { + "epoch": 0.01747532894736842, + "grad_norm": 6.52205964116326, + "learning_rate": 7.99970292635706e-05, + "loss": 5.3802, + "step": 170 + }, + { + "epoch": 0.01850328947368421, + "grad_norm": 4.4619325506771155, + "learning_rate": 7.999651351871023e-05, + "loss": 5.2281, + "step": 180 + }, + { + "epoch": 0.01953125, + "grad_norm": 4.81260066732172, + "learning_rate": 7.999595651691683e-05, + "loss": 5.2349, + "step": 190 + }, + { + "epoch": 0.02055921052631579, + "grad_norm": 5.0864775058943055, + "learning_rate": 7.999535825880323e-05, + "loss": 5.0524, + "step": 200 + }, + { + "epoch": 0.02158717105263158, + "grad_norm": 3.0240627696975473, + "learning_rate": 7.999471874502768e-05, + "loss": 5.0825, + "step": 210 + 
}, + { + "epoch": 0.02261513157894737, + "grad_norm": 4.427515536883603, + "learning_rate": 7.999403797629387e-05, + "loss": 4.978, + "step": 220 + }, + { + "epoch": 0.023643092105263157, + "grad_norm": 2.9649688734508204, + "learning_rate": 7.999331595335078e-05, + "loss": 4.9619, + "step": 230 + }, + { + "epoch": 0.024671052631578948, + "grad_norm": 5.118119249618976, + "learning_rate": 7.99925526769929e-05, + "loss": 4.9893, + "step": 240 + }, + { + "epoch": 0.025699013157894735, + "grad_norm": 5.749523640678988, + "learning_rate": 7.999174814806002e-05, + "loss": 4.8205, + "step": 250 + }, + { + "epoch": 0.026726973684210526, + "grad_norm": 4.030768550541227, + "learning_rate": 7.999090236743736e-05, + "loss": 4.7835, + "step": 260 + }, + { + "epoch": 0.027754934210526317, + "grad_norm": 5.549799102721323, + "learning_rate": 7.999001533605553e-05, + "loss": 4.6622, + "step": 270 + }, + { + "epoch": 0.028782894736842105, + "grad_norm": 4.237524459003088, + "learning_rate": 7.998908705489051e-05, + "loss": 4.6806, + "step": 280 + }, + { + "epoch": 0.029810855263157895, + "grad_norm": 4.491466484256475, + "learning_rate": 7.998811752496369e-05, + "loss": 4.7014, + "step": 290 + }, + { + "epoch": 0.030838815789473683, + "grad_norm": 6.71718476271679, + "learning_rate": 7.99871067473418e-05, + "loss": 4.6525, + "step": 300 + }, + { + "epoch": 0.03186677631578947, + "grad_norm": 3.3132133435551823, + "learning_rate": 7.998605472313704e-05, + "loss": 4.6024, + "step": 310 + }, + { + "epoch": 0.03289473684210526, + "grad_norm": 4.782848016307305, + "learning_rate": 7.998496145350687e-05, + "loss": 4.6301, + "step": 320 + }, + { + "epoch": 0.03392269736842105, + "grad_norm": 3.4227664430839226, + "learning_rate": 7.998382693965425e-05, + "loss": 4.6435, + "step": 330 + }, + { + "epoch": 0.03495065789473684, + "grad_norm": 2.9481904314975202, + "learning_rate": 7.998265118282745e-05, + "loss": 4.6249, + "step": 340 + }, + { + "epoch": 0.035978618421052634, + "grad_norm": 3.7505176186950635, + "learning_rate": 7.998143418432015e-05, + "loss": 4.5096, + "step": 350 + }, + { + "epoch": 0.03700657894736842, + "grad_norm": 4.859300815224532, + "learning_rate": 7.99801759454714e-05, + "loss": 4.5699, + "step": 360 + }, + { + "epoch": 0.03803453947368421, + "grad_norm": 3.6631046824533717, + "learning_rate": 7.99788764676656e-05, + "loss": 4.6175, + "step": 370 + }, + { + "epoch": 0.0390625, + "grad_norm": 2.869856836556329, + "learning_rate": 7.997753575233259e-05, + "loss": 4.5593, + "step": 380 + }, + { + "epoch": 0.04009046052631579, + "grad_norm": 3.3060194717544586, + "learning_rate": 7.99761538009475e-05, + "loss": 4.4659, + "step": 390 + }, + { + "epoch": 0.04111842105263158, + "grad_norm": 3.417354157914101, + "learning_rate": 7.997473061503091e-05, + "loss": 4.5522, + "step": 400 + }, + { + "epoch": 0.042146381578947366, + "grad_norm": 2.2520557125443332, + "learning_rate": 7.99732661961487e-05, + "loss": 4.501, + "step": 410 + }, + { + "epoch": 0.04317434210526316, + "grad_norm": 4.17492769829739, + "learning_rate": 7.997176054591217e-05, + "loss": 4.5468, + "step": 420 + }, + { + "epoch": 0.04420230263157895, + "grad_norm": 2.6140596288287896, + "learning_rate": 7.997021366597799e-05, + "loss": 4.4789, + "step": 430 + }, + { + "epoch": 0.04523026315789474, + "grad_norm": 3.6226642581406123, + "learning_rate": 7.996862555804811e-05, + "loss": 4.477, + "step": 440 + }, + { + "epoch": 0.04625822368421053, + "grad_norm": 3.219567056302648, + "learning_rate": 7.996699622386996e-05, + "loss": 
4.4705, + "step": 450 + }, + { + "epoch": 0.047286184210526314, + "grad_norm": 3.499765252006602, + "learning_rate": 7.996532566523626e-05, + "loss": 4.3973, + "step": 460 + }, + { + "epoch": 0.048314144736842105, + "grad_norm": 2.8620111372842287, + "learning_rate": 7.996361388398509e-05, + "loss": 4.3823, + "step": 470 + }, + { + "epoch": 0.049342105263157895, + "grad_norm": 4.379921678861834, + "learning_rate": 7.996186088199991e-05, + "loss": 4.4736, + "step": 480 + }, + { + "epoch": 0.050370065789473686, + "grad_norm": 2.724766268024204, + "learning_rate": 7.996006666120955e-05, + "loss": 4.3828, + "step": 490 + }, + { + "epoch": 0.05139802631578947, + "grad_norm": 1.7205182776174517, + "learning_rate": 7.995823122358812e-05, + "loss": 4.4109, + "step": 500 + }, + { + "epoch": 0.05242598684210526, + "grad_norm": 4.28486981473459, + "learning_rate": 7.995635457115517e-05, + "loss": 4.4195, + "step": 510 + }, + { + "epoch": 0.05345394736842105, + "grad_norm": 2.7405772467077294, + "learning_rate": 7.995443670597554e-05, + "loss": 4.3025, + "step": 520 + }, + { + "epoch": 0.05448190789473684, + "grad_norm": 3.6434210653349104, + "learning_rate": 7.995247763015943e-05, + "loss": 4.424, + "step": 530 + }, + { + "epoch": 0.055509868421052634, + "grad_norm": 5.136596214080284, + "learning_rate": 7.995047734586239e-05, + "loss": 4.3398, + "step": 540 + }, + { + "epoch": 0.05653782894736842, + "grad_norm": 2.801081046814382, + "learning_rate": 7.994843585528531e-05, + "loss": 4.4218, + "step": 550 + }, + { + "epoch": 0.05756578947368421, + "grad_norm": 4.438723328364429, + "learning_rate": 7.994635316067441e-05, + "loss": 4.3546, + "step": 560 + }, + { + "epoch": 0.05859375, + "grad_norm": 2.6166605404662646, + "learning_rate": 7.994422926432125e-05, + "loss": 4.3592, + "step": 570 + }, + { + "epoch": 0.05962171052631579, + "grad_norm": 3.059024941169693, + "learning_rate": 7.994206416856274e-05, + "loss": 4.387, + "step": 580 + }, + { + "epoch": 0.06064967105263158, + "grad_norm": 2.494929703389159, + "learning_rate": 7.99398578757811e-05, + "loss": 4.3729, + "step": 590 + }, + { + "epoch": 0.061677631578947366, + "grad_norm": 3.3160473524883978, + "learning_rate": 7.993761038840387e-05, + "loss": 4.2937, + "step": 600 + }, + { + "epoch": 0.06270559210526316, + "grad_norm": 2.7230818865928526, + "learning_rate": 7.993532170890396e-05, + "loss": 4.2808, + "step": 610 + }, + { + "epoch": 0.06373355263157894, + "grad_norm": 2.2682209795786936, + "learning_rate": 7.993299183979955e-05, + "loss": 4.3104, + "step": 620 + }, + { + "epoch": 0.06476151315789473, + "grad_norm": 3.3202836338359187, + "learning_rate": 7.993062078365417e-05, + "loss": 4.334, + "step": 630 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 4.1281321800568, + "learning_rate": 7.992820854307667e-05, + "loss": 4.3107, + "step": 640 + }, + { + "epoch": 0.06681743421052631, + "grad_norm": 2.515956311888201, + "learning_rate": 7.992575512072122e-05, + "loss": 4.3195, + "step": 650 + }, + { + "epoch": 0.0678453947368421, + "grad_norm": 3.336698851708039, + "learning_rate": 7.992326051928727e-05, + "loss": 4.3479, + "step": 660 + }, + { + "epoch": 0.0688733552631579, + "grad_norm": 3.945684011360373, + "learning_rate": 7.992072474151957e-05, + "loss": 4.3052, + "step": 670 + }, + { + "epoch": 0.06990131578947369, + "grad_norm": 2.1107075191109947, + "learning_rate": 7.991814779020827e-05, + "loss": 4.3281, + "step": 680 + }, + { + "epoch": 0.07092927631578948, + "grad_norm": 3.6504951245743387, + "learning_rate": 
7.991552966818873e-05, + "loss": 4.3568, + "step": 690 + }, + { + "epoch": 0.07195723684210527, + "grad_norm": 3.6876003445695593, + "learning_rate": 7.991287037834159e-05, + "loss": 4.3582, + "step": 700 + }, + { + "epoch": 0.07298519736842106, + "grad_norm": 3.2196941287322267, + "learning_rate": 7.991016992359291e-05, + "loss": 4.2184, + "step": 710 + }, + { + "epoch": 0.07401315789473684, + "grad_norm": 2.348521855324694, + "learning_rate": 7.990742830691391e-05, + "loss": 4.2251, + "step": 720 + }, + { + "epoch": 0.07504111842105263, + "grad_norm": 4.1600363557400675, + "learning_rate": 7.990464553132117e-05, + "loss": 4.3212, + "step": 730 + }, + { + "epoch": 0.07606907894736842, + "grad_norm": 2.046534153172162, + "learning_rate": 7.990182159987654e-05, + "loss": 4.2078, + "step": 740 + }, + { + "epoch": 0.07709703947368421, + "grad_norm": 4.043974437115902, + "learning_rate": 7.989895651568717e-05, + "loss": 4.2851, + "step": 750 + }, + { + "epoch": 0.078125, + "grad_norm": 4.398870036944569, + "learning_rate": 7.989605028190547e-05, + "loss": 4.2137, + "step": 760 + }, + { + "epoch": 0.07915296052631579, + "grad_norm": 3.8909648455755024, + "learning_rate": 7.98931029017291e-05, + "loss": 4.2421, + "step": 770 + }, + { + "epoch": 0.08018092105263158, + "grad_norm": 2.23650765177328, + "learning_rate": 7.989011437840109e-05, + "loss": 4.2205, + "step": 780 + }, + { + "epoch": 0.08120888157894737, + "grad_norm": 2.5727864797502615, + "learning_rate": 7.988708471520961e-05, + "loss": 4.2789, + "step": 790 + }, + { + "epoch": 0.08223684210526316, + "grad_norm": 3.625626871561936, + "learning_rate": 7.98840139154882e-05, + "loss": 4.2635, + "step": 800 + }, + { + "epoch": 0.08326480263157894, + "grad_norm": 3.408904834639838, + "learning_rate": 7.98809019826156e-05, + "loss": 4.2691, + "step": 810 + }, + { + "epoch": 0.08429276315789473, + "grad_norm": 2.6540992556584464, + "learning_rate": 7.987774892001583e-05, + "loss": 4.2281, + "step": 820 + }, + { + "epoch": 0.08532072368421052, + "grad_norm": 2.5824388883437357, + "learning_rate": 7.987455473115819e-05, + "loss": 4.1892, + "step": 830 + }, + { + "epoch": 0.08634868421052631, + "grad_norm": 3.126844548396032, + "learning_rate": 7.987131941955716e-05, + "loss": 4.2629, + "step": 840 + }, + { + "epoch": 0.0873766447368421, + "grad_norm": 3.549719791367905, + "learning_rate": 7.986804298877256e-05, + "loss": 4.2228, + "step": 850 + }, + { + "epoch": 0.0884046052631579, + "grad_norm": 3.036633459747795, + "learning_rate": 7.986472544240937e-05, + "loss": 4.2794, + "step": 860 + }, + { + "epoch": 0.08943256578947369, + "grad_norm": 1.6963440178472013, + "learning_rate": 7.986136678411788e-05, + "loss": 4.2032, + "step": 870 + }, + { + "epoch": 0.09046052631578948, + "grad_norm": 3.3417550669049727, + "learning_rate": 7.985796701759353e-05, + "loss": 4.2971, + "step": 880 + }, + { + "epoch": 0.09148848684210527, + "grad_norm": 2.543710907956731, + "learning_rate": 7.985452614657707e-05, + "loss": 4.2972, + "step": 890 + }, + { + "epoch": 0.09251644736842106, + "grad_norm": 2.0945150771009327, + "learning_rate": 7.985104417485445e-05, + "loss": 4.2079, + "step": 900 + }, + { + "epoch": 0.09354440789473684, + "grad_norm": 2.4403230995693477, + "learning_rate": 7.984752110625682e-05, + "loss": 4.146, + "step": 910 + }, + { + "epoch": 0.09457236842105263, + "grad_norm": 2.393705888545912, + "learning_rate": 7.984395694466057e-05, + "loss": 4.201, + "step": 920 + }, + { + "epoch": 0.09560032894736842, + "grad_norm": 2.691354870132756, + 
"learning_rate": 7.984035169398732e-05, + "loss": 4.2631, + "step": 930 + }, + { + "epoch": 0.09662828947368421, + "grad_norm": 2.6214542965278222, + "learning_rate": 7.983670535820388e-05, + "loss": 4.2621, + "step": 940 + }, + { + "epoch": 0.09765625, + "grad_norm": 3.697118725001978, + "learning_rate": 7.983301794132224e-05, + "loss": 4.1346, + "step": 950 + }, + { + "epoch": 0.09868421052631579, + "grad_norm": 1.7124398659499342, + "learning_rate": 7.982928944739964e-05, + "loss": 4.1756, + "step": 960 + }, + { + "epoch": 0.09971217105263158, + "grad_norm": 2.473388659825813, + "learning_rate": 7.982551988053848e-05, + "loss": 4.2391, + "step": 970 + }, + { + "epoch": 0.10074013157894737, + "grad_norm": 2.4116072035468834, + "learning_rate": 7.982170924488639e-05, + "loss": 4.1885, + "step": 980 + }, + { + "epoch": 0.10176809210526316, + "grad_norm": 2.4452353760863415, + "learning_rate": 7.981785754463613e-05, + "loss": 4.1365, + "step": 990 + }, + { + "epoch": 0.10279605263157894, + "grad_norm": 1.782794817904515, + "learning_rate": 7.98139647840257e-05, + "loss": 4.2003, + "step": 1000 + }, + { + "epoch": 0.10382401315789473, + "grad_norm": 2.9429470449304955, + "learning_rate": 7.981003096733826e-05, + "loss": 4.2333, + "step": 1010 + }, + { + "epoch": 0.10485197368421052, + "grad_norm": 2.664717428746834, + "learning_rate": 7.980605609890211e-05, + "loss": 4.1844, + "step": 1020 + }, + { + "epoch": 0.10587993421052631, + "grad_norm": 1.8671437022926185, + "learning_rate": 7.980204018309076e-05, + "loss": 4.1742, + "step": 1030 + }, + { + "epoch": 0.1069078947368421, + "grad_norm": 2.2227392734246885, + "learning_rate": 7.979798322432288e-05, + "loss": 4.1029, + "step": 1040 + }, + { + "epoch": 0.1079358552631579, + "grad_norm": 2.6117456140942363, + "learning_rate": 7.979388522706228e-05, + "loss": 4.1497, + "step": 1050 + }, + { + "epoch": 0.10896381578947369, + "grad_norm": 1.658910881122967, + "learning_rate": 7.978974619581795e-05, + "loss": 4.1432, + "step": 1060 + }, + { + "epoch": 0.10999177631578948, + "grad_norm": 2.811442112649147, + "learning_rate": 7.9785566135144e-05, + "loss": 4.1712, + "step": 1070 + }, + { + "epoch": 0.11101973684210527, + "grad_norm": 3.8983073547289204, + "learning_rate": 7.978134504963969e-05, + "loss": 4.1521, + "step": 1080 + }, + { + "epoch": 0.11204769736842106, + "grad_norm": 2.078410078873545, + "learning_rate": 7.977708294394943e-05, + "loss": 4.193, + "step": 1090 + }, + { + "epoch": 0.11307565789473684, + "grad_norm": 2.2998884505712014, + "learning_rate": 7.977277982276277e-05, + "loss": 4.2058, + "step": 1100 + }, + { + "epoch": 0.11410361842105263, + "grad_norm": 2.0618093085484155, + "learning_rate": 7.976843569081437e-05, + "loss": 4.1106, + "step": 1110 + }, + { + "epoch": 0.11513157894736842, + "grad_norm": 1.7738918445048943, + "learning_rate": 7.976405055288401e-05, + "loss": 4.1594, + "step": 1120 + }, + { + "epoch": 0.11615953947368421, + "grad_norm": 2.3362999064022216, + "learning_rate": 7.975962441379664e-05, + "loss": 4.2306, + "step": 1130 + }, + { + "epoch": 0.1171875, + "grad_norm": 2.437271205238958, + "learning_rate": 7.975515727842227e-05, + "loss": 4.1525, + "step": 1140 + }, + { + "epoch": 0.11821546052631579, + "grad_norm": 2.8258556724242077, + "learning_rate": 7.9750649151676e-05, + "loss": 4.0458, + "step": 1150 + }, + { + "epoch": 0.11924342105263158, + "grad_norm": 2.2032147111956997, + "learning_rate": 7.97461000385181e-05, + "loss": 4.2122, + "step": 1160 + }, + { + "epoch": 0.12027138157894737, + 
"grad_norm": 1.5595038106614325, + "learning_rate": 7.974150994395387e-05, + "loss": 4.1009, + "step": 1170 + }, + { + "epoch": 0.12129934210526316, + "grad_norm": 3.271181480847501, + "learning_rate": 7.973687887303377e-05, + "loss": 4.1299, + "step": 1180 + }, + { + "epoch": 0.12232730263157894, + "grad_norm": 2.160849378972245, + "learning_rate": 7.973220683085328e-05, + "loss": 4.0878, + "step": 1190 + }, + { + "epoch": 0.12335526315789473, + "grad_norm": 3.8405080630050543, + "learning_rate": 7.972749382255299e-05, + "loss": 4.1741, + "step": 1200 + }, + { + "epoch": 0.12438322368421052, + "grad_norm": 2.1286547031004672, + "learning_rate": 7.972273985331858e-05, + "loss": 4.1259, + "step": 1210 + }, + { + "epoch": 0.12541118421052633, + "grad_norm": 1.7679328351791506, + "learning_rate": 7.971794492838076e-05, + "loss": 4.1514, + "step": 1220 + }, + { + "epoch": 0.12643914473684212, + "grad_norm": 1.607910823503316, + "learning_rate": 7.971310905301534e-05, + "loss": 4.0681, + "step": 1230 + }, + { + "epoch": 0.12746710526315788, + "grad_norm": 2.1109422301063656, + "learning_rate": 7.970823223254315e-05, + "loss": 4.1906, + "step": 1240 + }, + { + "epoch": 0.12849506578947367, + "grad_norm": 3.523365502139953, + "learning_rate": 7.970331447233013e-05, + "loss": 4.1672, + "step": 1250 + }, + { + "epoch": 0.12952302631578946, + "grad_norm": 2.21857290501423, + "learning_rate": 7.969835577778719e-05, + "loss": 4.0777, + "step": 1260 + }, + { + "epoch": 0.13055098684210525, + "grad_norm": 3.2389751652574903, + "learning_rate": 7.969335615437036e-05, + "loss": 4.0993, + "step": 1270 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 3.1397978798998407, + "learning_rate": 7.968831560758061e-05, + "loss": 4.1559, + "step": 1280 + }, + { + "epoch": 0.13260690789473684, + "grad_norm": 2.994294303837003, + "learning_rate": 7.968323414296403e-05, + "loss": 4.1145, + "step": 1290 + }, + { + "epoch": 0.13363486842105263, + "grad_norm": 2.025030150012457, + "learning_rate": 7.967811176611167e-05, + "loss": 4.1895, + "step": 1300 + }, + { + "epoch": 0.13466282894736842, + "grad_norm": 3.1234468796757615, + "learning_rate": 7.967294848265963e-05, + "loss": 4.1364, + "step": 1310 + }, + { + "epoch": 0.1356907894736842, + "grad_norm": 1.9747463522168796, + "learning_rate": 7.9667744298289e-05, + "loss": 4.1061, + "step": 1320 + }, + { + "epoch": 0.13671875, + "grad_norm": 1.9002149761451745, + "learning_rate": 7.966249921872586e-05, + "loss": 4.0371, + "step": 1330 + }, + { + "epoch": 0.1377467105263158, + "grad_norm": 2.0860244789991422, + "learning_rate": 7.965721324974132e-05, + "loss": 4.0786, + "step": 1340 + }, + { + "epoch": 0.13877467105263158, + "grad_norm": 3.4460654614519175, + "learning_rate": 7.965188639715147e-05, + "loss": 4.1581, + "step": 1350 + }, + { + "epoch": 0.13980263157894737, + "grad_norm": 1.8600317171940632, + "learning_rate": 7.964651866681737e-05, + "loss": 4.0561, + "step": 1360 + }, + { + "epoch": 0.14083059210526316, + "grad_norm": 1.5645501229342134, + "learning_rate": 7.964111006464505e-05, + "loss": 4.0973, + "step": 1370 + }, + { + "epoch": 0.14185855263157895, + "grad_norm": 3.4348134032628868, + "learning_rate": 7.963566059658555e-05, + "loss": 4.111, + "step": 1380 + }, + { + "epoch": 0.14288651315789475, + "grad_norm": 2.00749518710993, + "learning_rate": 7.963017026863484e-05, + "loss": 4.0812, + "step": 1390 + }, + { + "epoch": 0.14391447368421054, + "grad_norm": 3.142707855221201, + "learning_rate": 7.962463908683386e-05, + "loss": 4.1724, + "step": 
1400 + }, + { + "epoch": 0.14494243421052633, + "grad_norm": 2.291097836267391, + "learning_rate": 7.96190670572685e-05, + "loss": 4.1666, + "step": 1410 + }, + { + "epoch": 0.14597039473684212, + "grad_norm": 1.7141654165384237, + "learning_rate": 7.961345418606957e-05, + "loss": 4.1613, + "step": 1420 + }, + { + "epoch": 0.14699835526315788, + "grad_norm": 1.8636737146250548, + "learning_rate": 7.960780047941287e-05, + "loss": 4.1191, + "step": 1430 + }, + { + "epoch": 0.14802631578947367, + "grad_norm": 2.56280041306065, + "learning_rate": 7.960210594351908e-05, + "loss": 4.0492, + "step": 1440 + }, + { + "epoch": 0.14905427631578946, + "grad_norm": 2.3327684626986716, + "learning_rate": 7.959637058465384e-05, + "loss": 4.1065, + "step": 1450 + }, + { + "epoch": 0.15008223684210525, + "grad_norm": 2.3619432429084632, + "learning_rate": 7.959059440912769e-05, + "loss": 4.0454, + "step": 1460 + }, + { + "epoch": 0.15111019736842105, + "grad_norm": 2.8350891021235967, + "learning_rate": 7.958477742329607e-05, + "loss": 4.0697, + "step": 1470 + }, + { + "epoch": 0.15213815789473684, + "grad_norm": 2.8874700701594977, + "learning_rate": 7.957891963355934e-05, + "loss": 4.1131, + "step": 1480 + }, + { + "epoch": 0.15316611842105263, + "grad_norm": 2.5112864358099625, + "learning_rate": 7.957302104636276e-05, + "loss": 4.1502, + "step": 1490 + }, + { + "epoch": 0.15419407894736842, + "grad_norm": 2.66814014336691, + "learning_rate": 7.956708166819648e-05, + "loss": 4.0868, + "step": 1500 + }, + { + "epoch": 0.1552220394736842, + "grad_norm": 2.7983152634395276, + "learning_rate": 7.956110150559548e-05, + "loss": 4.1202, + "step": 1510 + }, + { + "epoch": 0.15625, + "grad_norm": 1.9732071815367769, + "learning_rate": 7.955508056513969e-05, + "loss": 4.0416, + "step": 1520 + }, + { + "epoch": 0.1572779605263158, + "grad_norm": 2.099599836460891, + "learning_rate": 7.954901885345387e-05, + "loss": 4.0964, + "step": 1530 + }, + { + "epoch": 0.15830592105263158, + "grad_norm": 1.4524093013947812, + "learning_rate": 7.954291637720763e-05, + "loss": 4.1102, + "step": 1540 + }, + { + "epoch": 0.15933388157894737, + "grad_norm": 1.7656287809348346, + "learning_rate": 7.953677314311547e-05, + "loss": 4.0598, + "step": 1550 + }, + { + "epoch": 0.16036184210526316, + "grad_norm": 2.7385971170129286, + "learning_rate": 7.953058915793666e-05, + "loss": 4.0831, + "step": 1560 + }, + { + "epoch": 0.16138980263157895, + "grad_norm": 1.6061884619654545, + "learning_rate": 7.952436442847543e-05, + "loss": 4.0706, + "step": 1570 + }, + { + "epoch": 0.16241776315789475, + "grad_norm": 2.4171903525140226, + "learning_rate": 7.951809896158073e-05, + "loss": 4.0826, + "step": 1580 + }, + { + "epoch": 0.16344572368421054, + "grad_norm": 1.4754208788649594, + "learning_rate": 7.951179276414638e-05, + "loss": 4.1513, + "step": 1590 + }, + { + "epoch": 0.16447368421052633, + "grad_norm": 3.7580227545211238, + "learning_rate": 7.9505445843111e-05, + "loss": 4.0989, + "step": 1600 + }, + { + "epoch": 0.16550164473684212, + "grad_norm": 1.9704415456817617, + "learning_rate": 7.949905820545804e-05, + "loss": 4.0133, + "step": 1610 + }, + { + "epoch": 0.16652960526315788, + "grad_norm": 2.169762331696678, + "learning_rate": 7.949262985821573e-05, + "loss": 4.1388, + "step": 1620 + }, + { + "epoch": 0.16755756578947367, + "grad_norm": 2.052881760029155, + "learning_rate": 7.948616080845709e-05, + "loss": 4.1128, + "step": 1630 + }, + { + "epoch": 0.16858552631578946, + "grad_norm": 2.1003075011230337, + "learning_rate": 
7.947965106329993e-05, + "loss": 4.0676, + "step": 1640 + }, + { + "epoch": 0.16961348684210525, + "grad_norm": 1.9037684563326362, + "learning_rate": 7.947310062990688e-05, + "loss": 4.0569, + "step": 1650 + }, + { + "epoch": 0.17064144736842105, + "grad_norm": 1.827255172507141, + "learning_rate": 7.946650951548524e-05, + "loss": 4.0733, + "step": 1660 + }, + { + "epoch": 0.17166940789473684, + "grad_norm": 1.6678161551292698, + "learning_rate": 7.945987772728716e-05, + "loss": 4.0767, + "step": 1670 + }, + { + "epoch": 0.17269736842105263, + "grad_norm": 2.981216596820124, + "learning_rate": 7.945320527260951e-05, + "loss": 4.0207, + "step": 1680 + }, + { + "epoch": 0.17372532894736842, + "grad_norm": 44.441374490462856, + "learning_rate": 7.94464921587939e-05, + "loss": 4.185, + "step": 1690 + }, + { + "epoch": 0.1747532894736842, + "grad_norm": 3.182054692928238, + "learning_rate": 7.943973839322669e-05, + "loss": 4.0872, + "step": 1700 + }, + { + "epoch": 0.17578125, + "grad_norm": 1.853773511144589, + "learning_rate": 7.943294398333896e-05, + "loss": 4.0437, + "step": 1710 + }, + { + "epoch": 0.1768092105263158, + "grad_norm": 1.385742703653445, + "learning_rate": 7.94261089366065e-05, + "loss": 4.1256, + "step": 1720 + }, + { + "epoch": 0.17783717105263158, + "grad_norm": 1.932761458244894, + "learning_rate": 7.941923326054985e-05, + "loss": 4.0026, + "step": 1730 + }, + { + "epoch": 0.17886513157894737, + "grad_norm": 1.7991671993342582, + "learning_rate": 7.94123169627342e-05, + "loss": 3.9854, + "step": 1740 + }, + { + "epoch": 0.17989309210526316, + "grad_norm": 1.3723489607371429, + "learning_rate": 7.940536005076949e-05, + "loss": 4.0669, + "step": 1750 + }, + { + "epoch": 0.18092105263157895, + "grad_norm": 3.6373458305276576, + "learning_rate": 7.939836253231028e-05, + "loss": 4.0966, + "step": 1760 + }, + { + "epoch": 0.18194901315789475, + "grad_norm": 2.291645558577919, + "learning_rate": 7.93913244150559e-05, + "loss": 4.0566, + "step": 1770 + }, + { + "epoch": 0.18297697368421054, + "grad_norm": 1.6177540399497705, + "learning_rate": 7.938424570675026e-05, + "loss": 4.0036, + "step": 1780 + }, + { + "epoch": 0.18400493421052633, + "grad_norm": 2.323655517036255, + "learning_rate": 7.937712641518202e-05, + "loss": 4.0128, + "step": 1790 + }, + { + "epoch": 0.18503289473684212, + "grad_norm": 3.0097537427267884, + "learning_rate": 7.936996654818438e-05, + "loss": 4.0639, + "step": 1800 + }, + { + "epoch": 0.18606085526315788, + "grad_norm": 2.6949554649169514, + "learning_rate": 7.93627661136353e-05, + "loss": 4.0437, + "step": 1810 + }, + { + "epoch": 0.18708881578947367, + "grad_norm": 2.539750226122657, + "learning_rate": 7.935552511945732e-05, + "loss": 4.1227, + "step": 1820 + }, + { + "epoch": 0.18811677631578946, + "grad_norm": 2.0382148471600323, + "learning_rate": 7.934824357361759e-05, + "loss": 3.9894, + "step": 1830 + }, + { + "epoch": 0.18914473684210525, + "grad_norm": 2.002585885138502, + "learning_rate": 7.934092148412792e-05, + "loss": 4.0874, + "step": 1840 + }, + { + "epoch": 0.19017269736842105, + "grad_norm": 3.162063381527383, + "learning_rate": 7.933355885904469e-05, + "loss": 3.9805, + "step": 1850 + }, + { + "epoch": 0.19120065789473684, + "grad_norm": 2.9149320803233048, + "learning_rate": 7.932615570646894e-05, + "loss": 4.0167, + "step": 1860 + }, + { + "epoch": 0.19222861842105263, + "grad_norm": 1.3073017175700943, + "learning_rate": 7.931871203454622e-05, + "loss": 4.034, + "step": 1870 + }, + { + "epoch": 0.19325657894736842, + 
"grad_norm": 1.8055101641630595, + "learning_rate": 7.931122785146675e-05, + "loss": 4.023, + "step": 1880 + }, + { + "epoch": 0.1942845394736842, + "grad_norm": 2.0404544916815204, + "learning_rate": 7.930370316546524e-05, + "loss": 4.067, + "step": 1890 + }, + { + "epoch": 0.1953125, + "grad_norm": 1.791129904193009, + "learning_rate": 7.929613798482105e-05, + "loss": 4.0446, + "step": 1900 + }, + { + "epoch": 0.1963404605263158, + "grad_norm": 2.4253430546626373, + "learning_rate": 7.9288532317858e-05, + "loss": 4.0247, + "step": 1910 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 1.6975542707574176, + "learning_rate": 7.928088617294454e-05, + "loss": 4.0656, + "step": 1920 + }, + { + "epoch": 0.19839638157894737, + "grad_norm": 2.444805153569936, + "learning_rate": 7.927319955849362e-05, + "loss": 4.0615, + "step": 1930 + }, + { + "epoch": 0.19942434210526316, + "grad_norm": 2.6722073710771976, + "learning_rate": 7.926547248296273e-05, + "loss": 4.0799, + "step": 1940 + }, + { + "epoch": 0.20045230263157895, + "grad_norm": 1.6778650377261004, + "learning_rate": 7.925770495485385e-05, + "loss": 4.0389, + "step": 1950 + }, + { + "epoch": 0.20148026315789475, + "grad_norm": 1.9493852746808302, + "learning_rate": 7.924989698271353e-05, + "loss": 3.9939, + "step": 1960 + }, + { + "epoch": 0.20250822368421054, + "grad_norm": 2.433349989045853, + "learning_rate": 7.924204857513274e-05, + "loss": 4.0344, + "step": 1970 + }, + { + "epoch": 0.20353618421052633, + "grad_norm": 1.6615879482625653, + "learning_rate": 7.9234159740747e-05, + "loss": 4.0329, + "step": 1980 + }, + { + "epoch": 0.20456414473684212, + "grad_norm": 1.1894121335237056, + "learning_rate": 7.92262304882363e-05, + "loss": 4.0368, + "step": 1990 + }, + { + "epoch": 0.20559210526315788, + "grad_norm": 2.8231926157929643, + "learning_rate": 7.921826082632508e-05, + "loss": 4.0538, + "step": 2000 + }, + { + "epoch": 0.20662006578947367, + "grad_norm": 1.2349122725151007, + "learning_rate": 7.921025076378227e-05, + "loss": 3.9588, + "step": 2010 + }, + { + "epoch": 0.20764802631578946, + "grad_norm": 2.0809350769239674, + "learning_rate": 7.920220030942124e-05, + "loss": 4.0595, + "step": 2020 + }, + { + "epoch": 0.20867598684210525, + "grad_norm": 1.7353290992188846, + "learning_rate": 7.919410947209979e-05, + "loss": 4.1386, + "step": 2030 + }, + { + "epoch": 0.20970394736842105, + "grad_norm": 2.0772756117039406, + "learning_rate": 7.918597826072017e-05, + "loss": 4.1055, + "step": 2040 + }, + { + "epoch": 0.21073190789473684, + "grad_norm": 1.6416369269270426, + "learning_rate": 7.917780668422904e-05, + "loss": 4.1103, + "step": 2050 + }, + { + "epoch": 0.21175986842105263, + "grad_norm": 1.2764935945289635, + "learning_rate": 7.91695947516175e-05, + "loss": 3.9997, + "step": 2060 + }, + { + "epoch": 0.21278782894736842, + "grad_norm": 2.356079528470191, + "learning_rate": 7.916134247192103e-05, + "loss": 4.0107, + "step": 2070 + }, + { + "epoch": 0.2138157894736842, + "grad_norm": 2.795702470046731, + "learning_rate": 7.915304985421949e-05, + "loss": 4.0458, + "step": 2080 + }, + { + "epoch": 0.21484375, + "grad_norm": 1.6409858670041608, + "learning_rate": 7.914471690763715e-05, + "loss": 4.0362, + "step": 2090 + }, + { + "epoch": 0.2158717105263158, + "grad_norm": 2.888393221008857, + "learning_rate": 7.913634364134265e-05, + "loss": 3.9545, + "step": 2100 + }, + { + "epoch": 0.21689967105263158, + "grad_norm": 2.896615249856405, + "learning_rate": 7.912793006454898e-05, + "loss": 4.043, + "step": 2110 + }, + { + 
"epoch": 0.21792763157894737, + "grad_norm": 1.8060237676845585, + "learning_rate": 7.911947618651348e-05, + "loss": 4.0829, + "step": 2120 + }, + { + "epoch": 0.21895559210526316, + "grad_norm": 2.3076162576370853, + "learning_rate": 7.911098201653788e-05, + "loss": 4.0369, + "step": 2130 + }, + { + "epoch": 0.21998355263157895, + "grad_norm": 2.3283287829621107, + "learning_rate": 7.910244756396818e-05, + "loss": 4.0209, + "step": 2140 + }, + { + "epoch": 0.22101151315789475, + "grad_norm": 2.0234569416069506, + "learning_rate": 7.909387283819472e-05, + "loss": 4.0507, + "step": 2150 + }, + { + "epoch": 0.22203947368421054, + "grad_norm": 2.526840136200463, + "learning_rate": 7.908525784865219e-05, + "loss": 4.1285, + "step": 2160 + }, + { + "epoch": 0.22306743421052633, + "grad_norm": 1.5578030753352774, + "learning_rate": 7.907660260481952e-05, + "loss": 4.0414, + "step": 2170 + }, + { + "epoch": 0.22409539473684212, + "grad_norm": 2.4208502785838655, + "learning_rate": 7.906790711621998e-05, + "loss": 4.0483, + "step": 2180 + }, + { + "epoch": 0.22512335526315788, + "grad_norm": 1.119273126291252, + "learning_rate": 7.905917139242112e-05, + "loss": 4.0126, + "step": 2190 + }, + { + "epoch": 0.22615131578947367, + "grad_norm": 2.609523928840828, + "learning_rate": 7.90503954430347e-05, + "loss": 4.021, + "step": 2200 + }, + { + "epoch": 0.22717927631578946, + "grad_norm": 1.6260360935986466, + "learning_rate": 7.904157927771685e-05, + "loss": 3.9679, + "step": 2210 + }, + { + "epoch": 0.22820723684210525, + "grad_norm": 1.556408949858169, + "learning_rate": 7.903272290616784e-05, + "loss": 4.0554, + "step": 2220 + }, + { + "epoch": 0.22923519736842105, + "grad_norm": 3.0323849156014457, + "learning_rate": 7.902382633813222e-05, + "loss": 4.0161, + "step": 2230 + }, + { + "epoch": 0.23026315789473684, + "grad_norm": 2.333585748189432, + "learning_rate": 7.901488958339878e-05, + "loss": 3.959, + "step": 2240 + }, + { + "epoch": 0.23129111842105263, + "grad_norm": 1.6753437031218663, + "learning_rate": 7.900591265180051e-05, + "loss": 3.9428, + "step": 2250 + }, + { + "epoch": 0.23231907894736842, + "grad_norm": 2.1373262256679437, + "learning_rate": 7.899689555321462e-05, + "loss": 3.9798, + "step": 2260 + }, + { + "epoch": 0.2333470394736842, + "grad_norm": 2.128514693062743, + "learning_rate": 7.898783829756253e-05, + "loss": 4.0437, + "step": 2270 + }, + { + "epoch": 0.234375, + "grad_norm": 1.5334946664991687, + "learning_rate": 7.897874089480978e-05, + "loss": 3.9241, + "step": 2280 + }, + { + "epoch": 0.2354029605263158, + "grad_norm": 2.4278645929352027, + "learning_rate": 7.896960335496614e-05, + "loss": 3.9959, + "step": 2290 + }, + { + "epoch": 0.23643092105263158, + "grad_norm": 2.5213838611901145, + "learning_rate": 7.896042568808553e-05, + "loss": 3.9511, + "step": 2300 + }, + { + "epoch": 0.23745888157894737, + "grad_norm": 1.6314954784914026, + "learning_rate": 7.895120790426604e-05, + "loss": 4.0033, + "step": 2310 + }, + { + "epoch": 0.23848684210526316, + "grad_norm": 2.28406674748844, + "learning_rate": 7.894195001364985e-05, + "loss": 3.9973, + "step": 2320 + }, + { + "epoch": 0.23951480263157895, + "grad_norm": 1.581305669475561, + "learning_rate": 7.89326520264233e-05, + "loss": 4.0321, + "step": 2330 + }, + { + "epoch": 0.24054276315789475, + "grad_norm": 3.1329274144584227, + "learning_rate": 7.892331395281687e-05, + "loss": 4.0016, + "step": 2340 + }, + { + "epoch": 0.24157072368421054, + "grad_norm": 2.2327489738423556, + "learning_rate": 7.89139358031051e-05, + 
"loss": 3.9762, + "step": 2350 + }, + { + "epoch": 0.24259868421052633, + "grad_norm": 1.8383018041808028, + "learning_rate": 7.890451758760666e-05, + "loss": 3.9498, + "step": 2360 + }, + { + "epoch": 0.24362664473684212, + "grad_norm": 1.6023693006422923, + "learning_rate": 7.889505931668429e-05, + "loss": 3.9647, + "step": 2370 + }, + { + "epoch": 0.24465460526315788, + "grad_norm": 1.2805641013131757, + "learning_rate": 7.88855610007448e-05, + "loss": 4.012, + "step": 2380 + }, + { + "epoch": 0.24568256578947367, + "grad_norm": 1.6883421315412368, + "learning_rate": 7.887602265023907e-05, + "loss": 4.0713, + "step": 2390 + }, + { + "epoch": 0.24671052631578946, + "grad_norm": 2.460072392584782, + "learning_rate": 7.886644427566201e-05, + "loss": 4.0662, + "step": 2400 + }, + { + "epoch": 0.24773848684210525, + "grad_norm": 2.319173205639796, + "learning_rate": 7.88568258875526e-05, + "loss": 4.1067, + "step": 2410 + }, + { + "epoch": 0.24876644736842105, + "grad_norm": 1.3240799732689401, + "learning_rate": 7.884716749649385e-05, + "loss": 4.0233, + "step": 2420 + }, + { + "epoch": 0.24979440789473684, + "grad_norm": 3.6122154519763328, + "learning_rate": 7.88374691131127e-05, + "loss": 3.9995, + "step": 2430 + }, + { + "epoch": 0.25082236842105265, + "grad_norm": 3.5470488249411107, + "learning_rate": 7.88277307480802e-05, + "loss": 3.9525, + "step": 2440 + }, + { + "epoch": 0.2518503289473684, + "grad_norm": 2.5298110262592273, + "learning_rate": 7.881795241211135e-05, + "loss": 4.0459, + "step": 2450 + }, + { + "epoch": 0.25287828947368424, + "grad_norm": 1.2925459063731903, + "learning_rate": 7.880813411596513e-05, + "loss": 4.0241, + "step": 2460 + }, + { + "epoch": 0.25390625, + "grad_norm": 2.101055665569229, + "learning_rate": 7.879827587044448e-05, + "loss": 3.9515, + "step": 2470 + }, + { + "epoch": 0.25493421052631576, + "grad_norm": 2.019976037515958, + "learning_rate": 7.878837768639628e-05, + "loss": 3.999, + "step": 2480 + }, + { + "epoch": 0.2559621710526316, + "grad_norm": 1.3640393005006277, + "learning_rate": 7.877843957471138e-05, + "loss": 4.0219, + "step": 2490 + }, + { + "epoch": 0.25699013157894735, + "grad_norm": 1.5487241385381225, + "learning_rate": 7.876846154632461e-05, + "loss": 4.0064, + "step": 2500 + }, + { + "epoch": 0.25801809210526316, + "grad_norm": 2.0311658401660777, + "learning_rate": 7.87584436122146e-05, + "loss": 4.0707, + "step": 2510 + }, + { + "epoch": 0.2590460526315789, + "grad_norm": 1.3246785943662682, + "learning_rate": 7.874838578340397e-05, + "loss": 4.0967, + "step": 2520 + }, + { + "epoch": 0.26007401315789475, + "grad_norm": 1.764459478482134, + "learning_rate": 7.873828807095925e-05, + "loss": 3.8857, + "step": 2530 + }, + { + "epoch": 0.2611019736842105, + "grad_norm": 1.7634353563592116, + "learning_rate": 7.872815048599078e-05, + "loss": 3.9503, + "step": 2540 + }, + { + "epoch": 0.2621299342105263, + "grad_norm": 2.3922815504964507, + "learning_rate": 7.871797303965287e-05, + "loss": 3.9677, + "step": 2550 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 1.2509011834786339, + "learning_rate": 7.870775574314358e-05, + "loss": 3.9808, + "step": 2560 + }, + { + "epoch": 0.2641858552631579, + "grad_norm": 1.4888416746981914, + "learning_rate": 7.869749860770488e-05, + "loss": 3.9114, + "step": 2570 + }, + { + "epoch": 0.2652138157894737, + "grad_norm": 1.251940556074778, + "learning_rate": 7.86872016446226e-05, + "loss": 4.0063, + "step": 2580 + }, + { + "epoch": 0.2662417763157895, + "grad_norm": 1.9706203114374863, + 
"learning_rate": 7.867686486522634e-05, + "loss": 4.0287, + "step": 2590 + }, + { + "epoch": 0.26726973684210525, + "grad_norm": 1.8774958846939842, + "learning_rate": 7.86664882808895e-05, + "loss": 4.0366, + "step": 2600 + }, + { + "epoch": 0.2682976973684211, + "grad_norm": 3.049126209102878, + "learning_rate": 7.865607190302933e-05, + "loss": 3.9674, + "step": 2610 + }, + { + "epoch": 0.26932565789473684, + "grad_norm": 1.43739981924183, + "learning_rate": 7.864561574310683e-05, + "loss": 3.9745, + "step": 2620 + }, + { + "epoch": 0.27035361842105265, + "grad_norm": 1.8410864518736498, + "learning_rate": 7.863511981262677e-05, + "loss": 3.8911, + "step": 2630 + }, + { + "epoch": 0.2713815789473684, + "grad_norm": 1.7458396231720847, + "learning_rate": 7.862458412313772e-05, + "loss": 4.0547, + "step": 2640 + }, + { + "epoch": 0.27240953947368424, + "grad_norm": 2.027108062980122, + "learning_rate": 7.861400868623191e-05, + "loss": 3.9476, + "step": 2650 + }, + { + "epoch": 0.2734375, + "grad_norm": 1.8154918707200969, + "learning_rate": 7.860339351354539e-05, + "loss": 4.0323, + "step": 2660 + }, + { + "epoch": 0.27446546052631576, + "grad_norm": 1.9700260568241326, + "learning_rate": 7.859273861675792e-05, + "loss": 3.9773, + "step": 2670 + }, + { + "epoch": 0.2754934210526316, + "grad_norm": 1.707060091311882, + "learning_rate": 7.858204400759289e-05, + "loss": 4.0183, + "step": 2680 + }, + { + "epoch": 0.27652138157894735, + "grad_norm": 1.3865816720183435, + "learning_rate": 7.85713096978175e-05, + "loss": 4.0399, + "step": 2690 + }, + { + "epoch": 0.27754934210526316, + "grad_norm": 1.4660709377783834, + "learning_rate": 7.856053569924254e-05, + "loss": 4.0037, + "step": 2700 + }, + { + "epoch": 0.2785773026315789, + "grad_norm": 3.052227776313317, + "learning_rate": 7.854972202372249e-05, + "loss": 3.8386, + "step": 2710 + }, + { + "epoch": 0.27960526315789475, + "grad_norm": 3.7515075196571237, + "learning_rate": 7.853886868315553e-05, + "loss": 3.9735, + "step": 2720 + }, + { + "epoch": 0.2806332236842105, + "grad_norm": 2.178253686680033, + "learning_rate": 7.852797568948345e-05, + "loss": 3.9804, + "step": 2730 + }, + { + "epoch": 0.2816611842105263, + "grad_norm": 1.4462466958155757, + "learning_rate": 7.851704305469164e-05, + "loss": 3.9839, + "step": 2740 + }, + { + "epoch": 0.2826891447368421, + "grad_norm": 2.4638719290295734, + "learning_rate": 7.850607079080918e-05, + "loss": 3.9319, + "step": 2750 + }, + { + "epoch": 0.2837171052631579, + "grad_norm": 0.9100694737795614, + "learning_rate": 7.849505890990868e-05, + "loss": 4.0005, + "step": 2760 + }, + { + "epoch": 0.2847450657894737, + "grad_norm": 2.0729611134835926, + "learning_rate": 7.848400742410638e-05, + "loss": 3.9619, + "step": 2770 + }, + { + "epoch": 0.2857730263157895, + "grad_norm": 2.2759740777240682, + "learning_rate": 7.847291634556208e-05, + "loss": 3.9759, + "step": 2780 + }, + { + "epoch": 0.28680098684210525, + "grad_norm": 1.4162668150049198, + "learning_rate": 7.846178568647917e-05, + "loss": 3.9801, + "step": 2790 + }, + { + "epoch": 0.2878289473684211, + "grad_norm": 2.000224100909051, + "learning_rate": 7.845061545910456e-05, + "loss": 3.9506, + "step": 2800 + }, + { + "epoch": 0.28885690789473684, + "grad_norm": 1.7212440428564943, + "learning_rate": 7.843940567572871e-05, + "loss": 3.9178, + "step": 2810 + }, + { + "epoch": 0.28988486842105265, + "grad_norm": 1.2592048283938813, + "learning_rate": 7.84281563486856e-05, + "loss": 3.9626, + "step": 2820 + }, + { + "epoch": 0.2909128289473684, 
+ "grad_norm": 1.4476871262791842, + "learning_rate": 7.841686749035272e-05, + "loss": 3.9466, + "step": 2830 + }, + { + "epoch": 0.29194078947368424, + "grad_norm": 1.7873225517493196, + "learning_rate": 7.840553911315107e-05, + "loss": 3.9735, + "step": 2840 + }, + { + "epoch": 0.29296875, + "grad_norm": 2.030671609547327, + "learning_rate": 7.839417122954509e-05, + "loss": 3.9415, + "step": 2850 + }, + { + "epoch": 0.29399671052631576, + "grad_norm": 2.330764400848087, + "learning_rate": 7.838276385204273e-05, + "loss": 4.0134, + "step": 2860 + }, + { + "epoch": 0.2950246710526316, + "grad_norm": 1.3221332252739244, + "learning_rate": 7.83713169931954e-05, + "loss": 4.0295, + "step": 2870 + }, + { + "epoch": 0.29605263157894735, + "grad_norm": 2.1326273743488957, + "learning_rate": 7.83598306655979e-05, + "loss": 3.9193, + "step": 2880 + }, + { + "epoch": 0.29708059210526316, + "grad_norm": 1.578486788230708, + "learning_rate": 7.834830488188851e-05, + "loss": 3.9984, + "step": 2890 + }, + { + "epoch": 0.2981085526315789, + "grad_norm": 1.563193205604117, + "learning_rate": 7.83367396547489e-05, + "loss": 3.9944, + "step": 2900 + }, + { + "epoch": 0.29913651315789475, + "grad_norm": 1.0629089914660634, + "learning_rate": 7.832513499690412e-05, + "loss": 3.9956, + "step": 2910 + }, + { + "epoch": 0.3001644736842105, + "grad_norm": 1.4480323562326414, + "learning_rate": 7.831349092112266e-05, + "loss": 3.8685, + "step": 2920 + }, + { + "epoch": 0.3011924342105263, + "grad_norm": 1.958272209295754, + "learning_rate": 7.830180744021634e-05, + "loss": 3.95, + "step": 2930 + }, + { + "epoch": 0.3022203947368421, + "grad_norm": 3.032665276233653, + "learning_rate": 7.829008456704033e-05, + "loss": 3.9966, + "step": 2940 + }, + { + "epoch": 0.3032483552631579, + "grad_norm": 1.1524179208452454, + "learning_rate": 7.827832231449314e-05, + "loss": 4.0042, + "step": 2950 + }, + { + "epoch": 0.3042763157894737, + "grad_norm": 1.4435410649768208, + "learning_rate": 7.826652069551668e-05, + "loss": 3.938, + "step": 2960 + }, + { + "epoch": 0.3053042763157895, + "grad_norm": 1.655972871447615, + "learning_rate": 7.825467972309606e-05, + "loss": 3.9411, + "step": 2970 + }, + { + "epoch": 0.30633223684210525, + "grad_norm": 1.8236643175335323, + "learning_rate": 7.824279941025979e-05, + "loss": 3.9432, + "step": 2980 + }, + { + "epoch": 0.3073601973684211, + "grad_norm": 1.1507976958115533, + "learning_rate": 7.823087977007961e-05, + "loss": 3.9706, + "step": 2990 + }, + { + "epoch": 0.30838815789473684, + "grad_norm": 1.5961864771260694, + "learning_rate": 7.821892081567056e-05, + "loss": 3.9785, + "step": 3000 + }, + { + "epoch": 0.30941611842105265, + "grad_norm": 0.9595402118691103, + "learning_rate": 7.820692256019091e-05, + "loss": 4.0154, + "step": 3010 + }, + { + "epoch": 0.3104440789473684, + "grad_norm": 1.2535206329540476, + "learning_rate": 7.819488501684219e-05, + "loss": 3.9238, + "step": 3020 + }, + { + "epoch": 0.31147203947368424, + "grad_norm": 1.7334267713353313, + "learning_rate": 7.818280819886916e-05, + "loss": 3.9638, + "step": 3030 + }, + { + "epoch": 0.3125, + "grad_norm": 1.348300238172893, + "learning_rate": 7.817069211955978e-05, + "loss": 3.9244, + "step": 3040 + }, + { + "epoch": 0.31352796052631576, + "grad_norm": 1.256473515681905, + "learning_rate": 7.815853679224522e-05, + "loss": 3.9182, + "step": 3050 + }, + { + "epoch": 0.3145559210526316, + "grad_norm": 1.4453923383776532, + "learning_rate": 7.814634223029984e-05, + "loss": 3.9991, + "step": 3060 + }, + { + "epoch": 
0.31558388157894735, + "grad_norm": 1.2005119681979177, + "learning_rate": 7.813410844714115e-05, + "loss": 3.9874, + "step": 3070 + }, + { + "epoch": 0.31661184210526316, + "grad_norm": 2.340715198281357, + "learning_rate": 7.812183545622983e-05, + "loss": 3.8767, + "step": 3080 + }, + { + "epoch": 0.3176398026315789, + "grad_norm": 1.591835667424166, + "learning_rate": 7.810952327106968e-05, + "loss": 3.9822, + "step": 3090 + }, + { + "epoch": 0.31866776315789475, + "grad_norm": 2.0061808684134963, + "learning_rate": 7.809717190520767e-05, + "loss": 3.9603, + "step": 3100 + }, + { + "epoch": 0.3196957236842105, + "grad_norm": 1.6504674769458667, + "learning_rate": 7.808478137223381e-05, + "loss": 4.0139, + "step": 3110 + }, + { + "epoch": 0.3207236842105263, + "grad_norm": 1.77620222721379, + "learning_rate": 7.807235168578128e-05, + "loss": 3.9882, + "step": 3120 + }, + { + "epoch": 0.3217516447368421, + "grad_norm": 1.2899918876067584, + "learning_rate": 7.805988285952628e-05, + "loss": 3.9289, + "step": 3130 + }, + { + "epoch": 0.3227796052631579, + "grad_norm": 0.9627564125847953, + "learning_rate": 7.804737490718812e-05, + "loss": 3.9625, + "step": 3140 + }, + { + "epoch": 0.3238075657894737, + "grad_norm": 1.3118145833272201, + "learning_rate": 7.803482784252911e-05, + "loss": 3.9625, + "step": 3150 + }, + { + "epoch": 0.3248355263157895, + "grad_norm": 2.2171543454896487, + "learning_rate": 7.802224167935466e-05, + "loss": 4.0034, + "step": 3160 + }, + { + "epoch": 0.32586348684210525, + "grad_norm": 2.1459312663523145, + "learning_rate": 7.800961643151314e-05, + "loss": 3.9204, + "step": 3170 + }, + { + "epoch": 0.3268914473684211, + "grad_norm": 1.3252995674205705, + "learning_rate": 7.799695211289596e-05, + "loss": 3.8984, + "step": 3180 + }, + { + "epoch": 0.32791940789473684, + "grad_norm": 1.2021428347471743, + "learning_rate": 7.798424873743751e-05, + "loss": 3.8965, + "step": 3190 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 1.095676741392886, + "learning_rate": 7.797150631911512e-05, + "loss": 3.9376, + "step": 3200 + }, + { + "epoch": 0.3299753289473684, + "grad_norm": 1.3778285064425067, + "learning_rate": 7.795872487194916e-05, + "loss": 3.9784, + "step": 3210 + }, + { + "epoch": 0.33100328947368424, + "grad_norm": 1.694903627654786, + "learning_rate": 7.794590441000285e-05, + "loss": 3.9209, + "step": 3220 + }, + { + "epoch": 0.33203125, + "grad_norm": 2.0896273206782476, + "learning_rate": 7.793304494738239e-05, + "loss": 3.9203, + "step": 3230 + }, + { + "epoch": 0.33305921052631576, + "grad_norm": 5.190072765979501, + "learning_rate": 7.792014649823691e-05, + "loss": 3.9664, + "step": 3240 + }, + { + "epoch": 0.3340871710526316, + "grad_norm": 2.1345579948562046, + "learning_rate": 7.790720907675837e-05, + "loss": 3.9416, + "step": 3250 + }, + { + "epoch": 0.33511513157894735, + "grad_norm": 1.5360603291167567, + "learning_rate": 7.789423269718167e-05, + "loss": 3.9882, + "step": 3260 + }, + { + "epoch": 0.33614309210526316, + "grad_norm": 1.6999966649415457, + "learning_rate": 7.788121737378455e-05, + "loss": 3.9877, + "step": 3270 + }, + { + "epoch": 0.3371710526315789, + "grad_norm": 2.712222627442398, + "learning_rate": 7.78681631208876e-05, + "loss": 3.9123, + "step": 3280 + }, + { + "epoch": 0.33819901315789475, + "grad_norm": 1.6628007000190999, + "learning_rate": 7.785506995285425e-05, + "loss": 3.9412, + "step": 3290 + }, + { + "epoch": 0.3392269736842105, + "grad_norm": 1.3155531113798045, + "learning_rate": 7.784193788409075e-05, + "loss": 
3.9055, + "step": 3300 + }, + { + "epoch": 0.3402549342105263, + "grad_norm": 1.5885093393840732, + "learning_rate": 7.782876692904614e-05, + "loss": 3.9232, + "step": 3310 + }, + { + "epoch": 0.3412828947368421, + "grad_norm": 2.3202753031924854, + "learning_rate": 7.781555710221223e-05, + "loss": 3.8688, + "step": 3320 + }, + { + "epoch": 0.3423108552631579, + "grad_norm": 1.1966313086645957, + "learning_rate": 7.780230841812368e-05, + "loss": 3.8839, + "step": 3330 + }, + { + "epoch": 0.3433388157894737, + "grad_norm": 1.222116028020715, + "learning_rate": 7.778902089135781e-05, + "loss": 3.9121, + "step": 3340 + }, + { + "epoch": 0.3443667763157895, + "grad_norm": 1.7482307891422981, + "learning_rate": 7.777569453653471e-05, + "loss": 3.8747, + "step": 3350 + }, + { + "epoch": 0.34539473684210525, + "grad_norm": 1.4351668289450872, + "learning_rate": 7.776232936831721e-05, + "loss": 4.0201, + "step": 3360 + }, + { + "epoch": 0.3464226973684211, + "grad_norm": 1.8311048821245084, + "learning_rate": 7.774892540141082e-05, + "loss": 3.9048, + "step": 3370 + }, + { + "epoch": 0.34745065789473684, + "grad_norm": 2.295715454546183, + "learning_rate": 7.773548265056379e-05, + "loss": 3.9447, + "step": 3380 + }, + { + "epoch": 0.34847861842105265, + "grad_norm": 1.696511402832709, + "learning_rate": 7.772200113056697e-05, + "loss": 3.8431, + "step": 3390 + }, + { + "epoch": 0.3495065789473684, + "grad_norm": 1.437717006723336, + "learning_rate": 7.770848085625389e-05, + "loss": 3.8636, + "step": 3400 + }, + { + "epoch": 0.35053453947368424, + "grad_norm": 1.0502687360095273, + "learning_rate": 7.769492184250079e-05, + "loss": 3.8665, + "step": 3410 + }, + { + "epoch": 0.3515625, + "grad_norm": 1.287737633709637, + "learning_rate": 7.768132410422643e-05, + "loss": 3.9193, + "step": 3420 + }, + { + "epoch": 0.35259046052631576, + "grad_norm": 1.8704597816701596, + "learning_rate": 7.766768765639224e-05, + "loss": 3.9948, + "step": 3430 + }, + { + "epoch": 0.3536184210526316, + "grad_norm": 1.5212191349305602, + "learning_rate": 7.765401251400224e-05, + "loss": 3.9705, + "step": 3440 + }, + { + "epoch": 0.35464638157894735, + "grad_norm": 1.6642205248379038, + "learning_rate": 7.7640298692103e-05, + "loss": 3.8801, + "step": 3450 + }, + { + "epoch": 0.35567434210526316, + "grad_norm": 2.2506835746823066, + "learning_rate": 7.762654620578368e-05, + "loss": 3.8996, + "step": 3460 + }, + { + "epoch": 0.3567023026315789, + "grad_norm": 1.517112866814159, + "learning_rate": 7.761275507017595e-05, + "loss": 3.8938, + "step": 3470 + }, + { + "epoch": 0.35773026315789475, + "grad_norm": 1.34121287156734, + "learning_rate": 7.759892530045403e-05, + "loss": 3.902, + "step": 3480 + }, + { + "epoch": 0.3587582236842105, + "grad_norm": 1.568184468740804, + "learning_rate": 7.758505691183462e-05, + "loss": 3.9739, + "step": 3490 + }, + { + "epoch": 0.3597861842105263, + "grad_norm": 2.029905255554332, + "learning_rate": 7.757114991957694e-05, + "loss": 3.9053, + "step": 3500 + }, + { + "epoch": 0.3608141447368421, + "grad_norm": 2.0796481408938137, + "learning_rate": 7.755720433898268e-05, + "loss": 3.9407, + "step": 3510 + }, + { + "epoch": 0.3618421052631579, + "grad_norm": 1.7465771852873435, + "learning_rate": 7.754322018539598e-05, + "loss": 3.9329, + "step": 3520 + }, + { + "epoch": 0.3628700657894737, + "grad_norm": 1.6458743448419109, + "learning_rate": 7.752919747420342e-05, + "loss": 3.9236, + "step": 3530 + }, + { + "epoch": 0.3638980263157895, + "grad_norm": 2.010601318523476, + "learning_rate": 
7.751513622083403e-05, + "loss": 3.9657, + "step": 3540 + }, + { + "epoch": 0.36492598684210525, + "grad_norm": 1.8101810221485521, + "learning_rate": 7.75010364407592e-05, + "loss": 3.9909, + "step": 3550 + }, + { + "epoch": 0.3659539473684211, + "grad_norm": 1.4271867069570445, + "learning_rate": 7.748689814949275e-05, + "loss": 3.8426, + "step": 3560 + }, + { + "epoch": 0.36698190789473684, + "grad_norm": 1.4515083918839722, + "learning_rate": 7.747272136259087e-05, + "loss": 3.909, + "step": 3570 + }, + { + "epoch": 0.36800986842105265, + "grad_norm": 1.5483163611087847, + "learning_rate": 7.745850609565208e-05, + "loss": 3.9367, + "step": 3580 + }, + { + "epoch": 0.3690378289473684, + "grad_norm": 1.8214757256054022, + "learning_rate": 7.744425236431725e-05, + "loss": 3.9057, + "step": 3590 + }, + { + "epoch": 0.37006578947368424, + "grad_norm": 1.635703488931225, + "learning_rate": 7.74299601842696e-05, + "loss": 3.9154, + "step": 3600 + }, + { + "epoch": 0.37109375, + "grad_norm": 1.402006315900042, + "learning_rate": 7.741562957123463e-05, + "loss": 3.8343, + "step": 3610 + }, + { + "epoch": 0.37212171052631576, + "grad_norm": 1.4700218537710843, + "learning_rate": 7.740126054098011e-05, + "loss": 3.9318, + "step": 3620 + }, + { + "epoch": 0.3731496710526316, + "grad_norm": 1.1904096093178338, + "learning_rate": 7.738685310931611e-05, + "loss": 3.8281, + "step": 3630 + }, + { + "epoch": 0.37417763157894735, + "grad_norm": 1.1723542314376345, + "learning_rate": 7.737240729209494e-05, + "loss": 3.9324, + "step": 3640 + }, + { + "epoch": 0.37520559210526316, + "grad_norm": 1.9096388789014245, + "learning_rate": 7.735792310521116e-05, + "loss": 3.8923, + "step": 3650 + }, + { + "epoch": 0.3762335526315789, + "grad_norm": 1.8858105228790336, + "learning_rate": 7.734340056460153e-05, + "loss": 3.9528, + "step": 3660 + }, + { + "epoch": 0.37726151315789475, + "grad_norm": 1.1246116815710976, + "learning_rate": 7.7328839686245e-05, + "loss": 3.9318, + "step": 3670 + }, + { + "epoch": 0.3782894736842105, + "grad_norm": 1.8510299109788855, + "learning_rate": 7.731424048616275e-05, + "loss": 4.0274, + "step": 3680 + }, + { + "epoch": 0.3793174342105263, + "grad_norm": 1.386520758450407, + "learning_rate": 7.729960298041806e-05, + "loss": 3.8669, + "step": 3690 + }, + { + "epoch": 0.3803453947368421, + "grad_norm": 1.6596774499122389, + "learning_rate": 7.72849271851164e-05, + "loss": 3.8456, + "step": 3700 + }, + { + "epoch": 0.3813733552631579, + "grad_norm": 1.178840130771315, + "learning_rate": 7.727021311640537e-05, + "loss": 3.9287, + "step": 3710 + }, + { + "epoch": 0.3824013157894737, + "grad_norm": 0.8537952654611007, + "learning_rate": 7.725546079047466e-05, + "loss": 3.8741, + "step": 3720 + }, + { + "epoch": 0.3834292763157895, + "grad_norm": 1.6515674977150243, + "learning_rate": 7.724067022355606e-05, + "loss": 3.9773, + "step": 3730 + }, + { + "epoch": 0.38445723684210525, + "grad_norm": 1.9299425582046115, + "learning_rate": 7.722584143192345e-05, + "loss": 3.9984, + "step": 3740 + }, + { + "epoch": 0.3854851973684211, + "grad_norm": 1.6503025008832128, + "learning_rate": 7.721097443189276e-05, + "loss": 3.9102, + "step": 3750 + }, + { + "epoch": 0.38651315789473684, + "grad_norm": 2.0729295995363155, + "learning_rate": 7.719606923982196e-05, + "loss": 3.9413, + "step": 3760 + }, + { + "epoch": 0.38754111842105265, + "grad_norm": 1.149525963482167, + "learning_rate": 7.718112587211104e-05, + "loss": 3.8989, + "step": 3770 + }, + { + "epoch": 0.3885690789473684, + "grad_norm": 
1.7216385583838762, + "learning_rate": 7.7166144345202e-05, + "loss": 3.8958, + "step": 3780 + }, + { + "epoch": 0.38959703947368424, + "grad_norm": 1.9133291279794198, + "learning_rate": 7.715112467557881e-05, + "loss": 3.973, + "step": 3790 + }, + { + "epoch": 0.390625, + "grad_norm": 1.3852252063057304, + "learning_rate": 7.713606687976743e-05, + "loss": 3.8288, + "step": 3800 + }, + { + "epoch": 0.39165296052631576, + "grad_norm": 1.202702931274638, + "learning_rate": 7.712097097433576e-05, + "loss": 3.9276, + "step": 3810 + }, + { + "epoch": 0.3926809210526316, + "grad_norm": 1.406393679590312, + "learning_rate": 7.710583697589362e-05, + "loss": 3.8313, + "step": 3820 + }, + { + "epoch": 0.39370888157894735, + "grad_norm": 1.692930244180417, + "learning_rate": 7.709066490109277e-05, + "loss": 3.9279, + "step": 3830 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 2.031215726123103, + "learning_rate": 7.707545476662684e-05, + "loss": 3.9219, + "step": 3840 + }, + { + "epoch": 0.3957648026315789, + "grad_norm": 1.305146644800811, + "learning_rate": 7.706020658923136e-05, + "loss": 3.8366, + "step": 3850 + }, + { + "epoch": 0.39679276315789475, + "grad_norm": 1.3223687135976272, + "learning_rate": 7.704492038568366e-05, + "loss": 3.9074, + "step": 3860 + }, + { + "epoch": 0.3978207236842105, + "grad_norm": 1.9308098992166065, + "learning_rate": 7.702959617280302e-05, + "loss": 3.9192, + "step": 3870 + }, + { + "epoch": 0.3988486842105263, + "grad_norm": 1.970344200756547, + "learning_rate": 7.701423396745043e-05, + "loss": 3.8726, + "step": 3880 + }, + { + "epoch": 0.3998766447368421, + "grad_norm": 1.1404097367005162, + "learning_rate": 7.699883378652872e-05, + "loss": 3.889, + "step": 3890 + }, + { + "epoch": 0.4009046052631579, + "grad_norm": 1.49748384521126, + "learning_rate": 7.698339564698253e-05, + "loss": 3.906, + "step": 3900 + }, + { + "epoch": 0.4019325657894737, + "grad_norm": 1.9548923278387977, + "learning_rate": 7.696791956579827e-05, + "loss": 3.9223, + "step": 3910 + }, + { + "epoch": 0.4029605263157895, + "grad_norm": 1.9286189653704087, + "learning_rate": 7.695240556000404e-05, + "loss": 3.8102, + "step": 3920 + }, + { + "epoch": 0.40398848684210525, + "grad_norm": 1.5796601788340023, + "learning_rate": 7.693685364666972e-05, + "loss": 3.9546, + "step": 3930 + }, + { + "epoch": 0.4050164473684211, + "grad_norm": 1.1024326648174676, + "learning_rate": 7.692126384290686e-05, + "loss": 3.909, + "step": 3940 + }, + { + "epoch": 0.40604440789473684, + "grad_norm": 1.8556988061531767, + "learning_rate": 7.690563616586875e-05, + "loss": 3.8886, + "step": 3950 + }, + { + "epoch": 0.40707236842105265, + "grad_norm": 1.1530109090628096, + "learning_rate": 7.68899706327503e-05, + "loss": 3.9093, + "step": 3960 + }, + { + "epoch": 0.4081003289473684, + "grad_norm": 1.4089015969162257, + "learning_rate": 7.68742672607881e-05, + "loss": 3.9195, + "step": 3970 + }, + { + "epoch": 0.40912828947368424, + "grad_norm": 1.5415181354763763, + "learning_rate": 7.685852606726037e-05, + "loss": 3.9427, + "step": 3980 + }, + { + "epoch": 0.41015625, + "grad_norm": 1.997275165226946, + "learning_rate": 7.684274706948694e-05, + "loss": 3.8503, + "step": 3990 + }, + { + "epoch": 0.41118421052631576, + "grad_norm": 2.165543693128108, + "learning_rate": 7.682693028482925e-05, + "loss": 3.9033, + "step": 4000 + }, + { + "epoch": 0.4122121710526316, + "grad_norm": 1.2728526170581935, + "learning_rate": 7.681107573069031e-05, + "loss": 3.9375, + "step": 4010 + }, + { + "epoch": 
0.41324013157894735, + "grad_norm": 1.713291517123427, + "learning_rate": 7.679518342451464e-05, + "loss": 3.8587, + "step": 4020 + }, + { + "epoch": 0.41426809210526316, + "grad_norm": 2.0140842196859685, + "learning_rate": 7.677925338378839e-05, + "loss": 3.9452, + "step": 4030 + }, + { + "epoch": 0.4152960526315789, + "grad_norm": 1.4283428435007164, + "learning_rate": 7.676328562603917e-05, + "loss": 3.9561, + "step": 4040 + }, + { + "epoch": 0.41632401315789475, + "grad_norm": 1.3293236204332626, + "learning_rate": 7.674728016883608e-05, + "loss": 3.8013, + "step": 4050 + }, + { + "epoch": 0.4173519736842105, + "grad_norm": 2.553960387553779, + "learning_rate": 7.673123702978974e-05, + "loss": 3.8636, + "step": 4060 + }, + { + "epoch": 0.4183799342105263, + "grad_norm": 1.6511436053875455, + "learning_rate": 7.671515622655219e-05, + "loss": 3.8224, + "step": 4070 + }, + { + "epoch": 0.4194078947368421, + "grad_norm": 1.4691900040620645, + "learning_rate": 7.669903777681693e-05, + "loss": 3.932, + "step": 4080 + }, + { + "epoch": 0.4204358552631579, + "grad_norm": 1.2453518754771993, + "learning_rate": 7.668288169831892e-05, + "loss": 3.8672, + "step": 4090 + }, + { + "epoch": 0.4214638157894737, + "grad_norm": 1.5103685130193356, + "learning_rate": 7.666668800883445e-05, + "loss": 3.9338, + "step": 4100 + }, + { + "epoch": 0.4224917763157895, + "grad_norm": 2.089823785133899, + "learning_rate": 7.665045672618124e-05, + "loss": 3.9063, + "step": 4110 + }, + { + "epoch": 0.42351973684210525, + "grad_norm": 1.2480809739876675, + "learning_rate": 7.663418786821836e-05, + "loss": 3.864, + "step": 4120 + }, + { + "epoch": 0.4245476973684211, + "grad_norm": 2.4592300546823647, + "learning_rate": 7.661788145284625e-05, + "loss": 3.8906, + "step": 4130 + }, + { + "epoch": 0.42557565789473684, + "grad_norm": 2.2164182633728884, + "learning_rate": 7.66015374980066e-05, + "loss": 3.9329, + "step": 4140 + }, + { + "epoch": 0.42660361842105265, + "grad_norm": 2.0554751202880785, + "learning_rate": 7.658515602168251e-05, + "loss": 3.8165, + "step": 4150 + }, + { + "epoch": 0.4276315789473684, + "grad_norm": 1.3219770513871358, + "learning_rate": 7.656873704189826e-05, + "loss": 3.8716, + "step": 4160 + }, + { + "epoch": 0.42865953947368424, + "grad_norm": 1.8974393881665224, + "learning_rate": 7.655228057671946e-05, + "loss": 3.9191, + "step": 4170 + }, + { + "epoch": 0.4296875, + "grad_norm": 1.1784056210410365, + "learning_rate": 7.653578664425296e-05, + "loss": 3.8324, + "step": 4180 + }, + { + "epoch": 0.43071546052631576, + "grad_norm": 1.2283893271631923, + "learning_rate": 7.651925526264681e-05, + "loss": 3.9136, + "step": 4190 + }, + { + "epoch": 0.4317434210526316, + "grad_norm": 1.9352331128523825, + "learning_rate": 7.65026864500903e-05, + "loss": 3.9396, + "step": 4200 + }, + { + "epoch": 0.43277138157894735, + "grad_norm": 1.6748992804236316, + "learning_rate": 7.648608022481384e-05, + "loss": 3.8848, + "step": 4210 + }, + { + "epoch": 0.43379934210526316, + "grad_norm": 1.040441186381982, + "learning_rate": 7.646943660508907e-05, + "loss": 3.945, + "step": 4220 + }, + { + "epoch": 0.4348273026315789, + "grad_norm": 1.6459432959540536, + "learning_rate": 7.645275560922876e-05, + "loss": 3.8944, + "step": 4230 + }, + { + "epoch": 0.43585526315789475, + "grad_norm": 1.5720170671638225, + "learning_rate": 7.643603725558678e-05, + "loss": 3.8776, + "step": 4240 + }, + { + "epoch": 0.4368832236842105, + "grad_norm": 1.631368701459958, + "learning_rate": 7.641928156255813e-05, + "loss": 
3.9168, + "step": 4250 + }, + { + "epoch": 0.4379111842105263, + "grad_norm": 1.7579448734879761, + "learning_rate": 7.640248854857888e-05, + "loss": 3.8845, + "step": 4260 + }, + { + "epoch": 0.4389391447368421, + "grad_norm": 1.5783968398635326, + "learning_rate": 7.638565823212616e-05, + "loss": 3.9288, + "step": 4270 + }, + { + "epoch": 0.4399671052631579, + "grad_norm": 1.7229036253036183, + "learning_rate": 7.636879063171819e-05, + "loss": 3.8625, + "step": 4280 + }, + { + "epoch": 0.4409950657894737, + "grad_norm": 2.225668958761694, + "learning_rate": 7.635188576591411e-05, + "loss": 3.8925, + "step": 4290 + }, + { + "epoch": 0.4420230263157895, + "grad_norm": 1.585234989111433, + "learning_rate": 7.633494365331419e-05, + "loss": 3.8597, + "step": 4300 + }, + { + "epoch": 0.44305098684210525, + "grad_norm": 1.8235605755663862, + "learning_rate": 7.631796431255957e-05, + "loss": 3.9119, + "step": 4310 + }, + { + "epoch": 0.4440789473684211, + "grad_norm": 1.4242259282251908, + "learning_rate": 7.630094776233244e-05, + "loss": 3.9405, + "step": 4320 + }, + { + "epoch": 0.44510690789473684, + "grad_norm": 1.9681417948689781, + "learning_rate": 7.628389402135585e-05, + "loss": 3.806, + "step": 4330 + }, + { + "epoch": 0.44613486842105265, + "grad_norm": 1.419916585836306, + "learning_rate": 7.626680310839387e-05, + "loss": 3.9332, + "step": 4340 + }, + { + "epoch": 0.4471628289473684, + "grad_norm": 1.4753580176133887, + "learning_rate": 7.624967504225134e-05, + "loss": 3.8911, + "step": 4350 + }, + { + "epoch": 0.44819078947368424, + "grad_norm": 1.3339028731508153, + "learning_rate": 7.623250984177412e-05, + "loss": 3.8193, + "step": 4360 + }, + { + "epoch": 0.44921875, + "grad_norm": 1.479587665158699, + "learning_rate": 7.621530752584882e-05, + "loss": 3.9519, + "step": 4370 + }, + { + "epoch": 0.45024671052631576, + "grad_norm": 1.6809771887226006, + "learning_rate": 7.619806811340294e-05, + "loss": 3.9655, + "step": 4380 + }, + { + "epoch": 0.4512746710526316, + "grad_norm": 1.179996980404627, + "learning_rate": 7.618079162340479e-05, + "loss": 3.8881, + "step": 4390 + }, + { + "epoch": 0.45230263157894735, + "grad_norm": 1.8839662428016362, + "learning_rate": 7.616347807486344e-05, + "loss": 3.9, + "step": 4400 + }, + { + "epoch": 0.45333059210526316, + "grad_norm": 1.0510274038616751, + "learning_rate": 7.614612748682882e-05, + "loss": 3.9448, + "step": 4410 + }, + { + "epoch": 0.4543585526315789, + "grad_norm": 1.212281840646722, + "learning_rate": 7.61287398783915e-05, + "loss": 3.8235, + "step": 4420 + }, + { + "epoch": 0.45538651315789475, + "grad_norm": 1.2699184355488258, + "learning_rate": 7.611131526868288e-05, + "loss": 3.9155, + "step": 4430 + }, + { + "epoch": 0.4564144736842105, + "grad_norm": 1.7706797664102225, + "learning_rate": 7.609385367687504e-05, + "loss": 3.82, + "step": 4440 + }, + { + "epoch": 0.4574424342105263, + "grad_norm": 2.39012150667082, + "learning_rate": 7.607635512218072e-05, + "loss": 3.8954, + "step": 4450 + }, + { + "epoch": 0.4584703947368421, + "grad_norm": 1.1876991506526127, + "learning_rate": 7.605881962385336e-05, + "loss": 3.822, + "step": 4460 + }, + { + "epoch": 0.4594983552631579, + "grad_norm": 1.2550419687951455, + "learning_rate": 7.604124720118708e-05, + "loss": 3.9717, + "step": 4470 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 1.2910334922027378, + "learning_rate": 7.602363787351653e-05, + "loss": 3.9182, + "step": 4480 + }, + { + "epoch": 0.4615542763157895, + "grad_norm": 1.715194561990793, + "learning_rate": 
7.600599166021707e-05, + "loss": 3.8981, + "step": 4490 + }, + { + "epoch": 0.46258223684210525, + "grad_norm": 1.6553847013905192, + "learning_rate": 7.598830858070459e-05, + "loss": 3.8306, + "step": 4500 + }, + { + "epoch": 0.4636101973684211, + "grad_norm": 1.233759714646824, + "learning_rate": 7.597058865443555e-05, + "loss": 3.8037, + "step": 4510 + }, + { + "epoch": 0.46463815789473684, + "grad_norm": 1.6454147176698006, + "learning_rate": 7.595283190090697e-05, + "loss": 3.9137, + "step": 4520 + }, + { + "epoch": 0.46566611842105265, + "grad_norm": 2.0479977855223366, + "learning_rate": 7.593503833965637e-05, + "loss": 3.837, + "step": 4530 + }, + { + "epoch": 0.4666940789473684, + "grad_norm": 1.5350436492081798, + "learning_rate": 7.591720799026178e-05, + "loss": 3.8665, + "step": 4540 + }, + { + "epoch": 0.46772203947368424, + "grad_norm": 1.4758037252416094, + "learning_rate": 7.589934087234172e-05, + "loss": 3.8876, + "step": 4550 + }, + { + "epoch": 0.46875, + "grad_norm": 1.8719247394251783, + "learning_rate": 7.588143700555513e-05, + "loss": 3.9128, + "step": 4560 + }, + { + "epoch": 0.46977796052631576, + "grad_norm": 1.3310451941036179, + "learning_rate": 7.586349640960142e-05, + "loss": 3.8168, + "step": 4570 + }, + { + "epoch": 0.4708059210526316, + "grad_norm": 1.6136216365664795, + "learning_rate": 7.584551910422036e-05, + "loss": 3.8692, + "step": 4580 + }, + { + "epoch": 0.47183388157894735, + "grad_norm": 1.5352380939438792, + "learning_rate": 7.582750510919219e-05, + "loss": 3.9019, + "step": 4590 + }, + { + "epoch": 0.47286184210526316, + "grad_norm": 1.1168236840343657, + "learning_rate": 7.580945444433747e-05, + "loss": 3.9519, + "step": 4600 + }, + { + "epoch": 0.4738898026315789, + "grad_norm": 2.150156228355384, + "learning_rate": 7.579136712951711e-05, + "loss": 3.8376, + "step": 4610 + }, + { + "epoch": 0.47491776315789475, + "grad_norm": 1.141164573656, + "learning_rate": 7.577324318463236e-05, + "loss": 3.8446, + "step": 4620 + }, + { + "epoch": 0.4759457236842105, + "grad_norm": 1.0730295708270974, + "learning_rate": 7.575508262962474e-05, + "loss": 3.8784, + "step": 4630 + }, + { + "epoch": 0.4769736842105263, + "grad_norm": 1.3822976139786776, + "learning_rate": 7.573688548447609e-05, + "loss": 3.7622, + "step": 4640 + }, + { + "epoch": 0.4780016447368421, + "grad_norm": 1.91419284921669, + "learning_rate": 7.57186517692085e-05, + "loss": 3.8809, + "step": 4650 + }, + { + "epoch": 0.4790296052631579, + "grad_norm": 1.7348814216350348, + "learning_rate": 7.57003815038843e-05, + "loss": 3.9328, + "step": 4660 + }, + { + "epoch": 0.4800575657894737, + "grad_norm": 1.8324912984102788, + "learning_rate": 7.5682074708606e-05, + "loss": 3.9162, + "step": 4670 + }, + { + "epoch": 0.4810855263157895, + "grad_norm": 1.2756645994941331, + "learning_rate": 7.566373140351636e-05, + "loss": 3.8127, + "step": 4680 + }, + { + "epoch": 0.48211348684210525, + "grad_norm": 1.1942371692507243, + "learning_rate": 7.564535160879828e-05, + "loss": 3.9132, + "step": 4690 + }, + { + "epoch": 0.4831414473684211, + "grad_norm": 1.2018522987066647, + "learning_rate": 7.56269353446748e-05, + "loss": 3.8213, + "step": 4700 + }, + { + "epoch": 0.48416940789473684, + "grad_norm": 1.284292052251979, + "learning_rate": 7.560848263140909e-05, + "loss": 3.8025, + "step": 4710 + }, + { + "epoch": 0.48519736842105265, + "grad_norm": 1.549226995726094, + "learning_rate": 7.558999348930448e-05, + "loss": 3.8556, + "step": 4720 + }, + { + "epoch": 0.4862253289473684, + "grad_norm": 
1.482558574580778, + "learning_rate": 7.557146793870429e-05, + "loss": 3.9071, + "step": 4730 + }, + { + "epoch": 0.48725328947368424, + "grad_norm": 1.1119972721862268, + "learning_rate": 7.555290599999195e-05, + "loss": 3.8059, + "step": 4740 + }, + { + "epoch": 0.48828125, + "grad_norm": 1.156794084947614, + "learning_rate": 7.553430769359096e-05, + "loss": 3.8761, + "step": 4750 + }, + { + "epoch": 0.48930921052631576, + "grad_norm": 1.1081182318930962, + "learning_rate": 7.551567303996474e-05, + "loss": 3.8928, + "step": 4760 + }, + { + "epoch": 0.4903371710526316, + "grad_norm": 1.4880942880536634, + "learning_rate": 7.549700205961681e-05, + "loss": 3.8373, + "step": 4770 + }, + { + "epoch": 0.49136513157894735, + "grad_norm": 1.1700327825436634, + "learning_rate": 7.547829477309059e-05, + "loss": 3.8697, + "step": 4780 + }, + { + "epoch": 0.49239309210526316, + "grad_norm": 1.2040439910614535, + "learning_rate": 7.545955120096947e-05, + "loss": 3.8735, + "step": 4790 + }, + { + "epoch": 0.4934210526315789, + "grad_norm": 1.3005237496307311, + "learning_rate": 7.544077136387677e-05, + "loss": 3.9122, + "step": 4800 + }, + { + "epoch": 0.49444901315789475, + "grad_norm": 1.1137697453145345, + "learning_rate": 7.542195528247569e-05, + "loss": 3.945, + "step": 4810 + }, + { + "epoch": 0.4954769736842105, + "grad_norm": 1.5153902796259981, + "learning_rate": 7.540310297746932e-05, + "loss": 3.8504, + "step": 4820 + }, + { + "epoch": 0.4965049342105263, + "grad_norm": 1.6913512046966426, + "learning_rate": 7.538421446960064e-05, + "loss": 3.8516, + "step": 4830 + }, + { + "epoch": 0.4975328947368421, + "grad_norm": 1.817268085086942, + "learning_rate": 7.536528977965239e-05, + "loss": 3.8658, + "step": 4840 + }, + { + "epoch": 0.4985608552631579, + "grad_norm": 1.4540767219220152, + "learning_rate": 7.53463289284472e-05, + "loss": 3.9009, + "step": 4850 + }, + { + "epoch": 0.4995888157894737, + "grad_norm": 1.06637506124979, + "learning_rate": 7.532733193684744e-05, + "loss": 3.7926, + "step": 4860 + }, + { + "epoch": 0.5006167763157895, + "grad_norm": 1.2827455584475747, + "learning_rate": 7.530829882575526e-05, + "loss": 3.8048, + "step": 4870 + }, + { + "epoch": 0.5016447368421053, + "grad_norm": 0.9856914920669203, + "learning_rate": 7.528922961611252e-05, + "loss": 3.9616, + "step": 4880 + }, + { + "epoch": 0.502672697368421, + "grad_norm": 1.32692695886093, + "learning_rate": 7.527012432890086e-05, + "loss": 3.8424, + "step": 4890 + }, + { + "epoch": 0.5037006578947368, + "grad_norm": 2.1420983583251965, + "learning_rate": 7.525098298514157e-05, + "loss": 3.8605, + "step": 4900 + }, + { + "epoch": 0.5047286184210527, + "grad_norm": 1.9897220689692228, + "learning_rate": 7.523180560589562e-05, + "loss": 3.864, + "step": 4910 + }, + { + "epoch": 0.5057565789473685, + "grad_norm": 3.5867905609779216, + "learning_rate": 7.521259221226364e-05, + "loss": 3.8629, + "step": 4920 + }, + { + "epoch": 0.5067845394736842, + "grad_norm": 1.4500143741026743, + "learning_rate": 7.519334282538589e-05, + "loss": 3.8513, + "step": 4930 + }, + { + "epoch": 0.5078125, + "grad_norm": 1.0091369560796957, + "learning_rate": 7.51740574664422e-05, + "loss": 3.8632, + "step": 4940 + }, + { + "epoch": 0.5088404605263158, + "grad_norm": 0.964184802441722, + "learning_rate": 7.515473615665204e-05, + "loss": 3.8131, + "step": 4950 + }, + { + "epoch": 0.5098684210526315, + "grad_norm": 1.2048675006815515, + "learning_rate": 7.513537891727437e-05, + "loss": 3.8037, + "step": 4960 + }, + { + "epoch": 
0.5108963815789473, + "grad_norm": 2.578600383048006, + "learning_rate": 7.511598576960772e-05, + "loss": 3.8826, + "step": 4970 + }, + { + "epoch": 0.5119243421052632, + "grad_norm": 1.4728955142792333, + "learning_rate": 7.509655673499011e-05, + "loss": 3.8929, + "step": 4980 + }, + { + "epoch": 0.512952302631579, + "grad_norm": 1.326710879381479, + "learning_rate": 7.507709183479908e-05, + "loss": 3.8563, + "step": 4990 + }, + { + "epoch": 0.5139802631578947, + "grad_norm": 2.0782715021565457, + "learning_rate": 7.505759109045159e-05, + "loss": 3.9103, + "step": 5000 + }, + { + "epoch": 0.5150082236842105, + "grad_norm": 1.0879138005799773, + "learning_rate": 7.503805452340408e-05, + "loss": 3.8341, + "step": 5010 + }, + { + "epoch": 0.5160361842105263, + "grad_norm": 1.4341789527173234, + "learning_rate": 7.501848215515238e-05, + "loss": 3.843, + "step": 5020 + }, + { + "epoch": 0.5170641447368421, + "grad_norm": 1.5274342854001348, + "learning_rate": 7.499887400723169e-05, + "loss": 3.8024, + "step": 5030 + }, + { + "epoch": 0.5180921052631579, + "grad_norm": 1.0046410747142438, + "learning_rate": 7.497923010121663e-05, + "loss": 3.8891, + "step": 5040 + }, + { + "epoch": 0.5191200657894737, + "grad_norm": 1.1411510978703812, + "learning_rate": 7.495955045872112e-05, + "loss": 3.7942, + "step": 5050 + }, + { + "epoch": 0.5201480263157895, + "grad_norm": 1.6978355653580322, + "learning_rate": 7.493983510139844e-05, + "loss": 3.8809, + "step": 5060 + }, + { + "epoch": 0.5211759868421053, + "grad_norm": 1.597062725820269, + "learning_rate": 7.492008405094113e-05, + "loss": 3.8053, + "step": 5070 + }, + { + "epoch": 0.522203947368421, + "grad_norm": 1.6002692979442532, + "learning_rate": 7.490029732908102e-05, + "loss": 3.8424, + "step": 5080 + }, + { + "epoch": 0.5232319078947368, + "grad_norm": 1.1566123394662284, + "learning_rate": 7.48804749575892e-05, + "loss": 3.8757, + "step": 5090 + }, + { + "epoch": 0.5242598684210527, + "grad_norm": 1.4443144780564368, + "learning_rate": 7.486061695827594e-05, + "loss": 3.7893, + "step": 5100 + }, + { + "epoch": 0.5252878289473685, + "grad_norm": 1.2079804277823936, + "learning_rate": 7.484072335299077e-05, + "loss": 3.9215, + "step": 5110 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.0917740002250667, + "learning_rate": 7.482079416362233e-05, + "loss": 3.81, + "step": 5120 + }, + { + "epoch": 0.52734375, + "grad_norm": 1.1192786113361664, + "learning_rate": 7.480082941209852e-05, + "loss": 3.8524, + "step": 5130 + }, + { + "epoch": 0.5283717105263158, + "grad_norm": 1.6249884825078242, + "learning_rate": 7.478082912038625e-05, + "loss": 3.7593, + "step": 5140 + }, + { + "epoch": 0.5293996710526315, + "grad_norm": 1.4038217135935829, + "learning_rate": 7.47607933104916e-05, + "loss": 3.7846, + "step": 5150 + }, + { + "epoch": 0.5304276315789473, + "grad_norm": 3.4252013321335455, + "learning_rate": 7.47407220044597e-05, + "loss": 3.8369, + "step": 5160 + }, + { + "epoch": 0.5314555921052632, + "grad_norm": 1.7122504365719675, + "learning_rate": 7.472061522437476e-05, + "loss": 3.88, + "step": 5170 + }, + { + "epoch": 0.532483552631579, + "grad_norm": 1.8370408009881651, + "learning_rate": 7.470047299236001e-05, + "loss": 3.8276, + "step": 5180 + }, + { + "epoch": 0.5335115131578947, + "grad_norm": 1.2155745292326092, + "learning_rate": 7.468029533057771e-05, + "loss": 3.8396, + "step": 5190 + }, + { + "epoch": 0.5345394736842105, + "grad_norm": 1.1968435816214282, + "learning_rate": 7.466008226122904e-05, + "loss": 3.8477, + "step": 
5200 + }, + { + "epoch": 0.5355674342105263, + "grad_norm": 1.6433209022350985, + "learning_rate": 7.463983380655423e-05, + "loss": 3.8858, + "step": 5210 + }, + { + "epoch": 0.5365953947368421, + "grad_norm": 1.0842442106585386, + "learning_rate": 7.461954998883236e-05, + "loss": 3.8384, + "step": 5220 + }, + { + "epoch": 0.5376233552631579, + "grad_norm": 1.2043892794338134, + "learning_rate": 7.459923083038147e-05, + "loss": 3.9242, + "step": 5230 + }, + { + "epoch": 0.5386513157894737, + "grad_norm": 1.5190588994112015, + "learning_rate": 7.457887635355846e-05, + "loss": 3.7986, + "step": 5240 + }, + { + "epoch": 0.5396792763157895, + "grad_norm": 1.848845407682037, + "learning_rate": 7.455848658075911e-05, + "loss": 3.8144, + "step": 5250 + }, + { + "epoch": 0.5407072368421053, + "grad_norm": 1.1644873131944986, + "learning_rate": 7.453806153441802e-05, + "loss": 3.8155, + "step": 5260 + }, + { + "epoch": 0.541735197368421, + "grad_norm": 1.0595657559485838, + "learning_rate": 7.45176012370086e-05, + "loss": 3.7482, + "step": 5270 + }, + { + "epoch": 0.5427631578947368, + "grad_norm": 1.387947934677526, + "learning_rate": 7.449710571104308e-05, + "loss": 3.8343, + "step": 5280 + }, + { + "epoch": 0.5437911184210527, + "grad_norm": 1.178865816043549, + "learning_rate": 7.44765749790724e-05, + "loss": 3.8937, + "step": 5290 + }, + { + "epoch": 0.5448190789473685, + "grad_norm": 0.956117840279658, + "learning_rate": 7.445600906368624e-05, + "loss": 3.8172, + "step": 5300 + }, + { + "epoch": 0.5458470394736842, + "grad_norm": 1.5802830208428982, + "learning_rate": 7.443540798751306e-05, + "loss": 3.7986, + "step": 5310 + }, + { + "epoch": 0.546875, + "grad_norm": 1.1889472474474339, + "learning_rate": 7.441477177321994e-05, + "loss": 3.7997, + "step": 5320 + }, + { + "epoch": 0.5479029605263158, + "grad_norm": 1.3098317681098657, + "learning_rate": 7.439410044351264e-05, + "loss": 3.7958, + "step": 5330 + }, + { + "epoch": 0.5489309210526315, + "grad_norm": 1.4098645698492327, + "learning_rate": 7.437339402113556e-05, + "loss": 3.7863, + "step": 5340 + }, + { + "epoch": 0.5499588815789473, + "grad_norm": 1.2663742484980145, + "learning_rate": 7.43526525288717e-05, + "loss": 3.8546, + "step": 5350 + }, + { + "epoch": 0.5509868421052632, + "grad_norm": 1.3674519310405997, + "learning_rate": 7.433187598954268e-05, + "loss": 3.8523, + "step": 5360 + }, + { + "epoch": 0.552014802631579, + "grad_norm": 2.0035947442307007, + "learning_rate": 7.431106442600864e-05, + "loss": 3.8452, + "step": 5370 + }, + { + "epoch": 0.5530427631578947, + "grad_norm": 1.4049429859526208, + "learning_rate": 7.429021786116828e-05, + "loss": 3.816, + "step": 5380 + }, + { + "epoch": 0.5540707236842105, + "grad_norm": 1.2794590505059908, + "learning_rate": 7.426933631795884e-05, + "loss": 3.9298, + "step": 5390 + }, + { + "epoch": 0.5550986842105263, + "grad_norm": 1.219240481153527, + "learning_rate": 7.424841981935597e-05, + "loss": 3.8022, + "step": 5400 + }, + { + "epoch": 0.5561266447368421, + "grad_norm": 1.2235987141722464, + "learning_rate": 7.422746838837384e-05, + "loss": 3.8993, + "step": 5410 + }, + { + "epoch": 0.5571546052631579, + "grad_norm": 1.4474021033880708, + "learning_rate": 7.420648204806505e-05, + "loss": 3.8594, + "step": 5420 + }, + { + "epoch": 0.5581825657894737, + "grad_norm": 1.4246844527907938, + "learning_rate": 7.418546082152061e-05, + "loss": 3.8983, + "step": 5430 + }, + { + "epoch": 0.5592105263157895, + "grad_norm": 1.0407233480999039, + "learning_rate": 7.416440473186986e-05, + 
"loss": 3.8726, + "step": 5440 + }, + { + "epoch": 0.5602384868421053, + "grad_norm": 1.2489641188910932, + "learning_rate": 7.41433138022806e-05, + "loss": 3.8339, + "step": 5450 + }, + { + "epoch": 0.561266447368421, + "grad_norm": 1.08655766404584, + "learning_rate": 7.412218805595887e-05, + "loss": 3.88, + "step": 5460 + }, + { + "epoch": 0.5622944078947368, + "grad_norm": 1.56392025295959, + "learning_rate": 7.410102751614908e-05, + "loss": 3.8162, + "step": 5470 + }, + { + "epoch": 0.5633223684210527, + "grad_norm": 1.862031785626057, + "learning_rate": 7.40798322061339e-05, + "loss": 3.8741, + "step": 5480 + }, + { + "epoch": 0.5643503289473685, + "grad_norm": 1.3963924351017691, + "learning_rate": 7.405860214923424e-05, + "loss": 3.7374, + "step": 5490 + }, + { + "epoch": 0.5653782894736842, + "grad_norm": 1.1486163875843542, + "learning_rate": 7.403733736880929e-05, + "loss": 3.8462, + "step": 5500 + }, + { + "epoch": 0.56640625, + "grad_norm": 1.123706179603558, + "learning_rate": 7.401603788825638e-05, + "loss": 3.7971, + "step": 5510 + }, + { + "epoch": 0.5674342105263158, + "grad_norm": 1.65807084093962, + "learning_rate": 7.399470373101108e-05, + "loss": 3.8934, + "step": 5520 + }, + { + "epoch": 0.5684621710526315, + "grad_norm": 2.5612769580947106, + "learning_rate": 7.397333492054707e-05, + "loss": 3.8454, + "step": 5530 + }, + { + "epoch": 0.5694901315789473, + "grad_norm": 1.1727041744272058, + "learning_rate": 7.39519314803762e-05, + "loss": 3.8976, + "step": 5540 + }, + { + "epoch": 0.5705180921052632, + "grad_norm": 0.9923316069953039, + "learning_rate": 7.39304934340484e-05, + "loss": 3.7919, + "step": 5550 + }, + { + "epoch": 0.571546052631579, + "grad_norm": 1.3657578322974882, + "learning_rate": 7.390902080515166e-05, + "loss": 3.773, + "step": 5560 + }, + { + "epoch": 0.5725740131578947, + "grad_norm": 1.2231480398527521, + "learning_rate": 7.388751361731204e-05, + "loss": 3.8328, + "step": 5570 + }, + { + "epoch": 0.5736019736842105, + "grad_norm": 1.4177765518955152, + "learning_rate": 7.386597189419364e-05, + "loss": 3.928, + "step": 5580 + }, + { + "epoch": 0.5746299342105263, + "grad_norm": 1.2904725194825861, + "learning_rate": 7.384439565949851e-05, + "loss": 3.9005, + "step": 5590 + }, + { + "epoch": 0.5756578947368421, + "grad_norm": 1.3226000589691154, + "learning_rate": 7.382278493696674e-05, + "loss": 3.8614, + "step": 5600 + }, + { + "epoch": 0.5766858552631579, + "grad_norm": 1.9282251910079249, + "learning_rate": 7.380113975037632e-05, + "loss": 3.8889, + "step": 5610 + }, + { + "epoch": 0.5777138157894737, + "grad_norm": 1.5371564496792138, + "learning_rate": 7.377946012354318e-05, + "loss": 3.8425, + "step": 5620 + }, + { + "epoch": 0.5787417763157895, + "grad_norm": 1.141324756175267, + "learning_rate": 7.375774608032109e-05, + "loss": 3.8469, + "step": 5630 + }, + { + "epoch": 0.5797697368421053, + "grad_norm": 1.7914548645210715, + "learning_rate": 7.37359976446018e-05, + "loss": 3.8556, + "step": 5640 + }, + { + "epoch": 0.580797697368421, + "grad_norm": 1.1480604837447559, + "learning_rate": 7.371421484031479e-05, + "loss": 3.8173, + "step": 5650 + }, + { + "epoch": 0.5818256578947368, + "grad_norm": 1.0185359027772545, + "learning_rate": 7.369239769142741e-05, + "loss": 3.8786, + "step": 5660 + }, + { + "epoch": 0.5828536184210527, + "grad_norm": 1.1987949579298964, + "learning_rate": 7.367054622194477e-05, + "loss": 3.8156, + "step": 5670 + }, + { + "epoch": 0.5838815789473685, + "grad_norm": 1.348092934444201, + "learning_rate": 
7.364866045590982e-05, + "loss": 3.8353, + "step": 5680 + }, + { + "epoch": 0.5849095394736842, + "grad_norm": 1.3199177891243823, + "learning_rate": 7.362674041740312e-05, + "loss": 3.8635, + "step": 5690 + }, + { + "epoch": 0.5859375, + "grad_norm": 1.2607384525778575, + "learning_rate": 7.360478613054306e-05, + "loss": 3.8593, + "step": 5700 + }, + { + "epoch": 0.5869654605263158, + "grad_norm": 0.8804834413698992, + "learning_rate": 7.358279761948562e-05, + "loss": 3.8458, + "step": 5710 + }, + { + "epoch": 0.5879934210526315, + "grad_norm": 1.2579222447635028, + "learning_rate": 7.35607749084245e-05, + "loss": 3.8872, + "step": 5720 + }, + { + "epoch": 0.5890213815789473, + "grad_norm": 1.0882833378122079, + "learning_rate": 7.353871802159101e-05, + "loss": 3.9097, + "step": 5730 + }, + { + "epoch": 0.5900493421052632, + "grad_norm": 1.4823924427352442, + "learning_rate": 7.351662698325406e-05, + "loss": 3.8499, + "step": 5740 + }, + { + "epoch": 0.591077302631579, + "grad_norm": 1.2935379941895035, + "learning_rate": 7.349450181772012e-05, + "loss": 3.7969, + "step": 5750 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 1.2031957432149096, + "learning_rate": 7.347234254933321e-05, + "loss": 3.8466, + "step": 5760 + }, + { + "epoch": 0.5931332236842105, + "grad_norm": 1.2206752926514577, + "learning_rate": 7.345014920247494e-05, + "loss": 3.8628, + "step": 5770 + }, + { + "epoch": 0.5941611842105263, + "grad_norm": 1.5450956340931303, + "learning_rate": 7.342792180156432e-05, + "loss": 3.8161, + "step": 5780 + }, + { + "epoch": 0.5951891447368421, + "grad_norm": 1.1619721147273943, + "learning_rate": 7.34056603710579e-05, + "loss": 3.7993, + "step": 5790 + }, + { + "epoch": 0.5962171052631579, + "grad_norm": 1.3214170812600667, + "learning_rate": 7.338336493544963e-05, + "loss": 3.7644, + "step": 5800 + }, + { + "epoch": 0.5972450657894737, + "grad_norm": 1.1121024332208378, + "learning_rate": 7.336103551927092e-05, + "loss": 3.8558, + "step": 5810 + }, + { + "epoch": 0.5982730263157895, + "grad_norm": 1.1604435625960101, + "learning_rate": 7.33386721470905e-05, + "loss": 3.8479, + "step": 5820 + }, + { + "epoch": 0.5993009868421053, + "grad_norm": 2.8246127918187396, + "learning_rate": 7.331627484351453e-05, + "loss": 3.7939, + "step": 5830 + }, + { + "epoch": 0.600328947368421, + "grad_norm": 1.0274272928246628, + "learning_rate": 7.329384363318648e-05, + "loss": 3.9029, + "step": 5840 + }, + { + "epoch": 0.6013569078947368, + "grad_norm": 1.5034449278497934, + "learning_rate": 7.327137854078708e-05, + "loss": 3.807, + "step": 5850 + }, + { + "epoch": 0.6023848684210527, + "grad_norm": 1.4904427438953165, + "learning_rate": 7.324887959103445e-05, + "loss": 3.8673, + "step": 5860 + }, + { + "epoch": 0.6034128289473685, + "grad_norm": 1.4667965499125806, + "learning_rate": 7.322634680868384e-05, + "loss": 3.8931, + "step": 5870 + }, + { + "epoch": 0.6044407894736842, + "grad_norm": 1.439869379238759, + "learning_rate": 7.320378021852782e-05, + "loss": 3.8239, + "step": 5880 + }, + { + "epoch": 0.60546875, + "grad_norm": 1.1487548285913114, + "learning_rate": 7.318117984539607e-05, + "loss": 3.8776, + "step": 5890 + }, + { + "epoch": 0.6064967105263158, + "grad_norm": 1.6818981438684883, + "learning_rate": 7.315854571415554e-05, + "loss": 3.7871, + "step": 5900 + }, + { + "epoch": 0.6075246710526315, + "grad_norm": 1.36634255593947, + "learning_rate": 7.313587784971025e-05, + "loss": 3.8479, + "step": 5910 + }, + { + "epoch": 0.6085526315789473, + "grad_norm": 1.3575491646562652, 
+ "learning_rate": 7.311317627700138e-05, + "loss": 3.8635, + "step": 5920 + }, + { + "epoch": 0.6095805921052632, + "grad_norm": 0.9860361705803903, + "learning_rate": 7.309044102100716e-05, + "loss": 3.804, + "step": 5930 + }, + { + "epoch": 0.610608552631579, + "grad_norm": 1.3066960937404246, + "learning_rate": 7.306767210674291e-05, + "loss": 3.8197, + "step": 5940 + }, + { + "epoch": 0.6116365131578947, + "grad_norm": 1.722986652408913, + "learning_rate": 7.304486955926097e-05, + "loss": 3.7668, + "step": 5950 + }, + { + "epoch": 0.6126644736842105, + "grad_norm": 2.039550956596383, + "learning_rate": 7.30220334036507e-05, + "loss": 3.8024, + "step": 5960 + }, + { + "epoch": 0.6136924342105263, + "grad_norm": 1.1519636141408476, + "learning_rate": 7.299916366503841e-05, + "loss": 3.8828, + "step": 5970 + }, + { + "epoch": 0.6147203947368421, + "grad_norm": 1.117008701768686, + "learning_rate": 7.29762603685874e-05, + "loss": 3.8045, + "step": 5980 + }, + { + "epoch": 0.6157483552631579, + "grad_norm": 1.7868574121176048, + "learning_rate": 7.295332353949785e-05, + "loss": 3.7772, + "step": 5990 + }, + { + "epoch": 0.6167763157894737, + "grad_norm": 1.3752058481973077, + "learning_rate": 7.293035320300689e-05, + "loss": 3.7932, + "step": 6000 + }, + { + "epoch": 0.6178042763157895, + "grad_norm": 1.1279075455246879, + "learning_rate": 7.290734938438846e-05, + "loss": 3.7686, + "step": 6010 + }, + { + "epoch": 0.6188322368421053, + "grad_norm": 1.2294268359691929, + "learning_rate": 7.288431210895339e-05, + "loss": 3.7372, + "step": 6020 + }, + { + "epoch": 0.619860197368421, + "grad_norm": 1.1603412416464973, + "learning_rate": 7.286124140204928e-05, + "loss": 3.8749, + "step": 6030 + }, + { + "epoch": 0.6208881578947368, + "grad_norm": 1.2816880570517106, + "learning_rate": 7.283813728906054e-05, + "loss": 3.829, + "step": 6040 + }, + { + "epoch": 0.6219161184210527, + "grad_norm": 1.0627328954023276, + "learning_rate": 7.281499979540833e-05, + "loss": 3.9196, + "step": 6050 + }, + { + "epoch": 0.6229440789473685, + "grad_norm": 1.3616960813612888, + "learning_rate": 7.279182894655052e-05, + "loss": 3.912, + "step": 6060 + }, + { + "epoch": 0.6239720394736842, + "grad_norm": 1.6771880249454763, + "learning_rate": 7.276862476798174e-05, + "loss": 3.8457, + "step": 6070 + }, + { + "epoch": 0.625, + "grad_norm": 1.0826677454072373, + "learning_rate": 7.27453872852332e-05, + "loss": 3.7974, + "step": 6080 + }, + { + "epoch": 0.6260279605263158, + "grad_norm": 1.6380932846987073, + "learning_rate": 7.272211652387285e-05, + "loss": 3.8451, + "step": 6090 + }, + { + "epoch": 0.6270559210526315, + "grad_norm": 1.1901030370037746, + "learning_rate": 7.269881250950518e-05, + "loss": 3.8437, + "step": 6100 + }, + { + "epoch": 0.6280838815789473, + "grad_norm": 1.414363765764649, + "learning_rate": 7.267547526777131e-05, + "loss": 3.8357, + "step": 6110 + }, + { + "epoch": 0.6291118421052632, + "grad_norm": 1.1950726735988906, + "learning_rate": 7.26521048243489e-05, + "loss": 3.8772, + "step": 6120 + }, + { + "epoch": 0.630139802631579, + "grad_norm": 1.2634295978624541, + "learning_rate": 7.262870120495216e-05, + "loss": 3.7949, + "step": 6130 + }, + { + "epoch": 0.6311677631578947, + "grad_norm": 1.168565272455925, + "learning_rate": 7.260526443533179e-05, + "loss": 3.8391, + "step": 6140 + }, + { + "epoch": 0.6321957236842105, + "grad_norm": 1.2094457482435232, + "learning_rate": 7.258179454127495e-05, + "loss": 3.8545, + "step": 6150 + }, + { + "epoch": 0.6332236842105263, + "grad_norm": 
1.1660192032700016, + "learning_rate": 7.255829154860528e-05, + "loss": 3.7324, + "step": 6160 + }, + { + "epoch": 0.6342516447368421, + "grad_norm": 1.5248148633798886, + "learning_rate": 7.253475548318283e-05, + "loss": 3.8706, + "step": 6170 + }, + { + "epoch": 0.6352796052631579, + "grad_norm": 1.134274611221418, + "learning_rate": 7.251118637090402e-05, + "loss": 3.7945, + "step": 6180 + }, + { + "epoch": 0.6363075657894737, + "grad_norm": 1.2371256637825851, + "learning_rate": 7.248758423770162e-05, + "loss": 3.8546, + "step": 6190 + }, + { + "epoch": 0.6373355263157895, + "grad_norm": 1.2010866212440996, + "learning_rate": 7.24639491095448e-05, + "loss": 3.8398, + "step": 6200 + }, + { + "epoch": 0.6383634868421053, + "grad_norm": 0.8415869077024948, + "learning_rate": 7.244028101243895e-05, + "loss": 3.7575, + "step": 6210 + }, + { + "epoch": 0.639391447368421, + "grad_norm": 1.3772347236709808, + "learning_rate": 7.24165799724258e-05, + "loss": 3.8557, + "step": 6220 + }, + { + "epoch": 0.6404194078947368, + "grad_norm": 1.1683632174944911, + "learning_rate": 7.239284601558328e-05, + "loss": 3.8163, + "step": 6230 + }, + { + "epoch": 0.6414473684210527, + "grad_norm": 1.4104152874773157, + "learning_rate": 7.236907916802558e-05, + "loss": 3.8129, + "step": 6240 + }, + { + "epoch": 0.6424753289473685, + "grad_norm": 1.548442547248136, + "learning_rate": 7.234527945590302e-05, + "loss": 3.7391, + "step": 6250 + }, + { + "epoch": 0.6435032894736842, + "grad_norm": 0.8873275145966385, + "learning_rate": 7.232144690540216e-05, + "loss": 3.7699, + "step": 6260 + }, + { + "epoch": 0.64453125, + "grad_norm": 1.2443627079922268, + "learning_rate": 7.229758154274564e-05, + "loss": 3.7642, + "step": 6270 + }, + { + "epoch": 0.6455592105263158, + "grad_norm": 1.1125866245448437, + "learning_rate": 7.227368339419219e-05, + "loss": 3.8311, + "step": 6280 + }, + { + "epoch": 0.6465871710526315, + "grad_norm": 1.4050698732669888, + "learning_rate": 7.224975248603666e-05, + "loss": 3.7512, + "step": 6290 + }, + { + "epoch": 0.6476151315789473, + "grad_norm": 1.2696856126106988, + "learning_rate": 7.222578884460992e-05, + "loss": 3.8346, + "step": 6300 + }, + { + "epoch": 0.6486430921052632, + "grad_norm": 1.0943242336765941, + "learning_rate": 7.220179249627886e-05, + "loss": 3.8665, + "step": 6310 + }, + { + "epoch": 0.649671052631579, + "grad_norm": 1.439460247130871, + "learning_rate": 7.217776346744631e-05, + "loss": 3.8161, + "step": 6320 + }, + { + "epoch": 0.6506990131578947, + "grad_norm": 1.1845514935966506, + "learning_rate": 7.215370178455116e-05, + "loss": 3.8763, + "step": 6330 + }, + { + "epoch": 0.6517269736842105, + "grad_norm": 1.0261786058057774, + "learning_rate": 7.212960747406815e-05, + "loss": 3.8237, + "step": 6340 + }, + { + "epoch": 0.6527549342105263, + "grad_norm": 1.531550008228219, + "learning_rate": 7.210548056250793e-05, + "loss": 3.8182, + "step": 6350 + }, + { + "epoch": 0.6537828947368421, + "grad_norm": 1.5231252497006393, + "learning_rate": 7.208132107641704e-05, + "loss": 3.8497, + "step": 6360 + }, + { + "epoch": 0.6548108552631579, + "grad_norm": 1.0100744133154598, + "learning_rate": 7.205712904237786e-05, + "loss": 3.8543, + "step": 6370 + }, + { + "epoch": 0.6558388157894737, + "grad_norm": 1.2570801018325766, + "learning_rate": 7.203290448700857e-05, + "loss": 3.7715, + "step": 6380 + }, + { + "epoch": 0.6568667763157895, + "grad_norm": 1.2467588843868025, + "learning_rate": 7.200864743696314e-05, + "loss": 3.8038, + "step": 6390 + }, + { + "epoch": 
0.6578947368421053, + "grad_norm": 1.9510210225210833, + "learning_rate": 7.198435791893128e-05, + "loss": 3.8183, + "step": 6400 + }, + { + "epoch": 0.658922697368421, + "grad_norm": 1.4543365022279564, + "learning_rate": 7.196003595963845e-05, + "loss": 3.7845, + "step": 6410 + }, + { + "epoch": 0.6599506578947368, + "grad_norm": 1.2374526660228151, + "learning_rate": 7.193568158584577e-05, + "loss": 3.8761, + "step": 6420 + }, + { + "epoch": 0.6609786184210527, + "grad_norm": 1.6890568792230207, + "learning_rate": 7.191129482435007e-05, + "loss": 3.7812, + "step": 6430 + }, + { + "epoch": 0.6620065789473685, + "grad_norm": 1.244479817335219, + "learning_rate": 7.188687570198377e-05, + "loss": 3.8312, + "step": 6440 + }, + { + "epoch": 0.6630345394736842, + "grad_norm": 1.5836166161879746, + "learning_rate": 7.186242424561493e-05, + "loss": 3.8398, + "step": 6450 + }, + { + "epoch": 0.6640625, + "grad_norm": 0.7748969594074909, + "learning_rate": 7.183794048214717e-05, + "loss": 3.7945, + "step": 6460 + }, + { + "epoch": 0.6650904605263158, + "grad_norm": 1.649001833963849, + "learning_rate": 7.181342443851966e-05, + "loss": 3.7865, + "step": 6470 + }, + { + "epoch": 0.6661184210526315, + "grad_norm": 1.7603613113036878, + "learning_rate": 7.178887614170709e-05, + "loss": 3.8069, + "step": 6480 + }, + { + "epoch": 0.6671463815789473, + "grad_norm": 1.228007491995979, + "learning_rate": 7.176429561871963e-05, + "loss": 3.8928, + "step": 6490 + }, + { + "epoch": 0.6681743421052632, + "grad_norm": 1.9989331499441287, + "learning_rate": 7.173968289660293e-05, + "loss": 3.8261, + "step": 6500 + }, + { + "epoch": 0.669202302631579, + "grad_norm": 1.1044863052278984, + "learning_rate": 7.171503800243804e-05, + "loss": 3.7358, + "step": 6510 + }, + { + "epoch": 0.6702302631578947, + "grad_norm": 1.1786576379894809, + "learning_rate": 7.169036096334142e-05, + "loss": 3.71, + "step": 6520 + }, + { + "epoch": 0.6712582236842105, + "grad_norm": 1.7617422301087384, + "learning_rate": 7.166565180646491e-05, + "loss": 3.7589, + "step": 6530 + }, + { + "epoch": 0.6722861842105263, + "grad_norm": 4.547046566377635, + "learning_rate": 7.164091055899566e-05, + "loss": 3.7976, + "step": 6540 + }, + { + "epoch": 0.6733141447368421, + "grad_norm": 1.1119592145191424, + "learning_rate": 7.161613724815617e-05, + "loss": 3.8014, + "step": 6550 + }, + { + "epoch": 0.6743421052631579, + "grad_norm": 1.068465874365739, + "learning_rate": 7.159133190120418e-05, + "loss": 3.7893, + "step": 6560 + }, + { + "epoch": 0.6753700657894737, + "grad_norm": 1.027887410826216, + "learning_rate": 7.156649454543271e-05, + "loss": 3.7969, + "step": 6570 + }, + { + "epoch": 0.6763980263157895, + "grad_norm": 1.141806630560445, + "learning_rate": 7.154162520816997e-05, + "loss": 3.837, + "step": 6580 + }, + { + "epoch": 0.6774259868421053, + "grad_norm": 1.5841446688410838, + "learning_rate": 7.151672391677937e-05, + "loss": 3.8547, + "step": 6590 + }, + { + "epoch": 0.678453947368421, + "grad_norm": 1.5263683589996704, + "learning_rate": 7.14917906986595e-05, + "loss": 3.8278, + "step": 6600 + }, + { + "epoch": 0.6794819078947368, + "grad_norm": 1.3323860031662786, + "learning_rate": 7.146682558124403e-05, + "loss": 3.8104, + "step": 6610 + }, + { + "epoch": 0.6805098684210527, + "grad_norm": 1.5894039961149622, + "learning_rate": 7.14418285920018e-05, + "loss": 3.8144, + "step": 6620 + }, + { + "epoch": 0.6815378289473685, + "grad_norm": 1.153524072333905, + "learning_rate": 7.141679975843665e-05, + "loss": 3.8308, + "step": 6630 
+ }, + { + "epoch": 0.6825657894736842, + "grad_norm": 1.3583756204275292, + "learning_rate": 7.13917391080875e-05, + "loss": 3.7962, + "step": 6640 + }, + { + "epoch": 0.68359375, + "grad_norm": 1.789116937733634, + "learning_rate": 7.136664666852822e-05, + "loss": 3.7591, + "step": 6650 + }, + { + "epoch": 0.6846217105263158, + "grad_norm": 2.6860065302652267, + "learning_rate": 7.134152246736774e-05, + "loss": 3.8603, + "step": 6660 + }, + { + "epoch": 0.6856496710526315, + "grad_norm": 1.542574288852971, + "learning_rate": 7.131636653224991e-05, + "loss": 3.8544, + "step": 6670 + }, + { + "epoch": 0.6866776315789473, + "grad_norm": 1.2382551124293153, + "learning_rate": 7.129117889085346e-05, + "loss": 3.7789, + "step": 6680 + }, + { + "epoch": 0.6877055921052632, + "grad_norm": 1.376264684115453, + "learning_rate": 7.126595957089201e-05, + "loss": 3.694, + "step": 6690 + }, + { + "epoch": 0.688733552631579, + "grad_norm": 1.05712890625, + "learning_rate": 7.124070860011408e-05, + "loss": 3.8305, + "step": 6700 + }, + { + "epoch": 0.6897615131578947, + "grad_norm": 1.2815042569093107, + "learning_rate": 7.121542600630299e-05, + "loss": 3.8357, + "step": 6710 + }, + { + "epoch": 0.6907894736842105, + "grad_norm": 1.5794124827977207, + "learning_rate": 7.119011181727685e-05, + "loss": 3.8173, + "step": 6720 + }, + { + "epoch": 0.6918174342105263, + "grad_norm": 0.9799019136291218, + "learning_rate": 7.116476606088851e-05, + "loss": 3.8195, + "step": 6730 + }, + { + "epoch": 0.6928453947368421, + "grad_norm": 1.6712445648036502, + "learning_rate": 7.113938876502562e-05, + "loss": 3.7663, + "step": 6740 + }, + { + "epoch": 0.6938733552631579, + "grad_norm": 1.5327330433572235, + "learning_rate": 7.11139799576105e-05, + "loss": 3.7813, + "step": 6750 + }, + { + "epoch": 0.6949013157894737, + "grad_norm": 1.2073275501797998, + "learning_rate": 7.108853966660008e-05, + "loss": 3.7825, + "step": 6760 + }, + { + "epoch": 0.6959292763157895, + "grad_norm": 1.1460475316834047, + "learning_rate": 7.106306791998604e-05, + "loss": 3.8445, + "step": 6770 + }, + { + "epoch": 0.6969572368421053, + "grad_norm": 1.3359805986914466, + "learning_rate": 7.10375647457946e-05, + "loss": 3.7644, + "step": 6780 + }, + { + "epoch": 0.697985197368421, + "grad_norm": 1.1004831965098223, + "learning_rate": 7.101203017208655e-05, + "loss": 3.8257, + "step": 6790 + }, + { + "epoch": 0.6990131578947368, + "grad_norm": 1.3409473111770067, + "learning_rate": 7.09864642269573e-05, + "loss": 3.747, + "step": 6800 + }, + { + "epoch": 0.7000411184210527, + "grad_norm": 1.4847169181142965, + "learning_rate": 7.09608669385367e-05, + "loss": 3.7652, + "step": 6810 + }, + { + "epoch": 0.7010690789473685, + "grad_norm": 2.135661810473133, + "learning_rate": 7.093523833498911e-05, + "loss": 3.8653, + "step": 6820 + }, + { + "epoch": 0.7020970394736842, + "grad_norm": 1.4897867432896452, + "learning_rate": 7.090957844451338e-05, + "loss": 3.7813, + "step": 6830 + }, + { + "epoch": 0.703125, + "grad_norm": 0.9781867595833351, + "learning_rate": 7.088388729534274e-05, + "loss": 3.8427, + "step": 6840 + }, + { + "epoch": 0.7041529605263158, + "grad_norm": 1.4953340439156284, + "learning_rate": 7.085816491574486e-05, + "loss": 3.7794, + "step": 6850 + }, + { + "epoch": 0.7051809210526315, + "grad_norm": 1.3593543917091344, + "learning_rate": 7.083241133402171e-05, + "loss": 3.7899, + "step": 6860 + }, + { + "epoch": 0.7062088815789473, + "grad_norm": 1.2296294725776309, + "learning_rate": 7.080662657850964e-05, + "loss": 3.8344, + 
"step": 6870 + }, + { + "epoch": 0.7072368421052632, + "grad_norm": 1.0051373010287563, + "learning_rate": 7.078081067757929e-05, + "loss": 3.8321, + "step": 6880 + }, + { + "epoch": 0.708264802631579, + "grad_norm": 1.3011515322320117, + "learning_rate": 7.075496365963555e-05, + "loss": 3.7833, + "step": 6890 + }, + { + "epoch": 0.7092927631578947, + "grad_norm": 1.0079417774173307, + "learning_rate": 7.07290855531176e-05, + "loss": 3.7795, + "step": 6900 + }, + { + "epoch": 0.7103207236842105, + "grad_norm": 1.122327172582984, + "learning_rate": 7.070317638649874e-05, + "loss": 3.7699, + "step": 6910 + }, + { + "epoch": 0.7113486842105263, + "grad_norm": 1.10145725261495, + "learning_rate": 7.067723618828651e-05, + "loss": 3.869, + "step": 6920 + }, + { + "epoch": 0.7123766447368421, + "grad_norm": 1.2800753721755282, + "learning_rate": 7.06512649870226e-05, + "loss": 3.7134, + "step": 6930 + }, + { + "epoch": 0.7134046052631579, + "grad_norm": 0.9723692321135506, + "learning_rate": 7.062526281128277e-05, + "loss": 3.8026, + "step": 6940 + }, + { + "epoch": 0.7144325657894737, + "grad_norm": 1.3255326945655406, + "learning_rate": 7.059922968967688e-05, + "loss": 3.8239, + "step": 6950 + }, + { + "epoch": 0.7154605263157895, + "grad_norm": 1.0062940528656743, + "learning_rate": 7.057316565084884e-05, + "loss": 3.8011, + "step": 6960 + }, + { + "epoch": 0.7164884868421053, + "grad_norm": 1.2690873998750947, + "learning_rate": 7.054707072347658e-05, + "loss": 3.7435, + "step": 6970 + }, + { + "epoch": 0.717516447368421, + "grad_norm": 1.570239924181852, + "learning_rate": 7.052094493627202e-05, + "loss": 3.727, + "step": 6980 + }, + { + "epoch": 0.7185444078947368, + "grad_norm": 1.6053324175121275, + "learning_rate": 7.049478831798102e-05, + "loss": 3.795, + "step": 6990 + }, + { + "epoch": 0.7195723684210527, + "grad_norm": 1.1266921351072892, + "learning_rate": 7.04686008973834e-05, + "loss": 3.7768, + "step": 7000 + }, + { + "epoch": 0.7206003289473685, + "grad_norm": 5.674189155897457, + "learning_rate": 7.044238270329279e-05, + "loss": 3.8397, + "step": 7010 + }, + { + "epoch": 0.7216282894736842, + "grad_norm": 1.0563893468909706, + "learning_rate": 7.041613376455678e-05, + "loss": 3.773, + "step": 7020 + }, + { + "epoch": 0.72265625, + "grad_norm": 0.979044910057384, + "learning_rate": 7.03898541100567e-05, + "loss": 3.8268, + "step": 7030 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 1.721538154899395, + "learning_rate": 7.036354376870775e-05, + "loss": 3.6828, + "step": 7040 + }, + { + "epoch": 0.7247121710526315, + "grad_norm": 1.0145188984132982, + "learning_rate": 7.033720276945886e-05, + "loss": 3.8169, + "step": 7050 + }, + { + "epoch": 0.7257401315789473, + "grad_norm": 1.603493842512488, + "learning_rate": 7.031083114129267e-05, + "loss": 3.8559, + "step": 7060 + }, + { + "epoch": 0.7267680921052632, + "grad_norm": 1.1992607482116777, + "learning_rate": 7.028442891322557e-05, + "loss": 3.7874, + "step": 7070 + }, + { + "epoch": 0.727796052631579, + "grad_norm": 1.4741973503431585, + "learning_rate": 7.025799611430758e-05, + "loss": 3.8348, + "step": 7080 + }, + { + "epoch": 0.7288240131578947, + "grad_norm": 1.0863379419370853, + "learning_rate": 7.023153277362235e-05, + "loss": 3.8409, + "step": 7090 + }, + { + "epoch": 0.7298519736842105, + "grad_norm": 0.9516101117429466, + "learning_rate": 7.020503892028721e-05, + "loss": 3.8053, + "step": 7100 + }, + { + "epoch": 0.7308799342105263, + "grad_norm": 1.0938353368983087, + "learning_rate": 7.017851458345296e-05, + 
"loss": 3.7439, + "step": 7110 + }, + { + "epoch": 0.7319078947368421, + "grad_norm": 1.598181976150085, + "learning_rate": 7.015195979230398e-05, + "loss": 3.7856, + "step": 7120 + }, + { + "epoch": 0.7329358552631579, + "grad_norm": 0.8876840320034084, + "learning_rate": 7.012537457605821e-05, + "loss": 3.7839, + "step": 7130 + }, + { + "epoch": 0.7339638157894737, + "grad_norm": 1.5185027184133908, + "learning_rate": 7.0098758963967e-05, + "loss": 3.8958, + "step": 7140 + }, + { + "epoch": 0.7349917763157895, + "grad_norm": 1.1765185607567934, + "learning_rate": 7.007211298531516e-05, + "loss": 3.8338, + "step": 7150 + }, + { + "epoch": 0.7360197368421053, + "grad_norm": 1.4289884265193404, + "learning_rate": 7.004543666942093e-05, + "loss": 3.7679, + "step": 7160 + }, + { + "epoch": 0.737047697368421, + "grad_norm": 1.2920093030653683, + "learning_rate": 7.00187300456359e-05, + "loss": 3.7931, + "step": 7170 + }, + { + "epoch": 0.7380756578947368, + "grad_norm": 1.127750636830168, + "learning_rate": 6.999199314334504e-05, + "loss": 3.8299, + "step": 7180 + }, + { + "epoch": 0.7391036184210527, + "grad_norm": 0.8093428528046228, + "learning_rate": 6.996522599196663e-05, + "loss": 3.7567, + "step": 7190 + }, + { + "epoch": 0.7401315789473685, + "grad_norm": 1.0595433666828615, + "learning_rate": 6.993842862095221e-05, + "loss": 3.7516, + "step": 7200 + }, + { + "epoch": 0.7411595394736842, + "grad_norm": 1.352349465642935, + "learning_rate": 6.99116010597866e-05, + "loss": 3.8508, + "step": 7210 + }, + { + "epoch": 0.7421875, + "grad_norm": 1.4780963822350663, + "learning_rate": 6.988474333798779e-05, + "loss": 3.812, + "step": 7220 + }, + { + "epoch": 0.7432154605263158, + "grad_norm": 1.679402029929434, + "learning_rate": 6.985785548510701e-05, + "loss": 3.7517, + "step": 7230 + }, + { + "epoch": 0.7442434210526315, + "grad_norm": 1.2203332447432989, + "learning_rate": 6.98309375307286e-05, + "loss": 3.806, + "step": 7240 + }, + { + "epoch": 0.7452713815789473, + "grad_norm": 1.5522497800463464, + "learning_rate": 6.980398950447008e-05, + "loss": 3.7709, + "step": 7250 + }, + { + "epoch": 0.7462993421052632, + "grad_norm": 1.1953394269715223, + "learning_rate": 6.9777011435982e-05, + "loss": 3.7716, + "step": 7260 + }, + { + "epoch": 0.747327302631579, + "grad_norm": 1.1925595064331054, + "learning_rate": 6.975000335494795e-05, + "loss": 3.7233, + "step": 7270 + }, + { + "epoch": 0.7483552631578947, + "grad_norm": 1.3151617761463885, + "learning_rate": 6.972296529108461e-05, + "loss": 3.7943, + "step": 7280 + }, + { + "epoch": 0.7493832236842105, + "grad_norm": 0.8753867316510539, + "learning_rate": 6.96958972741416e-05, + "loss": 3.795, + "step": 7290 + }, + { + "epoch": 0.7504111842105263, + "grad_norm": 1.2106639922346356, + "learning_rate": 6.966879933390154e-05, + "loss": 3.7914, + "step": 7300 + }, + { + "epoch": 0.7514391447368421, + "grad_norm": 1.2125582277926528, + "learning_rate": 6.964167150017989e-05, + "loss": 3.7557, + "step": 7310 + }, + { + "epoch": 0.7524671052631579, + "grad_norm": 1.3857110458864252, + "learning_rate": 6.961451380282512e-05, + "loss": 3.8477, + "step": 7320 + }, + { + "epoch": 0.7534950657894737, + "grad_norm": 1.6219219987413762, + "learning_rate": 6.958732627171846e-05, + "loss": 3.8396, + "step": 7330 + }, + { + "epoch": 0.7545230263157895, + "grad_norm": 1.1269445567739163, + "learning_rate": 6.956010893677401e-05, + "loss": 3.7952, + "step": 7340 + }, + { + "epoch": 0.7555509868421053, + "grad_norm": 1.243290489462559, + "learning_rate": 
6.953286182793866e-05, + "loss": 3.7793, + "step": 7350 + }, + { + "epoch": 0.756578947368421, + "grad_norm": 1.1373117479763237, + "learning_rate": 6.950558497519203e-05, + "loss": 3.7757, + "step": 7360 + }, + { + "epoch": 0.7576069078947368, + "grad_norm": 1.2034979898509666, + "learning_rate": 6.947827840854653e-05, + "loss": 3.774, + "step": 7370 + }, + { + "epoch": 0.7586348684210527, + "grad_norm": 0.9152596288266749, + "learning_rate": 6.94509421580472e-05, + "loss": 3.788, + "step": 7380 + }, + { + "epoch": 0.7596628289473685, + "grad_norm": 1.288139937905102, + "learning_rate": 6.942357625377177e-05, + "loss": 3.75, + "step": 7390 + }, + { + "epoch": 0.7606907894736842, + "grad_norm": 0.8257548043772176, + "learning_rate": 6.939618072583059e-05, + "loss": 3.8702, + "step": 7400 + }, + { + "epoch": 0.76171875, + "grad_norm": 1.4045895840439646, + "learning_rate": 6.936875560436662e-05, + "loss": 3.7775, + "step": 7410 + }, + { + "epoch": 0.7627467105263158, + "grad_norm": 1.9670296070098985, + "learning_rate": 6.934130091955536e-05, + "loss": 3.8328, + "step": 7420 + }, + { + "epoch": 0.7637746710526315, + "grad_norm": 1.625678947635954, + "learning_rate": 6.931381670160486e-05, + "loss": 3.7624, + "step": 7430 + }, + { + "epoch": 0.7648026315789473, + "grad_norm": 1.0563908138876728, + "learning_rate": 6.928630298075564e-05, + "loss": 3.8089, + "step": 7440 + }, + { + "epoch": 0.7658305921052632, + "grad_norm": 1.1131881942674038, + "learning_rate": 6.925875978728069e-05, + "loss": 3.7826, + "step": 7450 + }, + { + "epoch": 0.766858552631579, + "grad_norm": 1.4093936538307652, + "learning_rate": 6.923118715148545e-05, + "loss": 3.809, + "step": 7460 + }, + { + "epoch": 0.7678865131578947, + "grad_norm": 1.5088537702295453, + "learning_rate": 6.920358510370771e-05, + "loss": 3.7617, + "step": 7470 + }, + { + "epoch": 0.7689144736842105, + "grad_norm": 1.4758918490549406, + "learning_rate": 6.917595367431767e-05, + "loss": 3.8389, + "step": 7480 + }, + { + "epoch": 0.7699424342105263, + "grad_norm": 1.4782733189565929, + "learning_rate": 6.914829289371785e-05, + "loss": 3.7098, + "step": 7490 + }, + { + "epoch": 0.7709703947368421, + "grad_norm": 1.2160821226804908, + "learning_rate": 6.912060279234303e-05, + "loss": 3.7547, + "step": 7500 + }, + { + "epoch": 0.7719983552631579, + "grad_norm": 1.535981167767234, + "learning_rate": 6.909288340066028e-05, + "loss": 3.8343, + "step": 7510 + }, + { + "epoch": 0.7730263157894737, + "grad_norm": 1.8559021132487514, + "learning_rate": 6.906513474916889e-05, + "loss": 3.8304, + "step": 7520 + }, + { + "epoch": 0.7740542763157895, + "grad_norm": 1.0809404404118543, + "learning_rate": 6.903735686840035e-05, + "loss": 3.8467, + "step": 7530 + }, + { + "epoch": 0.7750822368421053, + "grad_norm": 1.6147049724740195, + "learning_rate": 6.900954978891829e-05, + "loss": 3.7565, + "step": 7540 + }, + { + "epoch": 0.776110197368421, + "grad_norm": 0.908737484820843, + "learning_rate": 6.898171354131852e-05, + "loss": 3.8139, + "step": 7550 + }, + { + "epoch": 0.7771381578947368, + "grad_norm": 1.3494157445214947, + "learning_rate": 6.895384815622886e-05, + "loss": 3.811, + "step": 7560 + }, + { + "epoch": 0.7781661184210527, + "grad_norm": 1.0907908600440002, + "learning_rate": 6.892595366430926e-05, + "loss": 3.8175, + "step": 7570 + }, + { + "epoch": 0.7791940789473685, + "grad_norm": 1.0924397386040954, + "learning_rate": 6.889803009625168e-05, + "loss": 3.7515, + "step": 7580 + }, + { + "epoch": 0.7802220394736842, + "grad_norm": 
1.8731323477263957, + "learning_rate": 6.887007748278005e-05, + "loss": 3.7769, + "step": 7590 + }, + { + "epoch": 0.78125, + "grad_norm": 1.704977305644404, + "learning_rate": 6.884209585465027e-05, + "loss": 3.6676, + "step": 7600 + }, + { + "epoch": 0.7822779605263158, + "grad_norm": 1.011793391593078, + "learning_rate": 6.881408524265019e-05, + "loss": 3.6691, + "step": 7610 + }, + { + "epoch": 0.7833059210526315, + "grad_norm": 1.3473132595402175, + "learning_rate": 6.87860456775995e-05, + "loss": 3.7426, + "step": 7620 + }, + { + "epoch": 0.7843338815789473, + "grad_norm": 1.0103491272261875, + "learning_rate": 6.875797719034979e-05, + "loss": 3.8891, + "step": 7630 + }, + { + "epoch": 0.7853618421052632, + "grad_norm": 1.6184714560031928, + "learning_rate": 6.872987981178442e-05, + "loss": 3.775, + "step": 7640 + }, + { + "epoch": 0.786389802631579, + "grad_norm": 0.8325132666209494, + "learning_rate": 6.870175357281864e-05, + "loss": 3.8236, + "step": 7650 + }, + { + "epoch": 0.7874177631578947, + "grad_norm": 1.907354316397782, + "learning_rate": 6.867359850439933e-05, + "loss": 3.7676, + "step": 7660 + }, + { + "epoch": 0.7884457236842105, + "grad_norm": 1.5406457338620894, + "learning_rate": 6.86454146375052e-05, + "loss": 3.8023, + "step": 7670 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.9131082623858358, + "learning_rate": 6.861720200314657e-05, + "loss": 3.7879, + "step": 7680 + }, + { + "epoch": 0.7905016447368421, + "grad_norm": 1.3367824893739406, + "learning_rate": 6.858896063236543e-05, + "loss": 3.778, + "step": 7690 + }, + { + "epoch": 0.7915296052631579, + "grad_norm": 1.190288031444998, + "learning_rate": 6.856069055623543e-05, + "loss": 3.7027, + "step": 7700 + }, + { + "epoch": 0.7925575657894737, + "grad_norm": 0.9628764184364152, + "learning_rate": 6.853239180586175e-05, + "loss": 3.8097, + "step": 7710 + }, + { + "epoch": 0.7935855263157895, + "grad_norm": 1.1803166309756061, + "learning_rate": 6.850406441238114e-05, + "loss": 3.7971, + "step": 7720 + }, + { + "epoch": 0.7946134868421053, + "grad_norm": 1.777734265026742, + "learning_rate": 6.847570840696189e-05, + "loss": 3.7912, + "step": 7730 + }, + { + "epoch": 0.795641447368421, + "grad_norm": 1.008795739121329, + "learning_rate": 6.84473238208037e-05, + "loss": 3.7609, + "step": 7740 + }, + { + "epoch": 0.7966694078947368, + "grad_norm": 1.2165642725737833, + "learning_rate": 6.841891068513784e-05, + "loss": 3.7245, + "step": 7750 + }, + { + "epoch": 0.7976973684210527, + "grad_norm": 1.0628731577207984, + "learning_rate": 6.839046903122685e-05, + "loss": 3.7998, + "step": 7760 + }, + { + "epoch": 0.7987253289473685, + "grad_norm": 2.6789180440523497, + "learning_rate": 6.836199889036476e-05, + "loss": 3.8236, + "step": 7770 + }, + { + "epoch": 0.7997532894736842, + "grad_norm": 1.4899543709136818, + "learning_rate": 6.833350029387688e-05, + "loss": 3.7862, + "step": 7780 + }, + { + "epoch": 0.80078125, + "grad_norm": 1.6448460365990492, + "learning_rate": 6.830497327311986e-05, + "loss": 3.8216, + "step": 7790 + }, + { + "epoch": 0.8018092105263158, + "grad_norm": 1.1188347150385978, + "learning_rate": 6.82764178594816e-05, + "loss": 3.7748, + "step": 7800 + }, + { + "epoch": 0.8028371710526315, + "grad_norm": 1.1532245437951623, + "learning_rate": 6.824783408438129e-05, + "loss": 3.7512, + "step": 7810 + }, + { + "epoch": 0.8038651315789473, + "grad_norm": 1.4860725734542704, + "learning_rate": 6.821922197926925e-05, + "loss": 3.779, + "step": 7820 + }, + { + "epoch": 0.8048930921052632, + 
"grad_norm": 1.4176814521223633, + "learning_rate": 6.819058157562705e-05, + "loss": 3.809, + "step": 7830 + }, + { + "epoch": 0.805921052631579, + "grad_norm": 1.6699104531773132, + "learning_rate": 6.816191290496733e-05, + "loss": 3.7302, + "step": 7840 + }, + { + "epoch": 0.8069490131578947, + "grad_norm": 1.2565468527562718, + "learning_rate": 6.813321599883387e-05, + "loss": 3.7617, + "step": 7850 + }, + { + "epoch": 0.8079769736842105, + "grad_norm": 1.3371031734000665, + "learning_rate": 6.810449088880151e-05, + "loss": 3.7636, + "step": 7860 + }, + { + "epoch": 0.8090049342105263, + "grad_norm": 1.4303596459141128, + "learning_rate": 6.807573760647613e-05, + "loss": 3.8252, + "step": 7870 + }, + { + "epoch": 0.8100328947368421, + "grad_norm": 1.5899324509529138, + "learning_rate": 6.804695618349457e-05, + "loss": 3.7538, + "step": 7880 + }, + { + "epoch": 0.8110608552631579, + "grad_norm": 1.8239775340930322, + "learning_rate": 6.801814665152469e-05, + "loss": 3.8314, + "step": 7890 + }, + { + "epoch": 0.8120888157894737, + "grad_norm": 1.6867809176417912, + "learning_rate": 6.798930904226524e-05, + "loss": 3.8115, + "step": 7900 + }, + { + "epoch": 0.8131167763157895, + "grad_norm": 1.379669151313746, + "learning_rate": 6.796044338744587e-05, + "loss": 3.6993, + "step": 7910 + }, + { + "epoch": 0.8141447368421053, + "grad_norm": 1.0407560500831148, + "learning_rate": 6.793154971882706e-05, + "loss": 3.7362, + "step": 7920 + }, + { + "epoch": 0.815172697368421, + "grad_norm": 0.997421367242062, + "learning_rate": 6.790262806820019e-05, + "loss": 3.7257, + "step": 7930 + }, + { + "epoch": 0.8162006578947368, + "grad_norm": 1.0943567500398246, + "learning_rate": 6.787367846738737e-05, + "loss": 3.8071, + "step": 7940 + }, + { + "epoch": 0.8172286184210527, + "grad_norm": 1.2379307295904498, + "learning_rate": 6.784470094824145e-05, + "loss": 3.7142, + "step": 7950 + }, + { + "epoch": 0.8182565789473685, + "grad_norm": 1.1124807591863943, + "learning_rate": 6.781569554264605e-05, + "loss": 3.7555, + "step": 7960 + }, + { + "epoch": 0.8192845394736842, + "grad_norm": 0.967068843927496, + "learning_rate": 6.778666228251544e-05, + "loss": 3.7167, + "step": 7970 + }, + { + "epoch": 0.8203125, + "grad_norm": 1.5577174426493507, + "learning_rate": 6.775760119979453e-05, + "loss": 3.7912, + "step": 7980 + }, + { + "epoch": 0.8213404605263158, + "grad_norm": 1.2395997834383001, + "learning_rate": 6.772851232645886e-05, + "loss": 3.8035, + "step": 7990 + }, + { + "epoch": 0.8223684210526315, + "grad_norm": 1.2645309334738875, + "learning_rate": 6.769939569451457e-05, + "loss": 3.7515, + "step": 8000 + }, + { + "epoch": 0.8233963815789473, + "grad_norm": 1.1113732300305454, + "learning_rate": 6.767025133599829e-05, + "loss": 3.7461, + "step": 8010 + }, + { + "epoch": 0.8244243421052632, + "grad_norm": 1.438035119011188, + "learning_rate": 6.76410792829772e-05, + "loss": 3.8389, + "step": 8020 + }, + { + "epoch": 0.825452302631579, + "grad_norm": 1.0966041926200683, + "learning_rate": 6.761187956754891e-05, + "loss": 3.7575, + "step": 8030 + }, + { + "epoch": 0.8264802631578947, + "grad_norm": 1.3791908640433905, + "learning_rate": 6.758265222184151e-05, + "loss": 3.7709, + "step": 8040 + }, + { + "epoch": 0.8275082236842105, + "grad_norm": 1.2427753041403404, + "learning_rate": 6.755339727801347e-05, + "loss": 3.7476, + "step": 8050 + }, + { + "epoch": 0.8285361842105263, + "grad_norm": 1.5832583342741973, + "learning_rate": 6.752411476825366e-05, + "loss": 3.7779, + "step": 8060 + }, + { + 
"epoch": 0.8295641447368421, + "grad_norm": 1.384058590244155, + "learning_rate": 6.749480472478121e-05, + "loss": 3.7941, + "step": 8070 + }, + { + "epoch": 0.8305921052631579, + "grad_norm": 0.9591559977121735, + "learning_rate": 6.74654671798456e-05, + "loss": 3.8027, + "step": 8080 + }, + { + "epoch": 0.8316200657894737, + "grad_norm": 0.922417691081469, + "learning_rate": 6.743610216572654e-05, + "loss": 3.8306, + "step": 8090 + }, + { + "epoch": 0.8326480263157895, + "grad_norm": 1.8211411321458202, + "learning_rate": 6.7406709714734e-05, + "loss": 3.8175, + "step": 8100 + }, + { + "epoch": 0.8336759868421053, + "grad_norm": 1.187213361176779, + "learning_rate": 6.737728985920812e-05, + "loss": 3.7192, + "step": 8110 + }, + { + "epoch": 0.834703947368421, + "grad_norm": 1.1671585794361414, + "learning_rate": 6.734784263151916e-05, + "loss": 3.7711, + "step": 8120 + }, + { + "epoch": 0.8357319078947368, + "grad_norm": 1.611823950111095, + "learning_rate": 6.731836806406754e-05, + "loss": 3.815, + "step": 8130 + }, + { + "epoch": 0.8367598684210527, + "grad_norm": 1.0391252541772904, + "learning_rate": 6.728886618928378e-05, + "loss": 3.7768, + "step": 8140 + }, + { + "epoch": 0.8377878289473685, + "grad_norm": 0.9095677122439749, + "learning_rate": 6.725933703962837e-05, + "loss": 3.7774, + "step": 8150 + }, + { + "epoch": 0.8388157894736842, + "grad_norm": 1.6587877085581502, + "learning_rate": 6.722978064759187e-05, + "loss": 3.7788, + "step": 8160 + }, + { + "epoch": 0.83984375, + "grad_norm": 1.554354301287211, + "learning_rate": 6.720019704569482e-05, + "loss": 3.7995, + "step": 8170 + }, + { + "epoch": 0.8408717105263158, + "grad_norm": 1.242096759672816, + "learning_rate": 6.717058626648764e-05, + "loss": 3.7062, + "step": 8180 + }, + { + "epoch": 0.8418996710526315, + "grad_norm": 1.1912346387995538, + "learning_rate": 6.714094834255073e-05, + "loss": 3.7749, + "step": 8190 + }, + { + "epoch": 0.8429276315789473, + "grad_norm": 1.224687004975036, + "learning_rate": 6.711128330649429e-05, + "loss": 3.7362, + "step": 8200 + }, + { + "epoch": 0.8439555921052632, + "grad_norm": 1.3522529822817566, + "learning_rate": 6.708159119095839e-05, + "loss": 3.7542, + "step": 8210 + }, + { + "epoch": 0.844983552631579, + "grad_norm": 0.987695663720631, + "learning_rate": 6.705187202861287e-05, + "loss": 3.7193, + "step": 8220 + }, + { + "epoch": 0.8460115131578947, + "grad_norm": 1.0356483297534653, + "learning_rate": 6.702212585215737e-05, + "loss": 3.772, + "step": 8230 + }, + { + "epoch": 0.8470394736842105, + "grad_norm": 0.9789897813039639, + "learning_rate": 6.69923526943212e-05, + "loss": 3.6806, + "step": 8240 + }, + { + "epoch": 0.8480674342105263, + "grad_norm": 1.3373462766919189, + "learning_rate": 6.69625525878634e-05, + "loss": 3.72, + "step": 8250 + }, + { + "epoch": 0.8490953947368421, + "grad_norm": 1.0794260219051848, + "learning_rate": 6.693272556557265e-05, + "loss": 3.6987, + "step": 8260 + }, + { + "epoch": 0.8501233552631579, + "grad_norm": 0.8779200467438544, + "learning_rate": 6.690287166026722e-05, + "loss": 3.77, + "step": 8270 + }, + { + "epoch": 0.8511513157894737, + "grad_norm": 1.137305458955603, + "learning_rate": 6.687299090479498e-05, + "loss": 3.7959, + "step": 8280 + }, + { + "epoch": 0.8521792763157895, + "grad_norm": 1.400166428414789, + "learning_rate": 6.684308333203337e-05, + "loss": 3.7762, + "step": 8290 + }, + { + "epoch": 0.8532072368421053, + "grad_norm": 1.6091284701955124, + "learning_rate": 6.681314897488927e-05, + "loss": 3.7588, + "step": 
8300 + }, + { + "epoch": 0.854235197368421, + "grad_norm": 1.4609931486414292, + "learning_rate": 6.678318786629911e-05, + "loss": 3.7579, + "step": 8310 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 1.3271627193802455, + "learning_rate": 6.675320003922867e-05, + "loss": 3.7587, + "step": 8320 + }, + { + "epoch": 0.8562911184210527, + "grad_norm": 1.1321289631101459, + "learning_rate": 6.672318552667321e-05, + "loss": 3.7487, + "step": 8330 + }, + { + "epoch": 0.8573190789473685, + "grad_norm": 1.4640394113322086, + "learning_rate": 6.66931443616573e-05, + "loss": 3.6992, + "step": 8340 + }, + { + "epoch": 0.8583470394736842, + "grad_norm": 1.2951910737370367, + "learning_rate": 6.666307657723486e-05, + "loss": 3.7849, + "step": 8350 + }, + { + "epoch": 0.859375, + "grad_norm": 1.373988299540713, + "learning_rate": 6.663298220648906e-05, + "loss": 3.723, + "step": 8360 + }, + { + "epoch": 0.8604029605263158, + "grad_norm": 1.078440633817335, + "learning_rate": 6.660286128253236e-05, + "loss": 3.7994, + "step": 8370 + }, + { + "epoch": 0.8614309210526315, + "grad_norm": 1.2156691123980337, + "learning_rate": 6.657271383850644e-05, + "loss": 3.8587, + "step": 8380 + }, + { + "epoch": 0.8624588815789473, + "grad_norm": 1.069962660146877, + "learning_rate": 6.654253990758213e-05, + "loss": 3.7433, + "step": 8390 + }, + { + "epoch": 0.8634868421052632, + "grad_norm": 1.3475246641630159, + "learning_rate": 6.651233952295943e-05, + "loss": 3.8529, + "step": 8400 + }, + { + "epoch": 0.864514802631579, + "grad_norm": 0.8576312320039672, + "learning_rate": 6.648211271786743e-05, + "loss": 3.7709, + "step": 8410 + }, + { + "epoch": 0.8655427631578947, + "grad_norm": 1.2166711733137998, + "learning_rate": 6.645185952556431e-05, + "loss": 3.7586, + "step": 8420 + }, + { + "epoch": 0.8665707236842105, + "grad_norm": 1.0691238820134703, + "learning_rate": 6.642157997933727e-05, + "loss": 3.7749, + "step": 8430 + }, + { + "epoch": 0.8675986842105263, + "grad_norm": 1.4169198352263996, + "learning_rate": 6.639127411250247e-05, + "loss": 3.8052, + "step": 8440 + }, + { + "epoch": 0.8686266447368421, + "grad_norm": 1.331893466300528, + "learning_rate": 6.636094195840511e-05, + "loss": 3.7824, + "step": 8450 + }, + { + "epoch": 0.8696546052631579, + "grad_norm": 1.197551749112738, + "learning_rate": 6.633058355041923e-05, + "loss": 3.6988, + "step": 8460 + }, + { + "epoch": 0.8706825657894737, + "grad_norm": 1.2025389792296848, + "learning_rate": 6.630019892194783e-05, + "loss": 3.7941, + "step": 8470 + }, + { + "epoch": 0.8717105263157895, + "grad_norm": 1.2601435604508964, + "learning_rate": 6.626978810642272e-05, + "loss": 3.7749, + "step": 8480 + }, + { + "epoch": 0.8727384868421053, + "grad_norm": 1.0318919409060763, + "learning_rate": 6.623935113730452e-05, + "loss": 3.7575, + "step": 8490 + }, + { + "epoch": 0.873766447368421, + "grad_norm": 1.8737199228429053, + "learning_rate": 6.620888804808265e-05, + "loss": 3.718, + "step": 8500 + }, + { + "epoch": 0.8747944078947368, + "grad_norm": 1.2174644047509695, + "learning_rate": 6.617839887227525e-05, + "loss": 3.7647, + "step": 8510 + }, + { + "epoch": 0.8758223684210527, + "grad_norm": 0.9929270653787783, + "learning_rate": 6.614788364342916e-05, + "loss": 3.8273, + "step": 8520 + }, + { + "epoch": 0.8768503289473685, + "grad_norm": 1.8955762989342257, + "learning_rate": 6.611734239511991e-05, + "loss": 3.7685, + "step": 8530 + }, + { + "epoch": 0.8778782894736842, + "grad_norm": 1.1521592849244084, + "learning_rate": 6.608677516095164e-05, + 
"loss": 3.7059, + "step": 8540 + }, + { + "epoch": 0.87890625, + "grad_norm": 0.9245689276876228, + "learning_rate": 6.605618197455709e-05, + "loss": 3.7769, + "step": 8550 + }, + { + "epoch": 0.8799342105263158, + "grad_norm": 0.7491771634745206, + "learning_rate": 6.602556286959752e-05, + "loss": 3.7579, + "step": 8560 + }, + { + "epoch": 0.8809621710526315, + "grad_norm": 0.9143554022586386, + "learning_rate": 6.599491787976279e-05, + "loss": 3.8307, + "step": 8570 + }, + { + "epoch": 0.8819901315789473, + "grad_norm": 4.683398474153165, + "learning_rate": 6.596424703877115e-05, + "loss": 3.7859, + "step": 8580 + }, + { + "epoch": 0.8830180921052632, + "grad_norm": 1.2897047350327822, + "learning_rate": 6.593355038036935e-05, + "loss": 3.7224, + "step": 8590 + }, + { + "epoch": 0.884046052631579, + "grad_norm": 1.0723226563342396, + "learning_rate": 6.590282793833252e-05, + "loss": 3.7273, + "step": 8600 + }, + { + "epoch": 0.8850740131578947, + "grad_norm": 1.0325202055055493, + "learning_rate": 6.587207974646419e-05, + "loss": 3.7602, + "step": 8610 + }, + { + "epoch": 0.8861019736842105, + "grad_norm": 1.1799084791155394, + "learning_rate": 6.584130583859617e-05, + "loss": 3.7616, + "step": 8620 + }, + { + "epoch": 0.8871299342105263, + "grad_norm": 0.8440348532879186, + "learning_rate": 6.581050624858862e-05, + "loss": 3.7665, + "step": 8630 + }, + { + "epoch": 0.8881578947368421, + "grad_norm": 1.1927963901758099, + "learning_rate": 6.577968101032992e-05, + "loss": 3.7586, + "step": 8640 + }, + { + "epoch": 0.8891858552631579, + "grad_norm": 0.8341958628742809, + "learning_rate": 6.57488301577367e-05, + "loss": 3.7365, + "step": 8650 + }, + { + "epoch": 0.8902138157894737, + "grad_norm": 1.3217345485516454, + "learning_rate": 6.571795372475373e-05, + "loss": 3.8133, + "step": 8660 + }, + { + "epoch": 0.8912417763157895, + "grad_norm": 0.9946301946692533, + "learning_rate": 6.568705174535398e-05, + "loss": 3.7737, + "step": 8670 + }, + { + "epoch": 0.8922697368421053, + "grad_norm": 0.7701593536976897, + "learning_rate": 6.56561242535385e-05, + "loss": 3.6587, + "step": 8680 + }, + { + "epoch": 0.893297697368421, + "grad_norm": 1.6269981130766054, + "learning_rate": 6.562517128333641e-05, + "loss": 3.7577, + "step": 8690 + }, + { + "epoch": 0.8943256578947368, + "grad_norm": 1.3263426096344861, + "learning_rate": 6.559419286880484e-05, + "loss": 3.7493, + "step": 8700 + }, + { + "epoch": 0.8953536184210527, + "grad_norm": 1.1296238186141905, + "learning_rate": 6.5563189044029e-05, + "loss": 3.672, + "step": 8710 + }, + { + "epoch": 0.8963815789473685, + "grad_norm": 1.0139537265083265, + "learning_rate": 6.553215984312194e-05, + "loss": 3.7107, + "step": 8720 + }, + { + "epoch": 0.8974095394736842, + "grad_norm": 1.5033962425354475, + "learning_rate": 6.550110530022473e-05, + "loss": 3.7892, + "step": 8730 + }, + { + "epoch": 0.8984375, + "grad_norm": 1.1367965619174154, + "learning_rate": 6.547002544950625e-05, + "loss": 3.739, + "step": 8740 + }, + { + "epoch": 0.8994654605263158, + "grad_norm": 1.414407593153081, + "learning_rate": 6.543892032516329e-05, + "loss": 3.7511, + "step": 8750 + }, + { + "epoch": 0.9004934210526315, + "grad_norm": 0.9668290566376825, + "learning_rate": 6.540778996142042e-05, + "loss": 3.7483, + "step": 8760 + }, + { + "epoch": 0.9015213815789473, + "grad_norm": 1.2569275580314547, + "learning_rate": 6.537663439252997e-05, + "loss": 3.6971, + "step": 8770 + }, + { + "epoch": 0.9025493421052632, + "grad_norm": 0.9677704812013374, + "learning_rate": 
6.534545365277201e-05, + "loss": 3.8548, + "step": 8780 + }, + { + "epoch": 0.903577302631579, + "grad_norm": 1.706542549624412, + "learning_rate": 6.531424777645435e-05, + "loss": 3.7095, + "step": 8790 + }, + { + "epoch": 0.9046052631578947, + "grad_norm": 0.9946280972417089, + "learning_rate": 6.528301679791235e-05, + "loss": 3.6421, + "step": 8800 + }, + { + "epoch": 0.9056332236842105, + "grad_norm": 0.9054477363366437, + "learning_rate": 6.525176075150907e-05, + "loss": 3.8388, + "step": 8810 + }, + { + "epoch": 0.9066611842105263, + "grad_norm": 1.430005064335105, + "learning_rate": 6.522047967163518e-05, + "loss": 3.7506, + "step": 8820 + }, + { + "epoch": 0.9076891447368421, + "grad_norm": 0.9897317908843363, + "learning_rate": 6.51891735927088e-05, + "loss": 3.6779, + "step": 8830 + }, + { + "epoch": 0.9087171052631579, + "grad_norm": 1.3343171672214984, + "learning_rate": 6.515784254917565e-05, + "loss": 3.7143, + "step": 8840 + }, + { + "epoch": 0.9097450657894737, + "grad_norm": 1.4624712232057198, + "learning_rate": 6.512648657550886e-05, + "loss": 3.7686, + "step": 8850 + }, + { + "epoch": 0.9107730263157895, + "grad_norm": 1.1165422830531682, + "learning_rate": 6.509510570620899e-05, + "loss": 3.7814, + "step": 8860 + }, + { + "epoch": 0.9118009868421053, + "grad_norm": 1.1031095314562318, + "learning_rate": 6.506369997580403e-05, + "loss": 3.7458, + "step": 8870 + }, + { + "epoch": 0.912828947368421, + "grad_norm": 1.12855545454292, + "learning_rate": 6.503226941884929e-05, + "loss": 3.7437, + "step": 8880 + }, + { + "epoch": 0.9138569078947368, + "grad_norm": 1.1275940234614188, + "learning_rate": 6.500081406992739e-05, + "loss": 3.733, + "step": 8890 + }, + { + "epoch": 0.9148848684210527, + "grad_norm": 1.3314388445663092, + "learning_rate": 6.496933396364829e-05, + "loss": 3.8149, + "step": 8900 + }, + { + "epoch": 0.9159128289473685, + "grad_norm": 1.2804306015280913, + "learning_rate": 6.493782913464913e-05, + "loss": 3.8036, + "step": 8910 + }, + { + "epoch": 0.9169407894736842, + "grad_norm": 1.3199652491676874, + "learning_rate": 6.490629961759425e-05, + "loss": 3.777, + "step": 8920 + }, + { + "epoch": 0.91796875, + "grad_norm": 1.8804032355934128, + "learning_rate": 6.487474544717521e-05, + "loss": 3.7118, + "step": 8930 + }, + { + "epoch": 0.9189967105263158, + "grad_norm": 1.2496356910064315, + "learning_rate": 6.484316665811061e-05, + "loss": 3.7863, + "step": 8940 + }, + { + "epoch": 0.9200246710526315, + "grad_norm": 1.3182577362441232, + "learning_rate": 6.481156328514623e-05, + "loss": 3.7341, + "step": 8950 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.23694675871575, + "learning_rate": 6.477993536305484e-05, + "loss": 3.7287, + "step": 8960 + }, + { + "epoch": 0.9220805921052632, + "grad_norm": 1.078032890476264, + "learning_rate": 6.474828292663623e-05, + "loss": 3.758, + "step": 8970 + }, + { + "epoch": 0.923108552631579, + "grad_norm": 1.016608656859402, + "learning_rate": 6.471660601071719e-05, + "loss": 3.7324, + "step": 8980 + }, + { + "epoch": 0.9241365131578947, + "grad_norm": 1.2756249298738631, + "learning_rate": 6.46849046501514e-05, + "loss": 3.7562, + "step": 8990 + }, + { + "epoch": 0.9251644736842105, + "grad_norm": 1.7388498126518783, + "learning_rate": 6.465317887981946e-05, + "loss": 3.7185, + "step": 9000 + }, + { + "epoch": 0.9261924342105263, + "grad_norm": 1.5623544243708194, + "learning_rate": 6.462142873462887e-05, + "loss": 3.7672, + "step": 9010 + }, + { + "epoch": 0.9272203947368421, + "grad_norm": 
1.3277498949697657, + "learning_rate": 6.458965424951387e-05, + "loss": 3.7553, + "step": 9020 + }, + { + "epoch": 0.9282483552631579, + "grad_norm": 1.0008560331397647, + "learning_rate": 6.455785545943553e-05, + "loss": 3.7508, + "step": 9030 + }, + { + "epoch": 0.9292763157894737, + "grad_norm": 1.1073046963709507, + "learning_rate": 6.452603239938165e-05, + "loss": 3.7659, + "step": 9040 + }, + { + "epoch": 0.9303042763157895, + "grad_norm": 0.8026634240642057, + "learning_rate": 6.449418510436671e-05, + "loss": 3.7205, + "step": 9050 + }, + { + "epoch": 0.9313322368421053, + "grad_norm": 0.9928194271394714, + "learning_rate": 6.446231360943193e-05, + "loss": 3.7965, + "step": 9060 + }, + { + "epoch": 0.932360197368421, + "grad_norm": 0.9142210081206557, + "learning_rate": 6.443041794964506e-05, + "loss": 3.6934, + "step": 9070 + }, + { + "epoch": 0.9333881578947368, + "grad_norm": 1.0575851759640664, + "learning_rate": 6.439849816010052e-05, + "loss": 3.7995, + "step": 9080 + }, + { + "epoch": 0.9344161184210527, + "grad_norm": 1.2311723423202967, + "learning_rate": 6.436655427591923e-05, + "loss": 3.699, + "step": 9090 + }, + { + "epoch": 0.9354440789473685, + "grad_norm": 1.4426986565477686, + "learning_rate": 6.433458633224865e-05, + "loss": 3.7286, + "step": 9100 + }, + { + "epoch": 0.9364720394736842, + "grad_norm": 1.68462324418652, + "learning_rate": 6.430259436426266e-05, + "loss": 3.7385, + "step": 9110 + }, + { + "epoch": 0.9375, + "grad_norm": 2.290494295270936, + "learning_rate": 6.427057840716165e-05, + "loss": 3.7267, + "step": 9120 + }, + { + "epoch": 0.9385279605263158, + "grad_norm": 1.6200010730598569, + "learning_rate": 6.423853849617235e-05, + "loss": 3.7735, + "step": 9130 + }, + { + "epoch": 0.9395559210526315, + "grad_norm": 1.1449062006647608, + "learning_rate": 6.420647466654788e-05, + "loss": 3.6799, + "step": 9140 + }, + { + "epoch": 0.9405838815789473, + "grad_norm": 0.9655544119407321, + "learning_rate": 6.417438695356763e-05, + "loss": 3.8411, + "step": 9150 + }, + { + "epoch": 0.9416118421052632, + "grad_norm": 0.9430502627965028, + "learning_rate": 6.414227539253734e-05, + "loss": 3.7022, + "step": 9160 + }, + { + "epoch": 0.942639802631579, + "grad_norm": 0.8936249251979936, + "learning_rate": 6.411014001878892e-05, + "loss": 3.8014, + "step": 9170 + }, + { + "epoch": 0.9436677631578947, + "grad_norm": 0.934981236856044, + "learning_rate": 6.407798086768051e-05, + "loss": 3.7172, + "step": 9180 + }, + { + "epoch": 0.9446957236842105, + "grad_norm": 1.075621199216007, + "learning_rate": 6.404579797459642e-05, + "loss": 3.767, + "step": 9190 + }, + { + "epoch": 0.9457236842105263, + "grad_norm": 1.034236167971976, + "learning_rate": 6.401359137494708e-05, + "loss": 3.8205, + "step": 9200 + }, + { + "epoch": 0.9467516447368421, + "grad_norm": 1.0925172125291878, + "learning_rate": 6.3981361104169e-05, + "loss": 3.649, + "step": 9210 + }, + { + "epoch": 0.9477796052631579, + "grad_norm": 1.3208564676049839, + "learning_rate": 6.394910719772472e-05, + "loss": 3.7451, + "step": 9220 + }, + { + "epoch": 0.9488075657894737, + "grad_norm": 0.9692303174361898, + "learning_rate": 6.391682969110278e-05, + "loss": 3.751, + "step": 9230 + }, + { + "epoch": 0.9498355263157895, + "grad_norm": 1.1356195857622808, + "learning_rate": 6.388452861981776e-05, + "loss": 3.7441, + "step": 9240 + }, + { + "epoch": 0.9508634868421053, + "grad_norm": 0.7904778053261178, + "learning_rate": 6.385220401941009e-05, + "loss": 3.7981, + "step": 9250 + }, + { + "epoch": 
0.951891447368421, + "grad_norm": 1.250248121431431, + "learning_rate": 6.381985592544609e-05, + "loss": 3.6793, + "step": 9260 + }, + { + "epoch": 0.9529194078947368, + "grad_norm": 1.0380516010681273, + "learning_rate": 6.378748437351798e-05, + "loss": 3.7304, + "step": 9270 + }, + { + "epoch": 0.9539473684210527, + "grad_norm": 0.9094190108408486, + "learning_rate": 6.375508939924375e-05, + "loss": 3.7289, + "step": 9280 + }, + { + "epoch": 0.9549753289473685, + "grad_norm": 1.324664420691514, + "learning_rate": 6.37226710382672e-05, + "loss": 3.7232, + "step": 9290 + }, + { + "epoch": 0.9560032894736842, + "grad_norm": 1.4719421112246183, + "learning_rate": 6.369022932625781e-05, + "loss": 3.7815, + "step": 9300 + }, + { + "epoch": 0.95703125, + "grad_norm": 2.1611491399631757, + "learning_rate": 6.365776429891079e-05, + "loss": 3.7122, + "step": 9310 + }, + { + "epoch": 0.9580592105263158, + "grad_norm": 1.2931778845244974, + "learning_rate": 6.362527599194698e-05, + "loss": 3.7246, + "step": 9320 + }, + { + "epoch": 0.9590871710526315, + "grad_norm": 0.9371940431433218, + "learning_rate": 6.359276444111288e-05, + "loss": 3.7385, + "step": 9330 + }, + { + "epoch": 0.9601151315789473, + "grad_norm": 1.0543041521565382, + "learning_rate": 6.356022968218048e-05, + "loss": 3.7487, + "step": 9340 + }, + { + "epoch": 0.9611430921052632, + "grad_norm": 1.4000083582492373, + "learning_rate": 6.352767175094742e-05, + "loss": 3.7166, + "step": 9350 + }, + { + "epoch": 0.962171052631579, + "grad_norm": 1.314635582967165, + "learning_rate": 6.349509068323672e-05, + "loss": 3.7178, + "step": 9360 + }, + { + "epoch": 0.9631990131578947, + "grad_norm": 1.0802829541311518, + "learning_rate": 6.346248651489694e-05, + "loss": 3.7751, + "step": 9370 + }, + { + "epoch": 0.9642269736842105, + "grad_norm": 1.3443662871393671, + "learning_rate": 6.342985928180204e-05, + "loss": 3.7172, + "step": 9380 + }, + { + "epoch": 0.9652549342105263, + "grad_norm": 1.400449190285143, + "learning_rate": 6.33972090198513e-05, + "loss": 3.7445, + "step": 9390 + }, + { + "epoch": 0.9662828947368421, + "grad_norm": 1.5154256787931095, + "learning_rate": 6.336453576496946e-05, + "loss": 3.7109, + "step": 9400 + }, + { + "epoch": 0.9673108552631579, + "grad_norm": 1.1501264544082797, + "learning_rate": 6.33318395531064e-05, + "loss": 3.7338, + "step": 9410 + }, + { + "epoch": 0.9683388157894737, + "grad_norm": 1.035964420020613, + "learning_rate": 6.329912042023739e-05, + "loss": 3.7441, + "step": 9420 + }, + { + "epoch": 0.9693667763157895, + "grad_norm": 1.0385493682198248, + "learning_rate": 6.326637840236286e-05, + "loss": 3.7832, + "step": 9430 + }, + { + "epoch": 0.9703947368421053, + "grad_norm": 1.6838824739288187, + "learning_rate": 6.323361353550843e-05, + "loss": 3.7509, + "step": 9440 + }, + { + "epoch": 0.971422697368421, + "grad_norm": 1.3630274233028916, + "learning_rate": 6.320082585572488e-05, + "loss": 3.7432, + "step": 9450 + }, + { + "epoch": 0.9724506578947368, + "grad_norm": 1.2115413513777924, + "learning_rate": 6.316801539908807e-05, + "loss": 3.8428, + "step": 9460 + }, + { + "epoch": 0.9734786184210527, + "grad_norm": 1.7977883712711593, + "learning_rate": 6.313518220169889e-05, + "loss": 3.7809, + "step": 9470 + }, + { + "epoch": 0.9745065789473685, + "grad_norm": 1.1878558679706073, + "learning_rate": 6.310232629968334e-05, + "loss": 3.6424, + "step": 9480 + }, + { + "epoch": 0.9755345394736842, + "grad_norm": 0.8128851197921089, + "learning_rate": 6.30694477291923e-05, + "loss": 3.808, + "step": 
9490 + }, + { + "epoch": 0.9765625, + "grad_norm": 1.060468020941238, + "learning_rate": 6.303654652640166e-05, + "loss": 3.8193, + "step": 9500 + }, + { + "epoch": 0.9775904605263158, + "grad_norm": 1.2508422398276582, + "learning_rate": 6.30036227275122e-05, + "loss": 3.7448, + "step": 9510 + }, + { + "epoch": 0.9786184210526315, + "grad_norm": 1.7284181729017056, + "learning_rate": 6.297067636874956e-05, + "loss": 3.717, + "step": 9520 + }, + { + "epoch": 0.9796463815789473, + "grad_norm": 0.9848726921785586, + "learning_rate": 6.293770748636415e-05, + "loss": 3.7701, + "step": 9530 + }, + { + "epoch": 0.9806743421052632, + "grad_norm": 0.9903573706570965, + "learning_rate": 6.290471611663126e-05, + "loss": 3.7162, + "step": 9540 + }, + { + "epoch": 0.981702302631579, + "grad_norm": 1.045854270137833, + "learning_rate": 6.287170229585083e-05, + "loss": 3.756, + "step": 9550 + }, + { + "epoch": 0.9827302631578947, + "grad_norm": 1.034598261315422, + "learning_rate": 6.283866606034759e-05, + "loss": 3.6886, + "step": 9560 + }, + { + "epoch": 0.9837582236842105, + "grad_norm": 1.3781624319654489, + "learning_rate": 6.280560744647086e-05, + "loss": 3.8254, + "step": 9570 + }, + { + "epoch": 0.9847861842105263, + "grad_norm": 0.8606067153550754, + "learning_rate": 6.277252649059459e-05, + "loss": 3.6919, + "step": 9580 + }, + { + "epoch": 0.9858141447368421, + "grad_norm": 1.5151408100472477, + "learning_rate": 6.273942322911737e-05, + "loss": 3.7564, + "step": 9590 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 1.3601322476516722, + "learning_rate": 6.270629769846226e-05, + "loss": 3.7724, + "step": 9600 + }, + { + "epoch": 0.9878700657894737, + "grad_norm": 0.9750651973542547, + "learning_rate": 6.267314993507687e-05, + "loss": 3.7324, + "step": 9610 + }, + { + "epoch": 0.9888980263157895, + "grad_norm": 1.1431254323183706, + "learning_rate": 6.263997997543329e-05, + "loss": 3.7265, + "step": 9620 + }, + { + "epoch": 0.9899259868421053, + "grad_norm": 3.635071411061024, + "learning_rate": 6.260678785602794e-05, + "loss": 3.7543, + "step": 9630 + }, + { + "epoch": 0.990953947368421, + "grad_norm": 1.2216437294087417, + "learning_rate": 6.257357361338174e-05, + "loss": 3.7238, + "step": 9640 + }, + { + "epoch": 0.9919819078947368, + "grad_norm": 1.0103674742269613, + "learning_rate": 6.254033728403986e-05, + "loss": 3.7362, + "step": 9650 + }, + { + "epoch": 0.9930098684210527, + "grad_norm": 1.4468097251849992, + "learning_rate": 6.250707890457182e-05, + "loss": 3.5736, + "step": 9660 + }, + { + "epoch": 0.9940378289473685, + "grad_norm": 1.4831401306229324, + "learning_rate": 6.247379851157138e-05, + "loss": 3.6945, + "step": 9670 + }, + { + "epoch": 0.9950657894736842, + "grad_norm": 1.0835367464172532, + "learning_rate": 6.244049614165656e-05, + "loss": 3.7071, + "step": 9680 + }, + { + "epoch": 0.99609375, + "grad_norm": 1.1055489770468852, + "learning_rate": 6.240717183146948e-05, + "loss": 3.6626, + "step": 9690 + }, + { + "epoch": 0.9971217105263158, + "grad_norm": 0.9281826823384824, + "learning_rate": 6.237382561767649e-05, + "loss": 3.8145, + "step": 9700 + }, + { + "epoch": 0.9981496710526315, + "grad_norm": 0.9983675268067133, + "learning_rate": 6.234045753696798e-05, + "loss": 3.7599, + "step": 9710 + }, + { + "epoch": 0.9991776315789473, + "grad_norm": 0.7922649924995434, + "learning_rate": 6.230706762605844e-05, + "loss": 3.716, + "step": 9720 + }, + { + "epoch": 1.000205592105263, + "grad_norm": 1.4441190173694567, + "learning_rate": 6.227365592168637e-05, + "loss": 
3.7544, + "step": 9730 + }, + { + "epoch": 1.001233552631579, + "grad_norm": 1.233235086359758, + "learning_rate": 6.22402224606142e-05, + "loss": 3.759, + "step": 9740 + }, + { + "epoch": 1.0022615131578947, + "grad_norm": 1.1431793456927295, + "learning_rate": 6.220676727962835e-05, + "loss": 3.7766, + "step": 9750 + }, + { + "epoch": 1.0032894736842106, + "grad_norm": 0.834435584336894, + "learning_rate": 6.217329041553915e-05, + "loss": 3.6837, + "step": 9760 + }, + { + "epoch": 1.0043174342105263, + "grad_norm": 1.3699512557943672, + "learning_rate": 6.213979190518076e-05, + "loss": 3.7116, + "step": 9770 + }, + { + "epoch": 1.005345394736842, + "grad_norm": 1.148161770683973, + "learning_rate": 6.210627178541116e-05, + "loss": 3.7155, + "step": 9780 + }, + { + "epoch": 1.006373355263158, + "grad_norm": 1.0159590538552292, + "learning_rate": 6.207273009311209e-05, + "loss": 3.6567, + "step": 9790 + }, + { + "epoch": 1.0074013157894737, + "grad_norm": 1.2813093125175425, + "learning_rate": 6.203916686518905e-05, + "loss": 3.7046, + "step": 9800 + }, + { + "epoch": 1.0084292763157894, + "grad_norm": 0.8425985532785777, + "learning_rate": 6.200558213857123e-05, + "loss": 3.6851, + "step": 9810 + }, + { + "epoch": 1.0094572368421053, + "grad_norm": 1.7218278549580313, + "learning_rate": 6.19719759502115e-05, + "loss": 3.7323, + "step": 9820 + }, + { + "epoch": 1.010485197368421, + "grad_norm": 1.8632013395755878, + "learning_rate": 6.193834833708628e-05, + "loss": 3.7157, + "step": 9830 + }, + { + "epoch": 1.011513157894737, + "grad_norm": 1.5028218905511885, + "learning_rate": 6.190469933619562e-05, + "loss": 3.7117, + "step": 9840 + }, + { + "epoch": 1.0125411184210527, + "grad_norm": 1.509609199503779, + "learning_rate": 6.187102898456309e-05, + "loss": 3.7382, + "step": 9850 + }, + { + "epoch": 1.0135690789473684, + "grad_norm": 0.795367366794311, + "learning_rate": 6.183733731923573e-05, + "loss": 3.7853, + "step": 9860 + }, + { + "epoch": 1.0145970394736843, + "grad_norm": 1.6879293107603113, + "learning_rate": 6.180362437728405e-05, + "loss": 3.7083, + "step": 9870 + }, + { + "epoch": 1.015625, + "grad_norm": 1.0928123814902106, + "learning_rate": 6.176989019580198e-05, + "loss": 3.6798, + "step": 9880 + }, + { + "epoch": 1.0166529605263157, + "grad_norm": 0.9079918072053607, + "learning_rate": 6.173613481190682e-05, + "loss": 3.7838, + "step": 9890 + }, + { + "epoch": 1.0176809210526316, + "grad_norm": 1.1020388283544027, + "learning_rate": 6.170235826273915e-05, + "loss": 3.7001, + "step": 9900 + }, + { + "epoch": 1.0187088815789473, + "grad_norm": 0.9793705356011703, + "learning_rate": 6.166856058546288e-05, + "loss": 3.6561, + "step": 9910 + }, + { + "epoch": 1.019736842105263, + "grad_norm": 1.4783963718470934, + "learning_rate": 6.163474181726518e-05, + "loss": 3.7489, + "step": 9920 + }, + { + "epoch": 1.020764802631579, + "grad_norm": 1.0781794548103796, + "learning_rate": 6.160090199535641e-05, + "loss": 3.7437, + "step": 9930 + }, + { + "epoch": 1.0217927631578947, + "grad_norm": 0.9893149597592387, + "learning_rate": 6.156704115697009e-05, + "loss": 3.7249, + "step": 9940 + }, + { + "epoch": 1.0228207236842106, + "grad_norm": 1.1802879472743675, + "learning_rate": 6.153315933936283e-05, + "loss": 3.8074, + "step": 9950 + }, + { + "epoch": 1.0238486842105263, + "grad_norm": 1.3675754541645049, + "learning_rate": 6.14992565798144e-05, + "loss": 3.7813, + "step": 9960 + }, + { + "epoch": 1.024876644736842, + "grad_norm": 1.578708654060114, + "learning_rate": 
6.146533291562757e-05, + "loss": 3.753, + "step": 9970 + }, + { + "epoch": 1.025904605263158, + "grad_norm": 1.2930041535938457, + "learning_rate": 6.143138838412808e-05, + "loss": 3.7421, + "step": 9980 + }, + { + "epoch": 1.0269325657894737, + "grad_norm": 1.0219466438026699, + "learning_rate": 6.13974230226647e-05, + "loss": 3.6399, + "step": 9990 + }, + { + "epoch": 1.0279605263157894, + "grad_norm": 0.9107161670152483, + "learning_rate": 6.136343686860905e-05, + "loss": 3.6324, + "step": 10000 + }, + { + "epoch": 1.0289884868421053, + "grad_norm": 1.3475923827144287, + "learning_rate": 6.132942995935568e-05, + "loss": 3.7726, + "step": 10010 + }, + { + "epoch": 1.030016447368421, + "grad_norm": 1.8803862454961153, + "learning_rate": 6.129540233232197e-05, + "loss": 3.7481, + "step": 10020 + }, + { + "epoch": 1.031044407894737, + "grad_norm": 3.9249080768682, + "learning_rate": 6.126135402494805e-05, + "loss": 3.8014, + "step": 10030 + }, + { + "epoch": 1.0320723684210527, + "grad_norm": 1.31837329998433, + "learning_rate": 6.122728507469686e-05, + "loss": 3.6886, + "step": 10040 + }, + { + "epoch": 1.0331003289473684, + "grad_norm": 1.626258142965515, + "learning_rate": 6.1193195519054e-05, + "loss": 3.7375, + "step": 10050 + }, + { + "epoch": 1.0341282894736843, + "grad_norm": 1.6540413290946727, + "learning_rate": 6.11590853955278e-05, + "loss": 3.6788, + "step": 10060 + }, + { + "epoch": 1.03515625, + "grad_norm": 1.2658100699287136, + "learning_rate": 6.112495474164919e-05, + "loss": 3.7406, + "step": 10070 + }, + { + "epoch": 1.0361842105263157, + "grad_norm": 1.2887027585300013, + "learning_rate": 6.109080359497167e-05, + "loss": 3.6524, + "step": 10080 + }, + { + "epoch": 1.0372121710526316, + "grad_norm": 1.1585152576941664, + "learning_rate": 6.105663199307133e-05, + "loss": 3.869, + "step": 10090 + }, + { + "epoch": 1.0382401315789473, + "grad_norm": 1.112563695048585, + "learning_rate": 6.102243997354672e-05, + "loss": 3.7089, + "step": 10100 + }, + { + "epoch": 1.039268092105263, + "grad_norm": 1.1541097783047864, + "learning_rate": 6.09882275740189e-05, + "loss": 3.7372, + "step": 10110 + }, + { + "epoch": 1.040296052631579, + "grad_norm": 1.8018281421529552, + "learning_rate": 6.0953994832131344e-05, + "loss": 3.6654, + "step": 10120 + }, + { + "epoch": 1.0413240131578947, + "grad_norm": 1.2271636170233624, + "learning_rate": 6.0919741785549886e-05, + "loss": 3.8089, + "step": 10130 + }, + { + "epoch": 1.0423519736842106, + "grad_norm": 1.6219106798896292, + "learning_rate": 6.088546847196271e-05, + "loss": 3.7187, + "step": 10140 + }, + { + "epoch": 1.0433799342105263, + "grad_norm": 1.213379151864286, + "learning_rate": 6.0851174929080306e-05, + "loss": 3.6609, + "step": 10150 + }, + { + "epoch": 1.044407894736842, + "grad_norm": 1.3087695103368808, + "learning_rate": 6.081686119463542e-05, + "loss": 3.6783, + "step": 10160 + }, + { + "epoch": 1.045435855263158, + "grad_norm": 1.3939716915474405, + "learning_rate": 6.0782527306383035e-05, + "loss": 3.7153, + "step": 10170 + }, + { + "epoch": 1.0464638157894737, + "grad_norm": 0.9503233961479826, + "learning_rate": 6.0748173302100245e-05, + "loss": 3.7495, + "step": 10180 + }, + { + "epoch": 1.0474917763157894, + "grad_norm": 1.0406566237034898, + "learning_rate": 6.0713799219586344e-05, + "loss": 3.7289, + "step": 10190 + }, + { + "epoch": 1.0485197368421053, + "grad_norm": 1.2630999773011164, + "learning_rate": 6.0679405096662716e-05, + "loss": 3.7967, + "step": 10200 + }, + { + "epoch": 1.049547697368421, + 
"grad_norm": 1.572307022213165, + "learning_rate": 6.064499097117273e-05, + "loss": 3.6843, + "step": 10210 + }, + { + "epoch": 1.050575657894737, + "grad_norm": 1.0306335687599204, + "learning_rate": 6.0610556880981836e-05, + "loss": 3.7364, + "step": 10220 + }, + { + "epoch": 1.0516036184210527, + "grad_norm": 5.231684571221662, + "learning_rate": 6.057610286397743e-05, + "loss": 3.7525, + "step": 10230 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.9235223502019207, + "learning_rate": 6.054162895806878e-05, + "loss": 3.6727, + "step": 10240 + }, + { + "epoch": 1.0536595394736843, + "grad_norm": 1.163431956902762, + "learning_rate": 6.050713520118714e-05, + "loss": 3.6913, + "step": 10250 + }, + { + "epoch": 1.0546875, + "grad_norm": 1.172820968292425, + "learning_rate": 6.04726216312855e-05, + "loss": 3.7423, + "step": 10260 + }, + { + "epoch": 1.0557154605263157, + "grad_norm": 1.3721670530703167, + "learning_rate": 6.0438088286338736e-05, + "loss": 3.6906, + "step": 10270 + }, + { + "epoch": 1.0567434210526316, + "grad_norm": 1.2787518221039529, + "learning_rate": 6.040353520434343e-05, + "loss": 3.7755, + "step": 10280 + }, + { + "epoch": 1.0577713815789473, + "grad_norm": 1.135677371877546, + "learning_rate": 6.0368962423317875e-05, + "loss": 3.627, + "step": 10290 + }, + { + "epoch": 1.058799342105263, + "grad_norm": 1.1222733728053074, + "learning_rate": 6.033436998130208e-05, + "loss": 3.8293, + "step": 10300 + }, + { + "epoch": 1.059827302631579, + "grad_norm": 1.007796590516837, + "learning_rate": 6.0299757916357685e-05, + "loss": 3.7671, + "step": 10310 + }, + { + "epoch": 1.0608552631578947, + "grad_norm": 1.2756454423521495, + "learning_rate": 6.026512626656786e-05, + "loss": 3.6564, + "step": 10320 + }, + { + "epoch": 1.0618832236842106, + "grad_norm": 1.1901907802368916, + "learning_rate": 6.023047507003737e-05, + "loss": 3.6581, + "step": 10330 + }, + { + "epoch": 1.0629111842105263, + "grad_norm": 1.39687588412195, + "learning_rate": 6.019580436489252e-05, + "loss": 3.8399, + "step": 10340 + }, + { + "epoch": 1.063939144736842, + "grad_norm": 0.8041026388054112, + "learning_rate": 6.016111418928101e-05, + "loss": 3.7563, + "step": 10350 + }, + { + "epoch": 1.064967105263158, + "grad_norm": 1.018630115499465, + "learning_rate": 6.012640458137202e-05, + "loss": 3.7432, + "step": 10360 + }, + { + "epoch": 1.0659950657894737, + "grad_norm": 1.5101142668278296, + "learning_rate": 6.0091675579356046e-05, + "loss": 3.6638, + "step": 10370 + }, + { + "epoch": 1.0670230263157894, + "grad_norm": 1.3241106287384228, + "learning_rate": 6.005692722144499e-05, + "loss": 3.6558, + "step": 10380 + }, + { + "epoch": 1.0680509868421053, + "grad_norm": 1.4286496924351568, + "learning_rate": 6.0022159545872033e-05, + "loss": 3.7029, + "step": 10390 + }, + { + "epoch": 1.069078947368421, + "grad_norm": 1.312340136283571, + "learning_rate": 5.998737259089159e-05, + "loss": 3.6597, + "step": 10400 + }, + { + "epoch": 1.0701069078947367, + "grad_norm": 0.9837532198997452, + "learning_rate": 5.99525663947793e-05, + "loss": 3.7272, + "step": 10410 + }, + { + "epoch": 1.0711348684210527, + "grad_norm": 2.060647537159109, + "learning_rate": 5.9917740995831984e-05, + "loss": 3.6892, + "step": 10420 + }, + { + "epoch": 1.0721628289473684, + "grad_norm": 1.2550692764858387, + "learning_rate": 5.9882896432367566e-05, + "loss": 3.7269, + "step": 10430 + }, + { + "epoch": 1.0731907894736843, + "grad_norm": 0.8264066394178223, + "learning_rate": 5.9848032742725096e-05, + "loss": 3.6419, + 
"step": 10440 + }, + { + "epoch": 1.07421875, + "grad_norm": 1.0252688962343348, + "learning_rate": 5.9813149965264624e-05, + "loss": 3.698, + "step": 10450 + }, + { + "epoch": 1.0752467105263157, + "grad_norm": 0.9567312587859174, + "learning_rate": 5.977824813836723e-05, + "loss": 3.7156, + "step": 10460 + }, + { + "epoch": 1.0762746710526316, + "grad_norm": 1.1861978217528633, + "learning_rate": 5.974332730043495e-05, + "loss": 3.7405, + "step": 10470 + }, + { + "epoch": 1.0773026315789473, + "grad_norm": 0.8922205235771371, + "learning_rate": 5.9708387489890724e-05, + "loss": 3.7531, + "step": 10480 + }, + { + "epoch": 1.0783305921052633, + "grad_norm": 1.8356039840019918, + "learning_rate": 5.9673428745178394e-05, + "loss": 3.7587, + "step": 10490 + }, + { + "epoch": 1.079358552631579, + "grad_norm": 1.0951130684084074, + "learning_rate": 5.96384511047626e-05, + "loss": 3.6998, + "step": 10500 + }, + { + "epoch": 1.0803865131578947, + "grad_norm": 1.3788481962178418, + "learning_rate": 5.960345460712881e-05, + "loss": 3.7061, + "step": 10510 + }, + { + "epoch": 1.0814144736842106, + "grad_norm": 1.2289243179031026, + "learning_rate": 5.9568439290783196e-05, + "loss": 3.7334, + "step": 10520 + }, + { + "epoch": 1.0824424342105263, + "grad_norm": 1.621320079032989, + "learning_rate": 5.95334051942527e-05, + "loss": 3.7629, + "step": 10530 + }, + { + "epoch": 1.083470394736842, + "grad_norm": 0.8576542706639597, + "learning_rate": 5.9498352356084846e-05, + "loss": 3.704, + "step": 10540 + }, + { + "epoch": 1.084498355263158, + "grad_norm": 1.459855520467782, + "learning_rate": 5.946328081484785e-05, + "loss": 3.8247, + "step": 10550 + }, + { + "epoch": 1.0855263157894737, + "grad_norm": 0.8912086247688564, + "learning_rate": 5.942819060913048e-05, + "loss": 3.7812, + "step": 10560 + }, + { + "epoch": 1.0865542763157894, + "grad_norm": 1.001627372747458, + "learning_rate": 5.939308177754202e-05, + "loss": 3.7199, + "step": 10570 + }, + { + "epoch": 1.0875822368421053, + "grad_norm": 1.0778560579412377, + "learning_rate": 5.9357954358712284e-05, + "loss": 3.6953, + "step": 10580 + }, + { + "epoch": 1.088610197368421, + "grad_norm": 0.9840140816390394, + "learning_rate": 5.9322808391291514e-05, + "loss": 3.7084, + "step": 10590 + }, + { + "epoch": 1.0896381578947367, + "grad_norm": 1.2777723240851149, + "learning_rate": 5.928764391395037e-05, + "loss": 3.7659, + "step": 10600 + }, + { + "epoch": 1.0906661184210527, + "grad_norm": 1.0350104463206735, + "learning_rate": 5.9252460965379875e-05, + "loss": 3.6834, + "step": 10610 + }, + { + "epoch": 1.0916940789473684, + "grad_norm": 1.2769354771479537, + "learning_rate": 5.921725958429137e-05, + "loss": 3.7339, + "step": 10620 + }, + { + "epoch": 1.0927220394736843, + "grad_norm": 1.7786905069917238, + "learning_rate": 5.9182039809416496e-05, + "loss": 3.6624, + "step": 10630 + }, + { + "epoch": 1.09375, + "grad_norm": 1.3153983312776318, + "learning_rate": 5.9146801679507113e-05, + "loss": 3.7292, + "step": 10640 + }, + { + "epoch": 1.0947779605263157, + "grad_norm": 1.3800389983706713, + "learning_rate": 5.911154523333528e-05, + "loss": 3.7024, + "step": 10650 + }, + { + "epoch": 1.0958059210526316, + "grad_norm": 0.8961709332671101, + "learning_rate": 5.907627050969321e-05, + "loss": 3.7058, + "step": 10660 + }, + { + "epoch": 1.0968338815789473, + "grad_norm": 0.81863620782654, + "learning_rate": 5.904097754739325e-05, + "loss": 3.6643, + "step": 10670 + }, + { + "epoch": 1.0978618421052633, + "grad_norm": 0.7116981880241033, + 
"learning_rate": 5.900566638526777e-05, + "loss": 3.6901, + "step": 10680 + }, + { + "epoch": 1.098889802631579, + "grad_norm": 0.9034188871468686, + "learning_rate": 5.89703370621692e-05, + "loss": 3.7085, + "step": 10690 + }, + { + "epoch": 1.0999177631578947, + "grad_norm": 1.0571530945017087, + "learning_rate": 5.893498961696992e-05, + "loss": 3.7067, + "step": 10700 + }, + { + "epoch": 1.1009457236842106, + "grad_norm": 0.9291604415331493, + "learning_rate": 5.8899624088562316e-05, + "loss": 3.7053, + "step": 10710 + }, + { + "epoch": 1.1019736842105263, + "grad_norm": 1.2778297922816648, + "learning_rate": 5.88642405158586e-05, + "loss": 3.7632, + "step": 10720 + }, + { + "epoch": 1.103001644736842, + "grad_norm": 1.122146803039068, + "learning_rate": 5.8828838937790846e-05, + "loss": 3.8095, + "step": 10730 + }, + { + "epoch": 1.104029605263158, + "grad_norm": 1.0483890232300392, + "learning_rate": 5.879341939331097e-05, + "loss": 3.6838, + "step": 10740 + }, + { + "epoch": 1.1050575657894737, + "grad_norm": 1.3449740933472316, + "learning_rate": 5.8757981921390646e-05, + "loss": 3.7291, + "step": 10750 + }, + { + "epoch": 1.1060855263157894, + "grad_norm": 0.8410586419202363, + "learning_rate": 5.872252656102129e-05, + "loss": 3.6604, + "step": 10760 + }, + { + "epoch": 1.1071134868421053, + "grad_norm": 1.0302520315830164, + "learning_rate": 5.8687053351213945e-05, + "loss": 3.7378, + "step": 10770 + }, + { + "epoch": 1.108141447368421, + "grad_norm": 1.535778201157721, + "learning_rate": 5.865156233099934e-05, + "loss": 3.7694, + "step": 10780 + }, + { + "epoch": 1.1091694078947367, + "grad_norm": 1.40415174983824, + "learning_rate": 5.861605353942781e-05, + "loss": 3.7332, + "step": 10790 + }, + { + "epoch": 1.1101973684210527, + "grad_norm": 0.9360221340525281, + "learning_rate": 5.8580527015569204e-05, + "loss": 3.6091, + "step": 10800 + }, + { + "epoch": 1.1112253289473684, + "grad_norm": 0.9268509041245037, + "learning_rate": 5.854498279851289e-05, + "loss": 3.6892, + "step": 10810 + }, + { + "epoch": 1.1122532894736843, + "grad_norm": 0.9300096418537216, + "learning_rate": 5.8509420927367735e-05, + "loss": 3.7492, + "step": 10820 + }, + { + "epoch": 1.11328125, + "grad_norm": 1.2196978282369, + "learning_rate": 5.847384144126199e-05, + "loss": 3.7248, + "step": 10830 + }, + { + "epoch": 1.1143092105263157, + "grad_norm": 1.077256889797513, + "learning_rate": 5.843824437934333e-05, + "loss": 3.7262, + "step": 10840 + }, + { + "epoch": 1.1153371710526316, + "grad_norm": 0.753212842131768, + "learning_rate": 5.840262978077874e-05, + "loss": 3.7488, + "step": 10850 + }, + { + "epoch": 1.1163651315789473, + "grad_norm": 0.8577663972066164, + "learning_rate": 5.8366997684754496e-05, + "loss": 3.7045, + "step": 10860 + }, + { + "epoch": 1.1173930921052633, + "grad_norm": 1.2422814963423066, + "learning_rate": 5.833134813047615e-05, + "loss": 3.7036, + "step": 10870 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 0.7905616493627655, + "learning_rate": 5.829568115716843e-05, + "loss": 3.7546, + "step": 10880 + }, + { + "epoch": 1.1194490131578947, + "grad_norm": 1.1647968408156615, + "learning_rate": 5.825999680407527e-05, + "loss": 3.7277, + "step": 10890 + }, + { + "epoch": 1.1204769736842106, + "grad_norm": 0.872490383357525, + "learning_rate": 5.8224295110459694e-05, + "loss": 3.711, + "step": 10900 + }, + { + "epoch": 1.1215049342105263, + "grad_norm": 1.0350741947688835, + "learning_rate": 5.818857611560381e-05, + "loss": 3.6684, + "step": 10910 + }, + { + "epoch": 
1.122532894736842, + "grad_norm": 1.0453465249492682, + "learning_rate": 5.8152839858808796e-05, + "loss": 3.7235, + "step": 10920 + }, + { + "epoch": 1.123560855263158, + "grad_norm": 0.8812826962188436, + "learning_rate": 5.811708637939477e-05, + "loss": 3.6737, + "step": 10930 + }, + { + "epoch": 1.1245888157894737, + "grad_norm": 0.888192754009336, + "learning_rate": 5.8081315716700845e-05, + "loss": 3.5983, + "step": 10940 + }, + { + "epoch": 1.1256167763157894, + "grad_norm": 0.859474245756292, + "learning_rate": 5.804552791008503e-05, + "loss": 3.6946, + "step": 10950 + }, + { + "epoch": 1.1266447368421053, + "grad_norm": 1.0942467787487995, + "learning_rate": 5.800972299892415e-05, + "loss": 3.6457, + "step": 10960 + }, + { + "epoch": 1.127672697368421, + "grad_norm": 1.873035991744483, + "learning_rate": 5.797390102261394e-05, + "loss": 3.7246, + "step": 10970 + }, + { + "epoch": 1.1287006578947367, + "grad_norm": 1.135728489931776, + "learning_rate": 5.793806202056882e-05, + "loss": 3.7027, + "step": 10980 + }, + { + "epoch": 1.1297286184210527, + "grad_norm": 0.8300074289173277, + "learning_rate": 5.7902206032222007e-05, + "loss": 3.6908, + "step": 10990 + }, + { + "epoch": 1.1307565789473684, + "grad_norm": 0.8212591975493966, + "learning_rate": 5.786633309702539e-05, + "loss": 3.7006, + "step": 11000 + }, + { + "epoch": 1.1317845394736843, + "grad_norm": 0.983978191506226, + "learning_rate": 5.7830443254449475e-05, + "loss": 3.7219, + "step": 11010 + }, + { + "epoch": 1.1328125, + "grad_norm": 1.02208223953738, + "learning_rate": 5.779453654398341e-05, + "loss": 3.7909, + "step": 11020 + }, + { + "epoch": 1.1338404605263157, + "grad_norm": 1.41141177261338, + "learning_rate": 5.775861300513489e-05, + "loss": 3.7213, + "step": 11030 + }, + { + "epoch": 1.1348684210526316, + "grad_norm": 0.9515430579994576, + "learning_rate": 5.7722672677430136e-05, + "loss": 3.7527, + "step": 11040 + }, + { + "epoch": 1.1358963815789473, + "grad_norm": 0.8746394368193657, + "learning_rate": 5.7686715600413815e-05, + "loss": 3.6959, + "step": 11050 + }, + { + "epoch": 1.1369243421052633, + "grad_norm": 1.48970664343148, + "learning_rate": 5.7650741813649026e-05, + "loss": 3.7437, + "step": 11060 + }, + { + "epoch": 1.137952302631579, + "grad_norm": 1.169417538462987, + "learning_rate": 5.7614751356717275e-05, + "loss": 3.6818, + "step": 11070 + }, + { + "epoch": 1.1389802631578947, + "grad_norm": 0.9114956802709423, + "learning_rate": 5.757874426921842e-05, + "loss": 3.739, + "step": 11080 + }, + { + "epoch": 1.1400082236842106, + "grad_norm": 1.089723094974734, + "learning_rate": 5.754272059077057e-05, + "loss": 3.7305, + "step": 11090 + }, + { + "epoch": 1.1410361842105263, + "grad_norm": 1.6134697386058563, + "learning_rate": 5.750668036101013e-05, + "loss": 3.7733, + "step": 11100 + }, + { + "epoch": 1.142064144736842, + "grad_norm": 1.5185659132171492, + "learning_rate": 5.74706236195917e-05, + "loss": 3.7508, + "step": 11110 + }, + { + "epoch": 1.143092105263158, + "grad_norm": 1.0805362881661797, + "learning_rate": 5.743455040618807e-05, + "loss": 3.7459, + "step": 11120 + }, + { + "epoch": 1.1441200657894737, + "grad_norm": 0.8625540799311175, + "learning_rate": 5.7398460760490093e-05, + "loss": 3.7121, + "step": 11130 + }, + { + "epoch": 1.1451480263157894, + "grad_norm": 1.4122082687508624, + "learning_rate": 5.736235472220676e-05, + "loss": 3.6827, + "step": 11140 + }, + { + "epoch": 1.1461759868421053, + "grad_norm": 0.8607399244735326, + "learning_rate": 5.73262323310651e-05, + 
"loss": 3.706, + "step": 11150 + }, + { + "epoch": 1.147203947368421, + "grad_norm": 1.308863461156962, + "learning_rate": 5.72900936268101e-05, + "loss": 3.6314, + "step": 11160 + }, + { + "epoch": 1.1482319078947367, + "grad_norm": 0.9186085520740946, + "learning_rate": 5.7253938649204724e-05, + "loss": 3.7631, + "step": 11170 + }, + { + "epoch": 1.1492598684210527, + "grad_norm": 0.6995899770700305, + "learning_rate": 5.7217767438029823e-05, + "loss": 3.7833, + "step": 11180 + }, + { + "epoch": 1.1502878289473684, + "grad_norm": 1.0715657157771823, + "learning_rate": 5.7181580033084104e-05, + "loss": 3.6307, + "step": 11190 + }, + { + "epoch": 1.1513157894736843, + "grad_norm": 1.0633276352585217, + "learning_rate": 5.714537647418413e-05, + "loss": 3.7027, + "step": 11200 + }, + { + "epoch": 1.15234375, + "grad_norm": 0.9521376076106852, + "learning_rate": 5.7109156801164194e-05, + "loss": 3.7228, + "step": 11210 + }, + { + "epoch": 1.1533717105263157, + "grad_norm": 0.9964616582263238, + "learning_rate": 5.7072921053876355e-05, + "loss": 3.6484, + "step": 11220 + }, + { + "epoch": 1.1543996710526316, + "grad_norm": 1.4350758720570076, + "learning_rate": 5.703666927219032e-05, + "loss": 3.6937, + "step": 11230 + }, + { + "epoch": 1.1554276315789473, + "grad_norm": 0.8236124782933307, + "learning_rate": 5.700040149599347e-05, + "loss": 3.7167, + "step": 11240 + }, + { + "epoch": 1.1564555921052633, + "grad_norm": 1.2698734233135593, + "learning_rate": 5.696411776519078e-05, + "loss": 3.669, + "step": 11250 + }, + { + "epoch": 1.157483552631579, + "grad_norm": 1.034494613231501, + "learning_rate": 5.692781811970477e-05, + "loss": 3.7008, + "step": 11260 + }, + { + "epoch": 1.1585115131578947, + "grad_norm": 0.9829755857002882, + "learning_rate": 5.6891502599475474e-05, + "loss": 3.7347, + "step": 11270 + }, + { + "epoch": 1.1595394736842106, + "grad_norm": 1.2223482079066834, + "learning_rate": 5.685517124446039e-05, + "loss": 3.6893, + "step": 11280 + }, + { + "epoch": 1.1605674342105263, + "grad_norm": 1.0710376037528508, + "learning_rate": 5.681882409463444e-05, + "loss": 3.7057, + "step": 11290 + }, + { + "epoch": 1.161595394736842, + "grad_norm": 1.8073847047748515, + "learning_rate": 5.6782461189989924e-05, + "loss": 3.5241, + "step": 11300 + }, + { + "epoch": 1.162623355263158, + "grad_norm": 1.0136479428209622, + "learning_rate": 5.6746082570536504e-05, + "loss": 3.7123, + "step": 11310 + }, + { + "epoch": 1.1636513157894737, + "grad_norm": 0.8449962031262208, + "learning_rate": 5.6709688276301074e-05, + "loss": 3.7585, + "step": 11320 + }, + { + "epoch": 1.1646792763157894, + "grad_norm": 1.0305029157017045, + "learning_rate": 5.6673278347327844e-05, + "loss": 3.7254, + "step": 11330 + }, + { + "epoch": 1.1657072368421053, + "grad_norm": 1.5933888344029832, + "learning_rate": 5.663685282367816e-05, + "loss": 3.7304, + "step": 11340 + }, + { + "epoch": 1.166735197368421, + "grad_norm": 1.1612028801826135, + "learning_rate": 5.6600411745430584e-05, + "loss": 3.7315, + "step": 11350 + }, + { + "epoch": 1.1677631578947367, + "grad_norm": 0.7568938989902173, + "learning_rate": 5.6563955152680744e-05, + "loss": 3.7651, + "step": 11360 + }, + { + "epoch": 1.1687911184210527, + "grad_norm": 2.9441663702727, + "learning_rate": 5.652748308554138e-05, + "loss": 3.7165, + "step": 11370 + }, + { + "epoch": 1.1698190789473684, + "grad_norm": 1.1099327189872392, + "learning_rate": 5.649099558414224e-05, + "loss": 3.676, + "step": 11380 + }, + { + "epoch": 1.1708470394736843, + "grad_norm": 
1.0870173731646462, + "learning_rate": 5.6454492688630045e-05, + "loss": 3.7084, + "step": 11390 + }, + { + "epoch": 1.171875, + "grad_norm": 0.8030400123603563, + "learning_rate": 5.6417974439168467e-05, + "loss": 3.8254, + "step": 11400 + }, + { + "epoch": 1.1729029605263157, + "grad_norm": 0.7976010698591574, + "learning_rate": 5.638144087593805e-05, + "loss": 3.6794, + "step": 11410 + }, + { + "epoch": 1.1739309210526316, + "grad_norm": 0.9691111291458312, + "learning_rate": 5.634489203913623e-05, + "loss": 3.7426, + "step": 11420 + }, + { + "epoch": 1.1749588815789473, + "grad_norm": 1.1672578006597294, + "learning_rate": 5.630832796897723e-05, + "loss": 3.6784, + "step": 11430 + }, + { + "epoch": 1.1759868421052633, + "grad_norm": 1.4488356256072277, + "learning_rate": 5.6271748705692005e-05, + "loss": 3.7004, + "step": 11440 + }, + { + "epoch": 1.177014802631579, + "grad_norm": 1.3418112230713717, + "learning_rate": 5.623515428952825e-05, + "loss": 3.6776, + "step": 11450 + }, + { + "epoch": 1.1780427631578947, + "grad_norm": 1.0591491143317688, + "learning_rate": 5.6198544760750344e-05, + "loss": 3.7407, + "step": 11460 + }, + { + "epoch": 1.1790707236842106, + "grad_norm": 1.2462686637161948, + "learning_rate": 5.6161920159639266e-05, + "loss": 3.7215, + "step": 11470 + }, + { + "epoch": 1.1800986842105263, + "grad_norm": 0.9934880601197145, + "learning_rate": 5.612528052649262e-05, + "loss": 3.675, + "step": 11480 + }, + { + "epoch": 1.181126644736842, + "grad_norm": 0.9993735377698821, + "learning_rate": 5.6088625901624504e-05, + "loss": 3.6985, + "step": 11490 + }, + { + "epoch": 1.182154605263158, + "grad_norm": 1.0200190053365985, + "learning_rate": 5.6051956325365546e-05, + "loss": 3.7084, + "step": 11500 + }, + { + "epoch": 1.1831825657894737, + "grad_norm": 0.925397640538222, + "learning_rate": 5.60152718380628e-05, + "loss": 3.7029, + "step": 11510 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.9307551624586295, + "learning_rate": 5.597857248007973e-05, + "loss": 3.6636, + "step": 11520 + }, + { + "epoch": 1.1852384868421053, + "grad_norm": 0.901941217911386, + "learning_rate": 5.5941858291796195e-05, + "loss": 3.6736, + "step": 11530 + }, + { + "epoch": 1.186266447368421, + "grad_norm": 0.8641143852231974, + "learning_rate": 5.590512931360831e-05, + "loss": 3.7217, + "step": 11540 + }, + { + "epoch": 1.1872944078947367, + "grad_norm": 1.0373770445617245, + "learning_rate": 5.5868385585928523e-05, + "loss": 3.6227, + "step": 11550 + }, + { + "epoch": 1.1883223684210527, + "grad_norm": 0.8023353940237049, + "learning_rate": 5.5831627149185504e-05, + "loss": 3.6501, + "step": 11560 + }, + { + "epoch": 1.1893503289473684, + "grad_norm": 0.9008379836466761, + "learning_rate": 5.5794854043824046e-05, + "loss": 3.6864, + "step": 11570 + }, + { + "epoch": 1.1903782894736843, + "grad_norm": 1.2797951230356368, + "learning_rate": 5.5758066310305144e-05, + "loss": 3.7087, + "step": 11580 + }, + { + "epoch": 1.19140625, + "grad_norm": 1.4903445221877603, + "learning_rate": 5.5721263989105895e-05, + "loss": 3.658, + "step": 11590 + }, + { + "epoch": 1.1924342105263157, + "grad_norm": 1.4993771213454472, + "learning_rate": 5.568444712071937e-05, + "loss": 3.789, + "step": 11600 + }, + { + "epoch": 1.1934621710526316, + "grad_norm": 0.8943579893273892, + "learning_rate": 5.5647615745654745e-05, + "loss": 3.691, + "step": 11610 + }, + { + "epoch": 1.1944901315789473, + "grad_norm": 0.8304369700569304, + "learning_rate": 5.561076990443709e-05, + "loss": 3.7463, + "step": 11620 
+ }, + { + "epoch": 1.1955180921052633, + "grad_norm": 0.9688634805970222, + "learning_rate": 5.5573909637607396e-05, + "loss": 3.7851, + "step": 11630 + }, + { + "epoch": 1.196546052631579, + "grad_norm": 0.9639402475852766, + "learning_rate": 5.5537034985722585e-05, + "loss": 3.713, + "step": 11640 + }, + { + "epoch": 1.1975740131578947, + "grad_norm": 1.3441902592703796, + "learning_rate": 5.550014598935531e-05, + "loss": 3.6517, + "step": 11650 + }, + { + "epoch": 1.1986019736842106, + "grad_norm": 0.9543348044853672, + "learning_rate": 5.54632426890941e-05, + "loss": 3.8107, + "step": 11660 + }, + { + "epoch": 1.1996299342105263, + "grad_norm": 0.854912738306765, + "learning_rate": 5.542632512554317e-05, + "loss": 3.6809, + "step": 11670 + }, + { + "epoch": 1.200657894736842, + "grad_norm": 0.8782280233637636, + "learning_rate": 5.538939333932244e-05, + "loss": 3.6454, + "step": 11680 + }, + { + "epoch": 1.201685855263158, + "grad_norm": 1.2122977215858854, + "learning_rate": 5.535244737106749e-05, + "loss": 3.7095, + "step": 11690 + }, + { + "epoch": 1.2027138157894737, + "grad_norm": 1.5389415867745833, + "learning_rate": 5.531548726142947e-05, + "loss": 3.6891, + "step": 11700 + }, + { + "epoch": 1.2037417763157894, + "grad_norm": 216.38909904960553, + "learning_rate": 5.527851305107515e-05, + "loss": 3.8048, + "step": 11710 + }, + { + "epoch": 1.2047697368421053, + "grad_norm": 5.83761127374038, + "learning_rate": 5.524152478068677e-05, + "loss": 3.835, + "step": 11720 + }, + { + "epoch": 1.205797697368421, + "grad_norm": 2.717477347131275, + "learning_rate": 5.520452249096202e-05, + "loss": 3.7647, + "step": 11730 + }, + { + "epoch": 1.2068256578947367, + "grad_norm": 1.2215726883317721, + "learning_rate": 5.516750622261408e-05, + "loss": 3.695, + "step": 11740 + }, + { + "epoch": 1.2078536184210527, + "grad_norm": 0.9041636556291032, + "learning_rate": 5.5130476016371464e-05, + "loss": 3.7886, + "step": 11750 + }, + { + "epoch": 1.2088815789473684, + "grad_norm": 1.3535127185608675, + "learning_rate": 5.509343191297802e-05, + "loss": 3.7148, + "step": 11760 + }, + { + "epoch": 1.2099095394736843, + "grad_norm": 0.8455717764469719, + "learning_rate": 5.505637395319293e-05, + "loss": 3.7951, + "step": 11770 + }, + { + "epoch": 1.2109375, + "grad_norm": 1.1938808069911668, + "learning_rate": 5.501930217779056e-05, + "loss": 3.7398, + "step": 11780 + }, + { + "epoch": 1.2119654605263157, + "grad_norm": 1.1935557002576858, + "learning_rate": 5.498221662756053e-05, + "loss": 3.6647, + "step": 11790 + }, + { + "epoch": 1.2129934210526316, + "grad_norm": 1.0625186806326845, + "learning_rate": 5.494511734330759e-05, + "loss": 3.773, + "step": 11800 + }, + { + "epoch": 1.2140213815789473, + "grad_norm": 1.0459604966160347, + "learning_rate": 5.4908004365851597e-05, + "loss": 3.6895, + "step": 11810 + }, + { + "epoch": 1.2150493421052633, + "grad_norm": 0.7805013884630384, + "learning_rate": 5.4870877736027505e-05, + "loss": 3.7393, + "step": 11820 + }, + { + "epoch": 1.216077302631579, + "grad_norm": 0.9759439264062082, + "learning_rate": 5.483373749468524e-05, + "loss": 3.7448, + "step": 11830 + }, + { + "epoch": 1.2171052631578947, + "grad_norm": 1.0872049961946506, + "learning_rate": 5.4796583682689755e-05, + "loss": 3.6721, + "step": 11840 + }, + { + "epoch": 1.2181332236842106, + "grad_norm": 0.8661516893976052, + "learning_rate": 5.475941634092092e-05, + "loss": 3.7272, + "step": 11850 + }, + { + "epoch": 1.2191611842105263, + "grad_norm": 1.2215332626548308, + "learning_rate": 
5.4722235510273456e-05, + "loss": 3.7164, + "step": 11860 + }, + { + "epoch": 1.220189144736842, + "grad_norm": 1.1063655275700695, + "learning_rate": 5.468504123165698e-05, + "loss": 3.6824, + "step": 11870 + }, + { + "epoch": 1.221217105263158, + "grad_norm": 1.2674635260787115, + "learning_rate": 5.4647833545995865e-05, + "loss": 3.6975, + "step": 11880 + }, + { + "epoch": 1.2222450657894737, + "grad_norm": 1.192675505384855, + "learning_rate": 5.461061249422925e-05, + "loss": 3.7264, + "step": 11890 + }, + { + "epoch": 1.2232730263157894, + "grad_norm": 1.1963535338175784, + "learning_rate": 5.4573378117311015e-05, + "loss": 3.7411, + "step": 11900 + }, + { + "epoch": 1.2243009868421053, + "grad_norm": 1.4804004663507386, + "learning_rate": 5.453613045620962e-05, + "loss": 3.7429, + "step": 11910 + }, + { + "epoch": 1.225328947368421, + "grad_norm": 0.8477655507300761, + "learning_rate": 5.449886955190819e-05, + "loss": 3.6401, + "step": 11920 + }, + { + "epoch": 1.2263569078947367, + "grad_norm": 0.9443598904969378, + "learning_rate": 5.4461595445404456e-05, + "loss": 3.6749, + "step": 11930 + }, + { + "epoch": 1.2273848684210527, + "grad_norm": 0.8117872926999857, + "learning_rate": 5.442430817771064e-05, + "loss": 3.7884, + "step": 11940 + }, + { + "epoch": 1.2284128289473684, + "grad_norm": 1.132875480216534, + "learning_rate": 5.43870077898534e-05, + "loss": 3.7102, + "step": 11950 + }, + { + "epoch": 1.2294407894736843, + "grad_norm": 1.575928698834184, + "learning_rate": 5.43496943228739e-05, + "loss": 3.6859, + "step": 11960 + }, + { + "epoch": 1.23046875, + "grad_norm": 1.6070095567068947, + "learning_rate": 5.4312367817827687e-05, + "loss": 3.6878, + "step": 11970 + }, + { + "epoch": 1.2314967105263157, + "grad_norm": 1.2611943150033773, + "learning_rate": 5.4275028315784604e-05, + "loss": 3.7238, + "step": 11980 + }, + { + "epoch": 1.2325246710526316, + "grad_norm": 1.2012107284588887, + "learning_rate": 5.423767585782885e-05, + "loss": 3.6585, + "step": 11990 + }, + { + "epoch": 1.2335526315789473, + "grad_norm": 1.2658415715039715, + "learning_rate": 5.420031048505885e-05, + "loss": 3.6753, + "step": 12000 + }, + { + "epoch": 1.2345805921052633, + "grad_norm": 1.3052235033392283, + "learning_rate": 5.4162932238587243e-05, + "loss": 3.6078, + "step": 12010 + }, + { + "epoch": 1.235608552631579, + "grad_norm": 1.542288792152445, + "learning_rate": 5.4125541159540825e-05, + "loss": 3.7517, + "step": 12020 + }, + { + "epoch": 1.2366365131578947, + "grad_norm": 0.9230752018001598, + "learning_rate": 5.408813728906053e-05, + "loss": 3.7194, + "step": 12030 + }, + { + "epoch": 1.2376644736842106, + "grad_norm": 1.8019502035663506, + "learning_rate": 5.405072066830137e-05, + "loss": 3.6751, + "step": 12040 + }, + { + "epoch": 1.2386924342105263, + "grad_norm": 2.136648563996675, + "learning_rate": 5.401329133843234e-05, + "loss": 3.6458, + "step": 12050 + }, + { + "epoch": 1.239720394736842, + "grad_norm": 0.9518551105668057, + "learning_rate": 5.3975849340636475e-05, + "loss": 3.7825, + "step": 12060 + }, + { + "epoch": 1.240748355263158, + "grad_norm": 1.0030737961865062, + "learning_rate": 5.393839471611072e-05, + "loss": 3.6754, + "step": 12070 + }, + { + "epoch": 1.2417763157894737, + "grad_norm": 1.0163361627104166, + "learning_rate": 5.390092750606593e-05, + "loss": 3.7094, + "step": 12080 + }, + { + "epoch": 1.2428042763157894, + "grad_norm": 1.2765296418199648, + "learning_rate": 5.386344775172678e-05, + "loss": 3.7411, + "step": 12090 + }, + { + "epoch": 
1.2438322368421053, + "grad_norm": 0.9907248521896098, + "learning_rate": 5.382595549433176e-05, + "loss": 3.6793, + "step": 12100 + }, + { + "epoch": 1.244860197368421, + "grad_norm": 0.9233169593173997, + "learning_rate": 5.3788450775133137e-05, + "loss": 3.739, + "step": 12110 + }, + { + "epoch": 1.2458881578947367, + "grad_norm": 0.9747636630873286, + "learning_rate": 5.375093363539686e-05, + "loss": 3.7304, + "step": 12120 + }, + { + "epoch": 1.2469161184210527, + "grad_norm": 2.049327865761235, + "learning_rate": 5.3713404116402565e-05, + "loss": 3.7144, + "step": 12130 + }, + { + "epoch": 1.2479440789473684, + "grad_norm": 1.1488748353491707, + "learning_rate": 5.367586225944348e-05, + "loss": 3.6449, + "step": 12140 + }, + { + "epoch": 1.2489720394736843, + "grad_norm": 1.1198466738416128, + "learning_rate": 5.3638308105826466e-05, + "loss": 3.7137, + "step": 12150 + }, + { + "epoch": 1.25, + "grad_norm": 0.7755930369735939, + "learning_rate": 5.360074169687185e-05, + "loss": 3.6771, + "step": 12160 + }, + { + "epoch": 1.2510279605263157, + "grad_norm": 1.0589556198690742, + "learning_rate": 5.356316307391347e-05, + "loss": 3.7401, + "step": 12170 + }, + { + "epoch": 1.2520559210526316, + "grad_norm": 1.170547750992464, + "learning_rate": 5.352557227829861e-05, + "loss": 3.7627, + "step": 12180 + }, + { + "epoch": 1.2530838815789473, + "grad_norm": 1.2649513618255936, + "learning_rate": 5.3487969351387934e-05, + "loss": 3.6796, + "step": 12190 + }, + { + "epoch": 1.2541118421052633, + "grad_norm": 1.0444482603465595, + "learning_rate": 5.3450354334555476e-05, + "loss": 3.7424, + "step": 12200 + }, + { + "epoch": 1.255139802631579, + "grad_norm": 1.0831045618346666, + "learning_rate": 5.3412727269188546e-05, + "loss": 3.7009, + "step": 12210 + }, + { + "epoch": 1.2561677631578947, + "grad_norm": 0.7972370148178741, + "learning_rate": 5.337508819668772e-05, + "loss": 3.6904, + "step": 12220 + }, + { + "epoch": 1.2571957236842106, + "grad_norm": 0.7797211756342667, + "learning_rate": 5.333743715846678e-05, + "loss": 3.6492, + "step": 12230 + }, + { + "epoch": 1.2582236842105263, + "grad_norm": 1.560358410890787, + "learning_rate": 5.329977419595269e-05, + "loss": 3.6768, + "step": 12240 + }, + { + "epoch": 1.259251644736842, + "grad_norm": 0.9307174427138692, + "learning_rate": 5.326209935058554e-05, + "loss": 3.6352, + "step": 12250 + }, + { + "epoch": 1.260279605263158, + "grad_norm": 1.0328547389732294, + "learning_rate": 5.322441266381845e-05, + "loss": 3.6759, + "step": 12260 + }, + { + "epoch": 1.2613075657894737, + "grad_norm": 1.2781171411398584, + "learning_rate": 5.318671417711762e-05, + "loss": 3.7206, + "step": 12270 + }, + { + "epoch": 1.2623355263157894, + "grad_norm": 0.9910129292969809, + "learning_rate": 5.314900393196221e-05, + "loss": 3.6902, + "step": 12280 + }, + { + "epoch": 1.2633634868421053, + "grad_norm": 1.1683422499319598, + "learning_rate": 5.3111281969844304e-05, + "loss": 3.7571, + "step": 12290 + }, + { + "epoch": 1.264391447368421, + "grad_norm": 1.065496035292368, + "learning_rate": 5.307354833226889e-05, + "loss": 3.6114, + "step": 12300 + }, + { + "epoch": 1.2654194078947367, + "grad_norm": 1.0414686523422803, + "learning_rate": 5.303580306075384e-05, + "loss": 3.715, + "step": 12310 + }, + { + "epoch": 1.2664473684210527, + "grad_norm": 1.2028604191753915, + "learning_rate": 5.299804619682974e-05, + "loss": 3.6816, + "step": 12320 + }, + { + "epoch": 1.2674753289473684, + "grad_norm": 0.7591462344097518, + "learning_rate": 5.2960277782040034e-05, + 
"loss": 3.675, + "step": 12330 + }, + { + "epoch": 1.2685032894736843, + "grad_norm": 1.0675445182862162, + "learning_rate": 5.29224978579408e-05, + "loss": 3.6919, + "step": 12340 + }, + { + "epoch": 1.26953125, + "grad_norm": 1.5124008650645335, + "learning_rate": 5.28847064661008e-05, + "loss": 3.6838, + "step": 12350 + }, + { + "epoch": 1.2705592105263157, + "grad_norm": 1.2449997072334404, + "learning_rate": 5.284690364810144e-05, + "loss": 3.7537, + "step": 12360 + }, + { + "epoch": 1.2715871710526316, + "grad_norm": 0.8389900304090588, + "learning_rate": 5.2809089445536646e-05, + "loss": 3.7373, + "step": 12370 + }, + { + "epoch": 1.2726151315789473, + "grad_norm": 1.2850660390991142, + "learning_rate": 5.277126390001292e-05, + "loss": 3.69, + "step": 12380 + }, + { + "epoch": 1.2736430921052633, + "grad_norm": 1.0943842002878426, + "learning_rate": 5.273342705314922e-05, + "loss": 3.6194, + "step": 12390 + }, + { + "epoch": 1.274671052631579, + "grad_norm": 1.2654768303687245, + "learning_rate": 5.269557894657694e-05, + "loss": 3.7023, + "step": 12400 + }, + { + "epoch": 1.2756990131578947, + "grad_norm": 1.240700605572574, + "learning_rate": 5.265771962193989e-05, + "loss": 3.6113, + "step": 12410 + }, + { + "epoch": 1.2767269736842106, + "grad_norm": 0.8601174008832484, + "learning_rate": 5.261984912089416e-05, + "loss": 3.7361, + "step": 12420 + }, + { + "epoch": 1.2777549342105263, + "grad_norm": 1.1035995046418845, + "learning_rate": 5.25819674851082e-05, + "loss": 3.7582, + "step": 12430 + }, + { + "epoch": 1.278782894736842, + "grad_norm": 1.3309655722931066, + "learning_rate": 5.25440747562627e-05, + "loss": 3.6851, + "step": 12440 + }, + { + "epoch": 1.279810855263158, + "grad_norm": 0.8924842300945195, + "learning_rate": 5.2506170976050505e-05, + "loss": 3.7704, + "step": 12450 + }, + { + "epoch": 1.2808388157894737, + "grad_norm": 0.8421970841069453, + "learning_rate": 5.24682561861767e-05, + "loss": 3.6796, + "step": 12460 + }, + { + "epoch": 1.2818667763157894, + "grad_norm": 0.9348520075621265, + "learning_rate": 5.2430330428358406e-05, + "loss": 3.6318, + "step": 12470 + }, + { + "epoch": 1.2828947368421053, + "grad_norm": 0.9334799086620951, + "learning_rate": 5.2392393744324874e-05, + "loss": 3.657, + "step": 12480 + }, + { + "epoch": 1.283922697368421, + "grad_norm": 1.0898821970949966, + "learning_rate": 5.2354446175817355e-05, + "loss": 3.6421, + "step": 12490 + }, + { + "epoch": 1.2849506578947367, + "grad_norm": 1.0457920337339304, + "learning_rate": 5.2316487764589046e-05, + "loss": 3.6801, + "step": 12500 + }, + { + "epoch": 1.2859786184210527, + "grad_norm": 0.964426699554839, + "learning_rate": 5.2278518552405114e-05, + "loss": 3.6739, + "step": 12510 + }, + { + "epoch": 1.2870065789473684, + "grad_norm": 1.3207416626961153, + "learning_rate": 5.2240538581042606e-05, + "loss": 3.6952, + "step": 12520 + }, + { + "epoch": 1.2880345394736843, + "grad_norm": 1.043432468380468, + "learning_rate": 5.220254789229038e-05, + "loss": 3.7638, + "step": 12530 + }, + { + "epoch": 1.2890625, + "grad_norm": 1.052433855692126, + "learning_rate": 5.216454652794912e-05, + "loss": 3.6971, + "step": 12540 + }, + { + "epoch": 1.2900904605263157, + "grad_norm": 1.4756409529421857, + "learning_rate": 5.2126534529831214e-05, + "loss": 3.6964, + "step": 12550 + }, + { + "epoch": 1.2911184210526316, + "grad_norm": 1.9071063010119218, + "learning_rate": 5.208851193976081e-05, + "loss": 3.6779, + "step": 12560 + }, + { + "epoch": 1.2921463815789473, + "grad_norm": 0.9274009024949873, 
+ "learning_rate": 5.2050478799573656e-05, + "loss": 3.686, + "step": 12570 + }, + { + "epoch": 1.2931743421052633, + "grad_norm": 0.928550890941705, + "learning_rate": 5.201243515111713e-05, + "loss": 3.7477, + "step": 12580 + }, + { + "epoch": 1.294202302631579, + "grad_norm": 0.949250397331229, + "learning_rate": 5.1974381036250174e-05, + "loss": 3.5533, + "step": 12590 + }, + { + "epoch": 1.2952302631578947, + "grad_norm": 0.8938893176246955, + "learning_rate": 5.193631649684324e-05, + "loss": 3.7237, + "step": 12600 + }, + { + "epoch": 1.2962582236842106, + "grad_norm": 1.4715205905833402, + "learning_rate": 5.1898241574778265e-05, + "loss": 3.6957, + "step": 12610 + }, + { + "epoch": 1.2972861842105263, + "grad_norm": 1.214591242315716, + "learning_rate": 5.186015631194858e-05, + "loss": 3.6896, + "step": 12620 + }, + { + "epoch": 1.298314144736842, + "grad_norm": 1.1312766878130847, + "learning_rate": 5.1822060750258915e-05, + "loss": 3.6355, + "step": 12630 + }, + { + "epoch": 1.299342105263158, + "grad_norm": 0.7875831635780085, + "learning_rate": 5.178395493162533e-05, + "loss": 3.6848, + "step": 12640 + }, + { + "epoch": 1.3003700657894737, + "grad_norm": 0.9302497894401691, + "learning_rate": 5.174583889797518e-05, + "loss": 3.6645, + "step": 12650 + }, + { + "epoch": 1.3013980263157894, + "grad_norm": 0.7691282194811989, + "learning_rate": 5.1707712691247026e-05, + "loss": 3.6283, + "step": 12660 + }, + { + "epoch": 1.3024259868421053, + "grad_norm": 1.004226989569841, + "learning_rate": 5.166957635339067e-05, + "loss": 3.6919, + "step": 12670 + }, + { + "epoch": 1.303453947368421, + "grad_norm": 1.4846245003873095, + "learning_rate": 5.163142992636701e-05, + "loss": 3.6798, + "step": 12680 + }, + { + "epoch": 1.3044819078947367, + "grad_norm": 1.252574653304233, + "learning_rate": 5.15932734521481e-05, + "loss": 3.7, + "step": 12690 + }, + { + "epoch": 1.3055098684210527, + "grad_norm": 1.1430300609443624, + "learning_rate": 5.155510697271699e-05, + "loss": 3.7004, + "step": 12700 + }, + { + "epoch": 1.3065378289473684, + "grad_norm": 2.1600691655352704, + "learning_rate": 5.151693053006778e-05, + "loss": 3.7215, + "step": 12710 + }, + { + "epoch": 1.3075657894736843, + "grad_norm": 1.86675726750166, + "learning_rate": 5.1478744166205516e-05, + "loss": 3.653, + "step": 12720 + }, + { + "epoch": 1.30859375, + "grad_norm": 1.125718576035474, + "learning_rate": 5.144054792314619e-05, + "loss": 3.6185, + "step": 12730 + }, + { + "epoch": 1.3096217105263157, + "grad_norm": 1.0322135123273324, + "learning_rate": 5.140234184291659e-05, + "loss": 3.7064, + "step": 12740 + }, + { + "epoch": 1.3106496710526316, + "grad_norm": 1.0611542987245537, + "learning_rate": 5.136412596755442e-05, + "loss": 3.6906, + "step": 12750 + }, + { + "epoch": 1.3116776315789473, + "grad_norm": 1.4062131664962394, + "learning_rate": 5.13259003391081e-05, + "loss": 3.7362, + "step": 12760 + }, + { + "epoch": 1.3127055921052633, + "grad_norm": 1.0280955069813407, + "learning_rate": 5.12876649996368e-05, + "loss": 3.6353, + "step": 12770 + }, + { + "epoch": 1.313733552631579, + "grad_norm": 0.8748788068492054, + "learning_rate": 5.1249419991210365e-05, + "loss": 3.675, + "step": 12780 + }, + { + "epoch": 1.3147615131578947, + "grad_norm": 1.1543687517278958, + "learning_rate": 5.121116535590931e-05, + "loss": 3.6974, + "step": 12790 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.9110327499084718, + "learning_rate": 5.1172901135824696e-05, + "loss": 3.6549, + "step": 12800 + }, + { + "epoch": 
1.3168174342105263, + "grad_norm": 1.4804039289229092, + "learning_rate": 5.113462737305817e-05, + "loss": 3.6533, + "step": 12810 + }, + { + "epoch": 1.317845394736842, + "grad_norm": 0.9941647269993993, + "learning_rate": 5.109634410972185e-05, + "loss": 3.6446, + "step": 12820 + }, + { + "epoch": 1.318873355263158, + "grad_norm": 1.4774452275485153, + "learning_rate": 5.1058051387938336e-05, + "loss": 3.7053, + "step": 12830 + }, + { + "epoch": 1.3199013157894737, + "grad_norm": 1.0525433255518255, + "learning_rate": 5.1019749249840604e-05, + "loss": 3.805, + "step": 12840 + }, + { + "epoch": 1.3209292763157894, + "grad_norm": 1.1161509153126086, + "learning_rate": 5.0981437737572005e-05, + "loss": 3.6016, + "step": 12850 + }, + { + "epoch": 1.3219572368421053, + "grad_norm": 1.0424985361432013, + "learning_rate": 5.094311689328621e-05, + "loss": 3.7375, + "step": 12860 + }, + { + "epoch": 1.322985197368421, + "grad_norm": 0.837585891760209, + "learning_rate": 5.090478675914717e-05, + "loss": 3.6875, + "step": 12870 + }, + { + "epoch": 1.3240131578947367, + "grad_norm": 1.1992361458874166, + "learning_rate": 5.086644737732901e-05, + "loss": 3.6855, + "step": 12880 + }, + { + "epoch": 1.3250411184210527, + "grad_norm": 1.0055769736875846, + "learning_rate": 5.082809879001607e-05, + "loss": 3.7563, + "step": 12890 + }, + { + "epoch": 1.3260690789473684, + "grad_norm": 0.693900464039598, + "learning_rate": 5.078974103940281e-05, + "loss": 3.659, + "step": 12900 + }, + { + "epoch": 1.3270970394736843, + "grad_norm": 1.069289170727484, + "learning_rate": 5.0751374167693786e-05, + "loss": 3.715, + "step": 12910 + }, + { + "epoch": 1.328125, + "grad_norm": 0.9769384652265323, + "learning_rate": 5.071299821710355e-05, + "loss": 3.6401, + "step": 12920 + }, + { + "epoch": 1.3291529605263157, + "grad_norm": 1.0525850037886753, + "learning_rate": 5.0674613229856695e-05, + "loss": 3.6253, + "step": 12930 + }, + { + "epoch": 1.3301809210526316, + "grad_norm": 1.2539761722831864, + "learning_rate": 5.063621924818771e-05, + "loss": 3.6424, + "step": 12940 + }, + { + "epoch": 1.3312088815789473, + "grad_norm": 1.0057749888752803, + "learning_rate": 5.0597816314341004e-05, + "loss": 3.7134, + "step": 12950 + }, + { + "epoch": 1.3322368421052633, + "grad_norm": 0.8320526335211573, + "learning_rate": 5.055940447057083e-05, + "loss": 3.6754, + "step": 12960 + }, + { + "epoch": 1.333264802631579, + "grad_norm": 1.288399358162162, + "learning_rate": 5.052098375914126e-05, + "loss": 3.6662, + "step": 12970 + }, + { + "epoch": 1.3342927631578947, + "grad_norm": 1.1513654273669223, + "learning_rate": 5.04825542223261e-05, + "loss": 3.6915, + "step": 12980 + }, + { + "epoch": 1.3353207236842106, + "grad_norm": 1.0562448377313813, + "learning_rate": 5.044411590240888e-05, + "loss": 3.7796, + "step": 12990 + }, + { + "epoch": 1.3363486842105263, + "grad_norm": 0.9962675055254577, + "learning_rate": 5.040566884168279e-05, + "loss": 3.639, + "step": 13000 + }, + { + "epoch": 1.337376644736842, + "grad_norm": 0.8826075628287259, + "learning_rate": 5.036721308245064e-05, + "loss": 3.7046, + "step": 13010 + }, + { + "epoch": 1.338404605263158, + "grad_norm": 1.0416122358724136, + "learning_rate": 5.0328748667024815e-05, + "loss": 3.7694, + "step": 13020 + }, + { + "epoch": 1.3394325657894737, + "grad_norm": 1.0794604226897078, + "learning_rate": 5.0290275637727196e-05, + "loss": 3.6514, + "step": 13030 + }, + { + "epoch": 1.3404605263157894, + "grad_norm": 0.898650169080577, + "learning_rate": 5.025179403688917e-05, + 
"loss": 3.6821, + "step": 13040 + }, + { + "epoch": 1.3414884868421053, + "grad_norm": 0.9099956583611176, + "learning_rate": 5.0213303906851584e-05, + "loss": 3.6171, + "step": 13050 + }, + { + "epoch": 1.342516447368421, + "grad_norm": 0.926716199609916, + "learning_rate": 5.0174805289964585e-05, + "loss": 3.7152, + "step": 13060 + }, + { + "epoch": 1.3435444078947367, + "grad_norm": 1.285681573701429, + "learning_rate": 5.013629822858773e-05, + "loss": 3.6592, + "step": 13070 + }, + { + "epoch": 1.3445723684210527, + "grad_norm": 0.9935625415309348, + "learning_rate": 5.009778276508986e-05, + "loss": 3.6534, + "step": 13080 + }, + { + "epoch": 1.3456003289473684, + "grad_norm": 0.8225984601972597, + "learning_rate": 5.005925894184901e-05, + "loss": 3.6004, + "step": 13090 + }, + { + "epoch": 1.3466282894736843, + "grad_norm": 0.9146360896252713, + "learning_rate": 5.002072680125249e-05, + "loss": 3.7085, + "step": 13100 + }, + { + "epoch": 1.34765625, + "grad_norm": 1.4618305597812127, + "learning_rate": 4.998218638569669e-05, + "loss": 3.7092, + "step": 13110 + }, + { + "epoch": 1.3486842105263157, + "grad_norm": 1.1100234164955518, + "learning_rate": 4.9943637737587144e-05, + "loss": 3.7094, + "step": 13120 + }, + { + "epoch": 1.3497121710526316, + "grad_norm": 0.9570203196621513, + "learning_rate": 4.990508089933844e-05, + "loss": 3.7648, + "step": 13130 + }, + { + "epoch": 1.3507401315789473, + "grad_norm": 1.0472823531889695, + "learning_rate": 4.986651591337416e-05, + "loss": 3.784, + "step": 13140 + }, + { + "epoch": 1.3517680921052633, + "grad_norm": 1.274365841475762, + "learning_rate": 4.9827942822126885e-05, + "loss": 3.7065, + "step": 13150 + }, + { + "epoch": 1.352796052631579, + "grad_norm": 0.9372941426921857, + "learning_rate": 4.978936166803807e-05, + "loss": 3.6673, + "step": 13160 + }, + { + "epoch": 1.3538240131578947, + "grad_norm": 0.892085500789066, + "learning_rate": 4.975077249355806e-05, + "loss": 3.7162, + "step": 13170 + }, + { + "epoch": 1.3548519736842106, + "grad_norm": 0.9162009891448878, + "learning_rate": 4.971217534114608e-05, + "loss": 3.6577, + "step": 13180 + }, + { + "epoch": 1.3558799342105263, + "grad_norm": 1.4421104633755593, + "learning_rate": 4.967357025327001e-05, + "loss": 3.6718, + "step": 13190 + }, + { + "epoch": 1.356907894736842, + "grad_norm": 1.444789317522658, + "learning_rate": 4.963495727240659e-05, + "loss": 3.6154, + "step": 13200 + }, + { + "epoch": 1.357935855263158, + "grad_norm": 0.8551537617515302, + "learning_rate": 4.959633644104116e-05, + "loss": 3.6746, + "step": 13210 + }, + { + "epoch": 1.3589638157894737, + "grad_norm": 1.6511893782941232, + "learning_rate": 4.955770780166773e-05, + "loss": 3.7095, + "step": 13220 + }, + { + "epoch": 1.3599917763157894, + "grad_norm": 1.0494936176122338, + "learning_rate": 4.95190713967889e-05, + "loss": 3.7113, + "step": 13230 + }, + { + "epoch": 1.3610197368421053, + "grad_norm": 0.8853376951715823, + "learning_rate": 4.9480427268915805e-05, + "loss": 3.6545, + "step": 13240 + }, + { + "epoch": 1.362047697368421, + "grad_norm": 1.1609046140713315, + "learning_rate": 4.944177546056808e-05, + "loss": 3.696, + "step": 13250 + }, + { + "epoch": 1.3630756578947367, + "grad_norm": 1.3760846368326134, + "learning_rate": 4.940311601427383e-05, + "loss": 3.7245, + "step": 13260 + }, + { + "epoch": 1.3641036184210527, + "grad_norm": 1.228813098983188, + "learning_rate": 4.9364448972569535e-05, + "loss": 3.6462, + "step": 13270 + }, + { + "epoch": 1.3651315789473684, + "grad_norm": 
1.014256009534221, + "learning_rate": 4.932577437800006e-05, + "loss": 3.665, + "step": 13280 + }, + { + "epoch": 1.3661595394736843, + "grad_norm": 1.4345257715175246, + "learning_rate": 4.9287092273118546e-05, + "loss": 3.6509, + "step": 13290 + }, + { + "epoch": 1.3671875, + "grad_norm": 1.0472314142475279, + "learning_rate": 4.924840270048643e-05, + "loss": 3.6556, + "step": 13300 + }, + { + "epoch": 1.3682154605263157, + "grad_norm": 0.7235061184532796, + "learning_rate": 4.920970570267336e-05, + "loss": 3.5963, + "step": 13310 + }, + { + "epoch": 1.3692434210526316, + "grad_norm": 1.3565686515171254, + "learning_rate": 4.917100132225711e-05, + "loss": 3.6172, + "step": 13320 + }, + { + "epoch": 1.3702713815789473, + "grad_norm": 1.043429954937952, + "learning_rate": 4.913228960182365e-05, + "loss": 3.7345, + "step": 13330 + }, + { + "epoch": 1.3712993421052633, + "grad_norm": 1.0449200496479383, + "learning_rate": 4.909357058396698e-05, + "loss": 3.8272, + "step": 13340 + }, + { + "epoch": 1.372327302631579, + "grad_norm": 0.8448407222805726, + "learning_rate": 4.905484431128914e-05, + "loss": 3.6948, + "step": 13350 + }, + { + "epoch": 1.3733552631578947, + "grad_norm": 1.4118809178850182, + "learning_rate": 4.9016110826400154e-05, + "loss": 3.6292, + "step": 13360 + }, + { + "epoch": 1.3743832236842106, + "grad_norm": 1.0938804003774338, + "learning_rate": 4.897737017191796e-05, + "loss": 3.6808, + "step": 13370 + }, + { + "epoch": 1.3754111842105263, + "grad_norm": 0.845546752004818, + "learning_rate": 4.8938622390468426e-05, + "loss": 3.6458, + "step": 13380 + }, + { + "epoch": 1.376439144736842, + "grad_norm": 0.8256621535645912, + "learning_rate": 4.889986752468524e-05, + "loss": 3.6462, + "step": 13390 + }, + { + "epoch": 1.377467105263158, + "grad_norm": 1.2982003666796273, + "learning_rate": 4.8861105617209855e-05, + "loss": 3.6706, + "step": 13400 + }, + { + "epoch": 1.3784950657894737, + "grad_norm": 1.091433824697376, + "learning_rate": 4.882233671069152e-05, + "loss": 3.6177, + "step": 13410 + }, + { + "epoch": 1.3795230263157894, + "grad_norm": 0.9440919231437399, + "learning_rate": 4.878356084778718e-05, + "loss": 3.6701, + "step": 13420 + }, + { + "epoch": 1.3805509868421053, + "grad_norm": 1.5995145031342148, + "learning_rate": 4.8744778071161394e-05, + "loss": 3.7591, + "step": 13430 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 0.9418900382417786, + "learning_rate": 4.8705988423486365e-05, + "loss": 3.7293, + "step": 13440 + }, + { + "epoch": 1.3826069078947367, + "grad_norm": 0.7871331526049513, + "learning_rate": 4.8667191947441855e-05, + "loss": 3.6945, + "step": 13450 + }, + { + "epoch": 1.3836348684210527, + "grad_norm": 0.9380959523969139, + "learning_rate": 4.862838868571511e-05, + "loss": 3.6742, + "step": 13460 + }, + { + "epoch": 1.3846628289473684, + "grad_norm": 0.9810385701007839, + "learning_rate": 4.8589578681000876e-05, + "loss": 3.7141, + "step": 13470 + }, + { + "epoch": 1.3856907894736843, + "grad_norm": 1.5117984721136062, + "learning_rate": 4.85507619760013e-05, + "loss": 3.7533, + "step": 13480 + }, + { + "epoch": 1.38671875, + "grad_norm": 1.0931926397091882, + "learning_rate": 4.85119386134259e-05, + "loss": 3.6543, + "step": 13490 + }, + { + "epoch": 1.3877467105263157, + "grad_norm": 1.582836709020964, + "learning_rate": 4.847310863599153e-05, + "loss": 3.6915, + "step": 13500 + }, + { + "epoch": 1.3887746710526316, + "grad_norm": 1.570587589594071, + "learning_rate": 4.8434272086422306e-05, + "loss": 3.689, + "step": 13510 + }, + 
{ + "epoch": 1.3898026315789473, + "grad_norm": 1.2663382416054585, + "learning_rate": 4.839542900744961e-05, + "loss": 3.583, + "step": 13520 + }, + { + "epoch": 1.3908305921052633, + "grad_norm": 1.0414904572539128, + "learning_rate": 4.8356579441811945e-05, + "loss": 3.7453, + "step": 13530 + }, + { + "epoch": 1.391858552631579, + "grad_norm": 0.9985007133233011, + "learning_rate": 4.831772343225502e-05, + "loss": 3.6793, + "step": 13540 + }, + { + "epoch": 1.3928865131578947, + "grad_norm": 1.2913590290461863, + "learning_rate": 4.8278861021531594e-05, + "loss": 3.6148, + "step": 13550 + }, + { + "epoch": 1.3939144736842106, + "grad_norm": 1.0021819509620833, + "learning_rate": 4.823999225240149e-05, + "loss": 3.6849, + "step": 13560 + }, + { + "epoch": 1.3949424342105263, + "grad_norm": 0.9846032801343144, + "learning_rate": 4.8201117167631495e-05, + "loss": 3.5868, + "step": 13570 + }, + { + "epoch": 1.395970394736842, + "grad_norm": 1.3881711419211593, + "learning_rate": 4.8162235809995395e-05, + "loss": 3.6701, + "step": 13580 + }, + { + "epoch": 1.396998355263158, + "grad_norm": 1.254002409958539, + "learning_rate": 4.812334822227382e-05, + "loss": 3.62, + "step": 13590 + }, + { + "epoch": 1.3980263157894737, + "grad_norm": 1.1810352130165598, + "learning_rate": 4.80844544472543e-05, + "loss": 3.725, + "step": 13600 + }, + { + "epoch": 1.3990542763157894, + "grad_norm": 0.7473504233216873, + "learning_rate": 4.804555452773115e-05, + "loss": 3.6416, + "step": 13610 + }, + { + "epoch": 1.4000822368421053, + "grad_norm": 0.9268271738833328, + "learning_rate": 4.800664850650547e-05, + "loss": 3.6767, + "step": 13620 + }, + { + "epoch": 1.401110197368421, + "grad_norm": 1.1445281774069518, + "learning_rate": 4.7967736426385025e-05, + "loss": 3.6671, + "step": 13630 + }, + { + "epoch": 1.4021381578947367, + "grad_norm": 1.363537477390196, + "learning_rate": 4.792881833018431e-05, + "loss": 3.6887, + "step": 13640 + }, + { + "epoch": 1.4031661184210527, + "grad_norm": 1.396627949623374, + "learning_rate": 4.788989426072439e-05, + "loss": 3.7274, + "step": 13650 + }, + { + "epoch": 1.4041940789473684, + "grad_norm": 1.2148831965184395, + "learning_rate": 4.785096426083292e-05, + "loss": 3.684, + "step": 13660 + }, + { + "epoch": 1.4052220394736843, + "grad_norm": 0.9647609041916939, + "learning_rate": 4.781202837334409e-05, + "loss": 3.7093, + "step": 13670 + }, + { + "epoch": 1.40625, + "grad_norm": 0.9171975325339525, + "learning_rate": 4.7773086641098536e-05, + "loss": 3.6335, + "step": 13680 + }, + { + "epoch": 1.4072779605263157, + "grad_norm": 1.2083605017841463, + "learning_rate": 4.773413910694338e-05, + "loss": 3.7077, + "step": 13690 + }, + { + "epoch": 1.4083059210526316, + "grad_norm": 1.1068561057358026, + "learning_rate": 4.769518581373206e-05, + "loss": 3.6878, + "step": 13700 + }, + { + "epoch": 1.4093338815789473, + "grad_norm": 1.2975900815736374, + "learning_rate": 4.76562268043244e-05, + "loss": 3.7077, + "step": 13710 + }, + { + "epoch": 1.4103618421052633, + "grad_norm": 1.1348277089769572, + "learning_rate": 4.76172621215865e-05, + "loss": 3.7551, + "step": 13720 + }, + { + "epoch": 1.411389802631579, + "grad_norm": 1.1854498382021201, + "learning_rate": 4.7578291808390695e-05, + "loss": 3.6284, + "step": 13730 + }, + { + "epoch": 1.4124177631578947, + "grad_norm": 0.9327839807083619, + "learning_rate": 4.753931590761555e-05, + "loss": 3.7005, + "step": 13740 + }, + { + "epoch": 1.4134457236842106, + "grad_norm": 1.2536290893366273, + "learning_rate": 
4.750033446214572e-05, + "loss": 3.6759, + "step": 13750 + }, + { + "epoch": 1.4144736842105263, + "grad_norm": 1.1710213158456668, + "learning_rate": 4.7461347514871966e-05, + "loss": 3.64, + "step": 13760 + }, + { + "epoch": 1.415501644736842, + "grad_norm": 1.166512745967056, + "learning_rate": 4.742235510869118e-05, + "loss": 3.6353, + "step": 13770 + }, + { + "epoch": 1.416529605263158, + "grad_norm": 1.550688157061977, + "learning_rate": 4.738335728650618e-05, + "loss": 3.6562, + "step": 13780 + }, + { + "epoch": 1.4175575657894737, + "grad_norm": 1.5867913864842413, + "learning_rate": 4.734435409122577e-05, + "loss": 3.6831, + "step": 13790 + }, + { + "epoch": 1.4185855263157894, + "grad_norm": 1.6029493328613675, + "learning_rate": 4.730534556576465e-05, + "loss": 3.6604, + "step": 13800 + }, + { + "epoch": 1.4196134868421053, + "grad_norm": 1.5713404237497934, + "learning_rate": 4.726633175304342e-05, + "loss": 3.7079, + "step": 13810 + }, + { + "epoch": 1.420641447368421, + "grad_norm": 0.9369681757221702, + "learning_rate": 4.722731269598849e-05, + "loss": 3.6876, + "step": 13820 + }, + { + "epoch": 1.4216694078947367, + "grad_norm": 0.9023082115143828, + "learning_rate": 4.7188288437532007e-05, + "loss": 3.6717, + "step": 13830 + }, + { + "epoch": 1.4226973684210527, + "grad_norm": 0.8017223875371787, + "learning_rate": 4.714925902061186e-05, + "loss": 3.6166, + "step": 13840 + }, + { + "epoch": 1.4237253289473684, + "grad_norm": 0.9549952434001922, + "learning_rate": 4.711022448817164e-05, + "loss": 3.7081, + "step": 13850 + }, + { + "epoch": 1.4247532894736843, + "grad_norm": 0.9756809680923627, + "learning_rate": 4.707118488316051e-05, + "loss": 3.6494, + "step": 13860 + }, + { + "epoch": 1.42578125, + "grad_norm": 1.2572902756340636, + "learning_rate": 4.7032140248533266e-05, + "loss": 3.7006, + "step": 13870 + }, + { + "epoch": 1.4268092105263157, + "grad_norm": 0.7331121828303553, + "learning_rate": 4.6993090627250234e-05, + "loss": 3.5794, + "step": 13880 + }, + { + "epoch": 1.4278371710526316, + "grad_norm": 1.1837323803179483, + "learning_rate": 4.695403606227717e-05, + "loss": 3.6859, + "step": 13890 + }, + { + "epoch": 1.4288651315789473, + "grad_norm": 1.1669171041810962, + "learning_rate": 4.691497659658535e-05, + "loss": 3.5993, + "step": 13900 + }, + { + "epoch": 1.4298930921052633, + "grad_norm": 0.8090325238512033, + "learning_rate": 4.6875912273151384e-05, + "loss": 3.7013, + "step": 13910 + }, + { + "epoch": 1.430921052631579, + "grad_norm": 1.2100552698390965, + "learning_rate": 4.6836843134957254e-05, + "loss": 3.6324, + "step": 13920 + }, + { + "epoch": 1.4319490131578947, + "grad_norm": 1.365467842888348, + "learning_rate": 4.679776922499023e-05, + "loss": 3.6462, + "step": 13930 + }, + { + "epoch": 1.4329769736842106, + "grad_norm": 1.1844843670516698, + "learning_rate": 4.675869058624282e-05, + "loss": 3.6907, + "step": 13940 + }, + { + "epoch": 1.4340049342105263, + "grad_norm": 0.9362882730583413, + "learning_rate": 4.671960726171278e-05, + "loss": 3.6443, + "step": 13950 + }, + { + "epoch": 1.435032894736842, + "grad_norm": 1.1189681581018402, + "learning_rate": 4.668051929440296e-05, + "loss": 3.6359, + "step": 13960 + }, + { + "epoch": 1.436060855263158, + "grad_norm": 0.8871205297698515, + "learning_rate": 4.664142672732137e-05, + "loss": 3.7147, + "step": 13970 + }, + { + "epoch": 1.4370888157894737, + "grad_norm": 1.0486457993002865, + "learning_rate": 4.660232960348105e-05, + "loss": 3.6516, + "step": 13980 + }, + { + "epoch": 
1.4381167763157894, + "grad_norm": 1.0691857637487296, + "learning_rate": 4.6563227965900065e-05, + "loss": 3.6355, + "step": 13990 + }, + { + "epoch": 1.4391447368421053, + "grad_norm": 1.5302563771633966, + "learning_rate": 4.652412185760144e-05, + "loss": 3.6717, + "step": 14000 + }, + { + "epoch": 1.440172697368421, + "grad_norm": 0.7590518138097169, + "learning_rate": 4.648501132161314e-05, + "loss": 3.7097, + "step": 14010 + }, + { + "epoch": 1.4412006578947367, + "grad_norm": 1.1904480129267818, + "learning_rate": 4.644589640096797e-05, + "loss": 3.698, + "step": 14020 + }, + { + "epoch": 1.4422286184210527, + "grad_norm": 1.1496187739193808, + "learning_rate": 4.64067771387036e-05, + "loss": 3.6696, + "step": 14030 + }, + { + "epoch": 1.4432565789473684, + "grad_norm": 1.8629256894888468, + "learning_rate": 4.6367653577862415e-05, + "loss": 3.6555, + "step": 14040 + }, + { + "epoch": 1.4442845394736843, + "grad_norm": 0.9819047298580492, + "learning_rate": 4.6328525761491595e-05, + "loss": 3.6674, + "step": 14050 + }, + { + "epoch": 1.4453125, + "grad_norm": 1.1065827812053448, + "learning_rate": 4.628939373264298e-05, + "loss": 3.63, + "step": 14060 + }, + { + "epoch": 1.4463404605263157, + "grad_norm": 1.0745902234128109, + "learning_rate": 4.625025753437301e-05, + "loss": 3.6725, + "step": 14070 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 1.0831087442048797, + "learning_rate": 4.621111720974276e-05, + "loss": 3.6934, + "step": 14080 + }, + { + "epoch": 1.4483963815789473, + "grad_norm": 1.198605507982069, + "learning_rate": 4.617197280181784e-05, + "loss": 3.673, + "step": 14090 + }, + { + "epoch": 1.4494243421052633, + "grad_norm": 1.1256995145628772, + "learning_rate": 4.6132824353668326e-05, + "loss": 3.6856, + "step": 14100 + }, + { + "epoch": 1.450452302631579, + "grad_norm": 1.1561286965739095, + "learning_rate": 4.6093671908368754e-05, + "loss": 3.7206, + "step": 14110 + }, + { + "epoch": 1.4514802631578947, + "grad_norm": 0.8831950683766181, + "learning_rate": 4.6054515508998055e-05, + "loss": 3.7283, + "step": 14120 + }, + { + "epoch": 1.4525082236842106, + "grad_norm": 0.9732229274901035, + "learning_rate": 4.601535519863951e-05, + "loss": 3.6503, + "step": 14130 + }, + { + "epoch": 1.4535361842105263, + "grad_norm": 1.1436286987018054, + "learning_rate": 4.597619102038073e-05, + "loss": 3.688, + "step": 14140 + }, + { + "epoch": 1.454564144736842, + "grad_norm": 0.7741003952637405, + "learning_rate": 4.593702301731353e-05, + "loss": 3.7194, + "step": 14150 + }, + { + "epoch": 1.455592105263158, + "grad_norm": 0.8738990396808866, + "learning_rate": 4.589785123253397e-05, + "loss": 3.6972, + "step": 14160 + }, + { + "epoch": 1.4566200657894737, + "grad_norm": 0.7202574636489063, + "learning_rate": 4.5858675709142265e-05, + "loss": 3.6187, + "step": 14170 + }, + { + "epoch": 1.4576480263157894, + "grad_norm": 0.7573330335761986, + "learning_rate": 4.581949649024275e-05, + "loss": 3.6105, + "step": 14180 + }, + { + "epoch": 1.4586759868421053, + "grad_norm": 1.0018357474002322, + "learning_rate": 4.57803136189438e-05, + "loss": 3.6616, + "step": 14190 + }, + { + "epoch": 1.459703947368421, + "grad_norm": 0.8894482585252829, + "learning_rate": 4.574112713835783e-05, + "loss": 3.7012, + "step": 14200 + }, + { + "epoch": 1.4607319078947367, + "grad_norm": 1.1125913925349498, + "learning_rate": 4.5701937091601245e-05, + "loss": 3.6369, + "step": 14210 + }, + { + "epoch": 1.4617598684210527, + "grad_norm": 1.0304285881770952, + "learning_rate": 
4.5662743521794315e-05, + "loss": 3.6651, + "step": 14220 + }, + { + "epoch": 1.4627878289473684, + "grad_norm": 1.1847594912829602, + "learning_rate": 4.5623546472061254e-05, + "loss": 3.5503, + "step": 14230 + }, + { + "epoch": 1.4638157894736843, + "grad_norm": 1.1311558152081056, + "learning_rate": 4.558434598553005e-05, + "loss": 3.7945, + "step": 14240 + }, + { + "epoch": 1.46484375, + "grad_norm": 1.0495016254959495, + "learning_rate": 4.554514210533251e-05, + "loss": 3.6078, + "step": 14250 + }, + { + "epoch": 1.4658717105263157, + "grad_norm": 1.3894460588371507, + "learning_rate": 4.5505934874604155e-05, + "loss": 3.6466, + "step": 14260 + }, + { + "epoch": 1.4668996710526316, + "grad_norm": 0.9277190518368601, + "learning_rate": 4.546672433648419e-05, + "loss": 3.6196, + "step": 14270 + }, + { + "epoch": 1.4679276315789473, + "grad_norm": 0.9515772274988558, + "learning_rate": 4.542751053411548e-05, + "loss": 3.6597, + "step": 14280 + }, + { + "epoch": 1.4689555921052633, + "grad_norm": 1.105200421553375, + "learning_rate": 4.538829351064448e-05, + "loss": 3.6946, + "step": 14290 + }, + { + "epoch": 1.469983552631579, + "grad_norm": 1.2816650834753793, + "learning_rate": 4.534907330922114e-05, + "loss": 3.5164, + "step": 14300 + }, + { + "epoch": 1.4710115131578947, + "grad_norm": 1.5747443521497975, + "learning_rate": 4.530984997299898e-05, + "loss": 3.6811, + "step": 14310 + }, + { + "epoch": 1.4720394736842106, + "grad_norm": 0.7753829763501262, + "learning_rate": 4.5270623545134924e-05, + "loss": 3.6284, + "step": 14320 + }, + { + "epoch": 1.4730674342105263, + "grad_norm": 1.119570451517297, + "learning_rate": 4.5231394068789306e-05, + "loss": 3.6274, + "step": 14330 + }, + { + "epoch": 1.474095394736842, + "grad_norm": 1.2991978169381775, + "learning_rate": 4.5192161587125804e-05, + "loss": 3.6473, + "step": 14340 + }, + { + "epoch": 1.475123355263158, + "grad_norm": 0.9987411622287811, + "learning_rate": 4.515292614331144e-05, + "loss": 3.6881, + "step": 14350 + }, + { + "epoch": 1.4761513157894737, + "grad_norm": 1.4219910291343179, + "learning_rate": 4.511368778051645e-05, + "loss": 3.672, + "step": 14360 + }, + { + "epoch": 1.4771792763157894, + "grad_norm": 0.9625121450277221, + "learning_rate": 4.50744465419143e-05, + "loss": 3.7273, + "step": 14370 + }, + { + "epoch": 1.4782072368421053, + "grad_norm": 1.1139714828484657, + "learning_rate": 4.503520247068164e-05, + "loss": 3.6355, + "step": 14380 + }, + { + "epoch": 1.479235197368421, + "grad_norm": 1.098251398635762, + "learning_rate": 4.499595560999821e-05, + "loss": 3.6467, + "step": 14390 + }, + { + "epoch": 1.4802631578947367, + "grad_norm": 1.193449625701315, + "learning_rate": 4.4956706003046804e-05, + "loss": 3.7547, + "step": 14400 + }, + { + "epoch": 1.4812911184210527, + "grad_norm": 0.8728896987378874, + "learning_rate": 4.4917453693013304e-05, + "loss": 3.7286, + "step": 14410 + }, + { + "epoch": 1.4823190789473684, + "grad_norm": 0.9542421454070635, + "learning_rate": 4.487819872308649e-05, + "loss": 3.6951, + "step": 14420 + }, + { + "epoch": 1.4833470394736843, + "grad_norm": 0.8124322496190681, + "learning_rate": 4.483894113645811e-05, + "loss": 3.7326, + "step": 14430 + }, + { + "epoch": 1.484375, + "grad_norm": 1.3007802877336672, + "learning_rate": 4.47996809763228e-05, + "loss": 3.5975, + "step": 14440 + }, + { + "epoch": 1.4854029605263157, + "grad_norm": 0.8804993834831857, + "learning_rate": 4.4760418285877985e-05, + "loss": 3.6294, + "step": 14450 + }, + { + "epoch": 1.4864309210526316, + 
"grad_norm": 1.0958607062243788, + "learning_rate": 4.472115310832392e-05, + "loss": 3.6157, + "step": 14460 + }, + { + "epoch": 1.4874588815789473, + "grad_norm": 0.9222906113325489, + "learning_rate": 4.468188548686357e-05, + "loss": 3.7072, + "step": 14470 + }, + { + "epoch": 1.4884868421052633, + "grad_norm": 0.9686955775079147, + "learning_rate": 4.464261546470259e-05, + "loss": 3.6831, + "step": 14480 + }, + { + "epoch": 1.489514802631579, + "grad_norm": 0.971428705513969, + "learning_rate": 4.460334308504928e-05, + "loss": 3.707, + "step": 14490 + }, + { + "epoch": 1.4905427631578947, + "grad_norm": 1.1881869989118883, + "learning_rate": 4.4564068391114544e-05, + "loss": 3.6579, + "step": 14500 + }, + { + "epoch": 1.4915707236842106, + "grad_norm": 0.9815783358331538, + "learning_rate": 4.4524791426111836e-05, + "loss": 3.7193, + "step": 14510 + }, + { + "epoch": 1.4925986842105263, + "grad_norm": 0.9514702675504375, + "learning_rate": 4.4485512233257084e-05, + "loss": 3.6358, + "step": 14520 + }, + { + "epoch": 1.493626644736842, + "grad_norm": 0.7691954833887478, + "learning_rate": 4.4446230855768666e-05, + "loss": 3.6176, + "step": 14530 + }, + { + "epoch": 1.494654605263158, + "grad_norm": 1.0581543633430746, + "learning_rate": 4.440694733686741e-05, + "loss": 3.7137, + "step": 14540 + }, + { + "epoch": 1.4956825657894737, + "grad_norm": 2.141746283014892, + "learning_rate": 4.436766171977643e-05, + "loss": 3.6928, + "step": 14550 + }, + { + "epoch": 1.4967105263157894, + "grad_norm": 1.1167072250819787, + "learning_rate": 4.432837404772121e-05, + "loss": 3.6402, + "step": 14560 + }, + { + "epoch": 1.4977384868421053, + "grad_norm": 1.6623297611353982, + "learning_rate": 4.4289084363929466e-05, + "loss": 3.6157, + "step": 14570 + }, + { + "epoch": 1.498766447368421, + "grad_norm": 1.2495066622907165, + "learning_rate": 4.4249792711631135e-05, + "loss": 3.7535, + "step": 14580 + }, + { + "epoch": 1.4997944078947367, + "grad_norm": 1.0144779476967032, + "learning_rate": 4.421049913405833e-05, + "loss": 3.7554, + "step": 14590 + }, + { + "epoch": 1.5008223684210527, + "grad_norm": 1.0446544269298348, + "learning_rate": 4.417120367444526e-05, + "loss": 3.6144, + "step": 14600 + }, + { + "epoch": 1.5018503289473686, + "grad_norm": 0.9429032069880837, + "learning_rate": 4.413190637602819e-05, + "loss": 3.6416, + "step": 14610 + }, + { + "epoch": 1.502878289473684, + "grad_norm": 0.8964055843261034, + "learning_rate": 4.409260728204549e-05, + "loss": 3.6715, + "step": 14620 + }, + { + "epoch": 1.50390625, + "grad_norm": 0.8851500427621024, + "learning_rate": 4.4053306435737394e-05, + "loss": 3.6692, + "step": 14630 + }, + { + "epoch": 1.504934210526316, + "grad_norm": 1.1888640249381657, + "learning_rate": 4.401400388034617e-05, + "loss": 3.6925, + "step": 14640 + }, + { + "epoch": 1.5059621710526314, + "grad_norm": 1.4021829922057183, + "learning_rate": 4.3974699659115864e-05, + "loss": 3.6522, + "step": 14650 + }, + { + "epoch": 1.5069901315789473, + "grad_norm": 0.9472136807701399, + "learning_rate": 4.393539381529243e-05, + "loss": 3.7609, + "step": 14660 + }, + { + "epoch": 1.5080180921052633, + "grad_norm": 1.2177449508308575, + "learning_rate": 4.3896086392123606e-05, + "loss": 3.7189, + "step": 14670 + }, + { + "epoch": 1.509046052631579, + "grad_norm": 0.749722151152262, + "learning_rate": 4.385677743285881e-05, + "loss": 3.6331, + "step": 14680 + }, + { + "epoch": 1.5100740131578947, + "grad_norm": 1.5279822222389472, + "learning_rate": 4.3817466980749206e-05, + "loss": 
3.6372, + "step": 14690 + }, + { + "epoch": 1.5111019736842106, + "grad_norm": 1.3610228712106778, + "learning_rate": 4.377815507904758e-05, + "loss": 3.6367, + "step": 14700 + }, + { + "epoch": 1.5121299342105263, + "grad_norm": 0.9410824357286373, + "learning_rate": 4.373884177100829e-05, + "loss": 3.6479, + "step": 14710 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 1.224611419237977, + "learning_rate": 4.36995270998873e-05, + "loss": 3.622, + "step": 14720 + }, + { + "epoch": 1.514185855263158, + "grad_norm": 0.8571156365466034, + "learning_rate": 4.3660211108942006e-05, + "loss": 3.714, + "step": 14730 + }, + { + "epoch": 1.5152138157894737, + "grad_norm": 1.443841381401228, + "learning_rate": 4.362089384143133e-05, + "loss": 3.693, + "step": 14740 + }, + { + "epoch": 1.5162417763157894, + "grad_norm": 1.2262556822136712, + "learning_rate": 4.358157534061551e-05, + "loss": 3.663, + "step": 14750 + }, + { + "epoch": 1.5172697368421053, + "grad_norm": 1.2967253449278389, + "learning_rate": 4.354225564975622e-05, + "loss": 3.5786, + "step": 14760 + }, + { + "epoch": 1.518297697368421, + "grad_norm": 0.7892493225109248, + "learning_rate": 4.3502934812116406e-05, + "loss": 3.6866, + "step": 14770 + }, + { + "epoch": 1.5193256578947367, + "grad_norm": 0.755161880292214, + "learning_rate": 4.346361287096026e-05, + "loss": 3.594, + "step": 14780 + }, + { + "epoch": 1.5203536184210527, + "grad_norm": 0.7342081895041577, + "learning_rate": 4.342428986955325e-05, + "loss": 3.6836, + "step": 14790 + }, + { + "epoch": 1.5213815789473686, + "grad_norm": 1.077210577575646, + "learning_rate": 4.3384965851161945e-05, + "loss": 3.6441, + "step": 14800 + }, + { + "epoch": 1.522409539473684, + "grad_norm": 1.531798128388344, + "learning_rate": 4.334564085905404e-05, + "loss": 3.6403, + "step": 14810 + }, + { + "epoch": 1.5234375, + "grad_norm": 1.292909649653586, + "learning_rate": 4.3306314936498364e-05, + "loss": 3.6502, + "step": 14820 + }, + { + "epoch": 1.524465460526316, + "grad_norm": 0.9468013923663791, + "learning_rate": 4.326698812676469e-05, + "loss": 3.7168, + "step": 14830 + }, + { + "epoch": 1.5254934210526314, + "grad_norm": 0.7786069411619261, + "learning_rate": 4.3227660473123815e-05, + "loss": 3.6643, + "step": 14840 + }, + { + "epoch": 1.5265213815789473, + "grad_norm": 0.9104119461484226, + "learning_rate": 4.318833201884745e-05, + "loss": 3.6273, + "step": 14850 + }, + { + "epoch": 1.5275493421052633, + "grad_norm": 1.202843323468981, + "learning_rate": 4.314900280720819e-05, + "loss": 3.641, + "step": 14860 + }, + { + "epoch": 1.528577302631579, + "grad_norm": 0.9574898380108815, + "learning_rate": 4.310967288147947e-05, + "loss": 3.6696, + "step": 14870 + }, + { + "epoch": 1.5296052631578947, + "grad_norm": 1.1943987177196143, + "learning_rate": 4.3070342284935495e-05, + "loss": 3.6871, + "step": 14880 + }, + { + "epoch": 1.5306332236842106, + "grad_norm": 0.9181296025025141, + "learning_rate": 4.3031011060851205e-05, + "loss": 3.6858, + "step": 14890 + }, + { + "epoch": 1.5316611842105263, + "grad_norm": 0.8377173184514839, + "learning_rate": 4.2991679252502256e-05, + "loss": 3.6727, + "step": 14900 + }, + { + "epoch": 1.532689144736842, + "grad_norm": 0.7029759990914445, + "learning_rate": 4.295234690316493e-05, + "loss": 3.6822, + "step": 14910 + }, + { + "epoch": 1.533717105263158, + "grad_norm": 0.8032948933963687, + "learning_rate": 4.2913014056116115e-05, + "loss": 3.6389, + "step": 14920 + }, + { + "epoch": 1.5347450657894737, + "grad_norm": 1.2610293651522577, + 
"learning_rate": 4.287368075463322e-05, + "loss": 3.6592, + "step": 14930 + }, + { + "epoch": 1.5357730263157894, + "grad_norm": 0.8931236019350486, + "learning_rate": 4.283434704199419e-05, + "loss": 3.6201, + "step": 14940 + }, + { + "epoch": 1.5368009868421053, + "grad_norm": 0.9486370423513055, + "learning_rate": 4.279501296147739e-05, + "loss": 3.7044, + "step": 14950 + }, + { + "epoch": 1.537828947368421, + "grad_norm": 1.027705081883258, + "learning_rate": 4.275567855636162e-05, + "loss": 3.5621, + "step": 14960 + }, + { + "epoch": 1.5388569078947367, + "grad_norm": 1.7215103178620845, + "learning_rate": 4.2716343869926005e-05, + "loss": 3.7187, + "step": 14970 + }, + { + "epoch": 1.5398848684210527, + "grad_norm": 1.048585945456184, + "learning_rate": 4.267700894545e-05, + "loss": 3.6699, + "step": 14980 + }, + { + "epoch": 1.5409128289473686, + "grad_norm": 1.267681852308088, + "learning_rate": 4.263767382621334e-05, + "loss": 3.6764, + "step": 14990 + }, + { + "epoch": 1.541940789473684, + "grad_norm": 1.0696690985355886, + "learning_rate": 4.259833855549592e-05, + "loss": 3.7305, + "step": 15000 + }, + { + "epoch": 1.54296875, + "grad_norm": 1.2075892272900854, + "learning_rate": 4.2559003176577853e-05, + "loss": 3.6525, + "step": 15010 + }, + { + "epoch": 1.543996710526316, + "grad_norm": 0.9895309869406582, + "learning_rate": 4.251966773273934e-05, + "loss": 3.6501, + "step": 15020 + }, + { + "epoch": 1.5450246710526314, + "grad_norm": 1.2176992826666595, + "learning_rate": 4.248033226726068e-05, + "loss": 3.6661, + "step": 15030 + }, + { + "epoch": 1.5460526315789473, + "grad_norm": 1.0648196647982018, + "learning_rate": 4.244099682342217e-05, + "loss": 3.6775, + "step": 15040 + }, + { + "epoch": 1.5470805921052633, + "grad_norm": 0.8705651086169357, + "learning_rate": 4.24016614445041e-05, + "loss": 3.6122, + "step": 15050 + }, + { + "epoch": 1.548108552631579, + "grad_norm": 0.9371593492228931, + "learning_rate": 4.236232617378668e-05, + "loss": 3.6622, + "step": 15060 + }, + { + "epoch": 1.5491365131578947, + "grad_norm": 1.2884105536515689, + "learning_rate": 4.232299105455e-05, + "loss": 3.6778, + "step": 15070 + }, + { + "epoch": 1.5501644736842106, + "grad_norm": 1.531795715870129, + "learning_rate": 4.228365613007402e-05, + "loss": 3.673, + "step": 15080 + }, + { + "epoch": 1.5511924342105263, + "grad_norm": 1.0147396630406558, + "learning_rate": 4.2244321443638394e-05, + "loss": 3.6772, + "step": 15090 + }, + { + "epoch": 1.552220394736842, + "grad_norm": 1.1363137693734295, + "learning_rate": 4.2204987038522616e-05, + "loss": 3.6129, + "step": 15100 + }, + { + "epoch": 1.553248355263158, + "grad_norm": 1.0101322416091518, + "learning_rate": 4.216565295800582e-05, + "loss": 3.6896, + "step": 15110 + }, + { + "epoch": 1.5542763157894737, + "grad_norm": 1.1076824547714568, + "learning_rate": 4.2126319245366785e-05, + "loss": 3.6827, + "step": 15120 + }, + { + "epoch": 1.5553042763157894, + "grad_norm": 1.8529844336887775, + "learning_rate": 4.20869859438839e-05, + "loss": 3.6941, + "step": 15130 + }, + { + "epoch": 1.5563322368421053, + "grad_norm": 0.9073442889843091, + "learning_rate": 4.2047653096835075e-05, + "loss": 3.6085, + "step": 15140 + }, + { + "epoch": 1.557360197368421, + "grad_norm": 0.9617734544877308, + "learning_rate": 4.200832074749776e-05, + "loss": 3.6624, + "step": 15150 + }, + { + "epoch": 1.5583881578947367, + "grad_norm": 1.278643445584411, + "learning_rate": 4.196898893914881e-05, + "loss": 3.5383, + "step": 15160 + }, + { + "epoch": 
1.5594161184210527, + "grad_norm": 1.1423075559379379, + "learning_rate": 4.192965771506452e-05, + "loss": 3.6869, + "step": 15170 + }, + { + "epoch": 1.5604440789473686, + "grad_norm": 0.9160455239302735, + "learning_rate": 4.1890327118520545e-05, + "loss": 3.6289, + "step": 15180 + }, + { + "epoch": 1.561472039473684, + "grad_norm": 1.1684069879817196, + "learning_rate": 4.185099719279182e-05, + "loss": 3.6205, + "step": 15190 + }, + { + "epoch": 1.5625, + "grad_norm": 1.0843157287581247, + "learning_rate": 4.181166798115257e-05, + "loss": 3.5908, + "step": 15200 + }, + { + "epoch": 1.563527960526316, + "grad_norm": 1.1731022321537221, + "learning_rate": 4.177233952687621e-05, + "loss": 3.7985, + "step": 15210 + }, + { + "epoch": 1.5645559210526314, + "grad_norm": 1.0670450833242693, + "learning_rate": 4.1733011873235314e-05, + "loss": 3.6031, + "step": 15220 + }, + { + "epoch": 1.5655838815789473, + "grad_norm": 0.8311701872520117, + "learning_rate": 4.169368506350164e-05, + "loss": 3.5949, + "step": 15230 + }, + { + "epoch": 1.5666118421052633, + "grad_norm": 0.8741077915168648, + "learning_rate": 4.1654359140945964e-05, + "loss": 3.6731, + "step": 15240 + }, + { + "epoch": 1.567639802631579, + "grad_norm": 0.8813406904373271, + "learning_rate": 4.1615034148838075e-05, + "loss": 3.7964, + "step": 15250 + }, + { + "epoch": 1.5686677631578947, + "grad_norm": 1.0941544466190938, + "learning_rate": 4.1575710130446757e-05, + "loss": 3.6851, + "step": 15260 + }, + { + "epoch": 1.5696957236842106, + "grad_norm": 1.4096549456014296, + "learning_rate": 4.1536387129039746e-05, + "loss": 3.6467, + "step": 15270 + }, + { + "epoch": 1.5707236842105263, + "grad_norm": 0.7791086222877919, + "learning_rate": 4.14970651878836e-05, + "loss": 3.5753, + "step": 15280 + }, + { + "epoch": 1.571751644736842, + "grad_norm": 1.7592822224901128, + "learning_rate": 4.145774435024378e-05, + "loss": 3.6739, + "step": 15290 + }, + { + "epoch": 1.572779605263158, + "grad_norm": 1.5228512932157163, + "learning_rate": 4.1418424659384496e-05, + "loss": 3.6809, + "step": 15300 + }, + { + "epoch": 1.5738075657894737, + "grad_norm": 1.0433699732634576, + "learning_rate": 4.137910615856869e-05, + "loss": 3.6689, + "step": 15310 + }, + { + "epoch": 1.5748355263157894, + "grad_norm": 0.7517586989284809, + "learning_rate": 4.1339788891058e-05, + "loss": 3.7195, + "step": 15320 + }, + { + "epoch": 1.5758634868421053, + "grad_norm": 0.8971301310715274, + "learning_rate": 4.1300472900112706e-05, + "loss": 3.6106, + "step": 15330 + }, + { + "epoch": 1.576891447368421, + "grad_norm": 0.9131559457306825, + "learning_rate": 4.126115822899172e-05, + "loss": 3.6145, + "step": 15340 + }, + { + "epoch": 1.5779194078947367, + "grad_norm": 0.7494874235806904, + "learning_rate": 4.122184492095245e-05, + "loss": 3.6983, + "step": 15350 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.914177716158563, + "learning_rate": 4.118253301925081e-05, + "loss": 3.6672, + "step": 15360 + }, + { + "epoch": 1.5799753289473686, + "grad_norm": 0.7922560773130678, + "learning_rate": 4.11432225671412e-05, + "loss": 3.6018, + "step": 15370 + }, + { + "epoch": 1.581003289473684, + "grad_norm": 1.0140204445886494, + "learning_rate": 4.11039136078764e-05, + "loss": 3.6709, + "step": 15380 + }, + { + "epoch": 1.58203125, + "grad_norm": 0.9323334107385884, + "learning_rate": 4.106460618470757e-05, + "loss": 3.6106, + "step": 15390 + }, + { + "epoch": 1.583059210526316, + "grad_norm": 0.8963398203938377, + "learning_rate": 4.1025300340884156e-05, + "loss": 
3.7064, + "step": 15400 + }, + { + "epoch": 1.5840871710526314, + "grad_norm": 1.168156433335677, + "learning_rate": 4.098599611965385e-05, + "loss": 3.6004, + "step": 15410 + }, + { + "epoch": 1.5851151315789473, + "grad_norm": 1.1197688019066752, + "learning_rate": 4.094669356426261e-05, + "loss": 3.6279, + "step": 15420 + }, + { + "epoch": 1.5861430921052633, + "grad_norm": 0.8383294225896041, + "learning_rate": 4.090739271795452e-05, + "loss": 3.6532, + "step": 15430 + }, + { + "epoch": 1.587171052631579, + "grad_norm": 1.1059597259556024, + "learning_rate": 4.0868093623971805e-05, + "loss": 3.6066, + "step": 15440 + }, + { + "epoch": 1.5881990131578947, + "grad_norm": 1.232278516263384, + "learning_rate": 4.082879632555475e-05, + "loss": 3.6505, + "step": 15450 + }, + { + "epoch": 1.5892269736842106, + "grad_norm": 1.7124125075439744, + "learning_rate": 4.0789500865941684e-05, + "loss": 3.6919, + "step": 15460 + }, + { + "epoch": 1.5902549342105263, + "grad_norm": 1.2355169977078668, + "learning_rate": 4.075020728836888e-05, + "loss": 3.7116, + "step": 15470 + }, + { + "epoch": 1.591282894736842, + "grad_norm": 0.8140648664268028, + "learning_rate": 4.071091563607054e-05, + "loss": 3.6544, + "step": 15480 + }, + { + "epoch": 1.592310855263158, + "grad_norm": 0.7400103307982331, + "learning_rate": 4.06716259522788e-05, + "loss": 3.6894, + "step": 15490 + }, + { + "epoch": 1.5933388157894737, + "grad_norm": 1.1475681985273474, + "learning_rate": 4.063233828022359e-05, + "loss": 3.6275, + "step": 15500 + }, + { + "epoch": 1.5943667763157894, + "grad_norm": 1.0630377923091456, + "learning_rate": 4.059305266313262e-05, + "loss": 3.6277, + "step": 15510 + }, + { + "epoch": 1.5953947368421053, + "grad_norm": 0.856930359402283, + "learning_rate": 4.0553769144231354e-05, + "loss": 3.6713, + "step": 15520 + }, + { + "epoch": 1.596422697368421, + "grad_norm": 2.1123621382111537, + "learning_rate": 4.051448776674293e-05, + "loss": 3.6101, + "step": 15530 + }, + { + "epoch": 1.5974506578947367, + "grad_norm": 1.367737926309239, + "learning_rate": 4.047520857388817e-05, + "loss": 3.6376, + "step": 15540 + }, + { + "epoch": 1.5984786184210527, + "grad_norm": 0.7686901627489895, + "learning_rate": 4.043593160888545e-05, + "loss": 3.5913, + "step": 15550 + }, + { + "epoch": 1.5995065789473686, + "grad_norm": 1.0724257052637944, + "learning_rate": 4.039665691495073e-05, + "loss": 3.692, + "step": 15560 + }, + { + "epoch": 1.600534539473684, + "grad_norm": 1.2421311659604704, + "learning_rate": 4.035738453529742e-05, + "loss": 3.6171, + "step": 15570 + }, + { + "epoch": 1.6015625, + "grad_norm": 1.0061069813376822, + "learning_rate": 4.0318114513136437e-05, + "loss": 3.584, + "step": 15580 + }, + { + "epoch": 1.602590460526316, + "grad_norm": 1.3454478870280981, + "learning_rate": 4.027884689167609e-05, + "loss": 3.6969, + "step": 15590 + }, + { + "epoch": 1.6036184210526314, + "grad_norm": 0.8382356017981202, + "learning_rate": 4.023958171412203e-05, + "loss": 3.5693, + "step": 15600 + }, + { + "epoch": 1.6046463815789473, + "grad_norm": 0.7668306627957359, + "learning_rate": 4.020031902367721e-05, + "loss": 3.6345, + "step": 15610 + }, + { + "epoch": 1.6056743421052633, + "grad_norm": 1.2962777014690434, + "learning_rate": 4.01610588635419e-05, + "loss": 3.6392, + "step": 15620 + }, + { + "epoch": 1.606702302631579, + "grad_norm": 1.0332981347170118, + "learning_rate": 4.012180127691353e-05, + "loss": 3.5315, + "step": 15630 + }, + { + "epoch": 1.6077302631578947, + "grad_norm": 0.9518757121433492, + 
"learning_rate": 4.0082546306986716e-05, + "loss": 3.6326, + "step": 15640 + }, + { + "epoch": 1.6087582236842106, + "grad_norm": 1.0811708520575303, + "learning_rate": 4.004329399695321e-05, + "loss": 3.6388, + "step": 15650 + }, + { + "epoch": 1.6097861842105263, + "grad_norm": 1.2131825462815198, + "learning_rate": 4.0004044390001805e-05, + "loss": 3.6773, + "step": 15660 + }, + { + "epoch": 1.610814144736842, + "grad_norm": 0.8306134820029021, + "learning_rate": 3.996479752931838e-05, + "loss": 3.6548, + "step": 15670 + }, + { + "epoch": 1.611842105263158, + "grad_norm": 0.9256551471568542, + "learning_rate": 3.992555345808571e-05, + "loss": 3.6342, + "step": 15680 + }, + { + "epoch": 1.6128700657894737, + "grad_norm": 1.1132888525987668, + "learning_rate": 3.988631221948357e-05, + "loss": 3.6265, + "step": 15690 + }, + { + "epoch": 1.6138980263157894, + "grad_norm": 1.015623356744464, + "learning_rate": 3.9847073856688573e-05, + "loss": 3.6278, + "step": 15700 + }, + { + "epoch": 1.6149259868421053, + "grad_norm": 0.913477947110898, + "learning_rate": 3.9807838412874195e-05, + "loss": 3.7435, + "step": 15710 + }, + { + "epoch": 1.615953947368421, + "grad_norm": 0.9811272404655219, + "learning_rate": 3.976860593121072e-05, + "loss": 3.6067, + "step": 15720 + }, + { + "epoch": 1.6169819078947367, + "grad_norm": 1.1164847878232533, + "learning_rate": 3.972937645486508e-05, + "loss": 3.6011, + "step": 15730 + }, + { + "epoch": 1.6180098684210527, + "grad_norm": 1.1447362618522765, + "learning_rate": 3.969015002700103e-05, + "loss": 3.5878, + "step": 15740 + }, + { + "epoch": 1.6190378289473686, + "grad_norm": 1.1204828066377026, + "learning_rate": 3.965092669077886e-05, + "loss": 3.6644, + "step": 15750 + }, + { + "epoch": 1.620065789473684, + "grad_norm": 1.6384257517089125, + "learning_rate": 3.961170648935554e-05, + "loss": 3.6744, + "step": 15760 + }, + { + "epoch": 1.62109375, + "grad_norm": 1.1931611690963855, + "learning_rate": 3.957248946588453e-05, + "loss": 3.6361, + "step": 15770 + }, + { + "epoch": 1.622121710526316, + "grad_norm": 0.8640228814256455, + "learning_rate": 3.953327566351582e-05, + "loss": 3.6486, + "step": 15780 + }, + { + "epoch": 1.6231496710526314, + "grad_norm": 0.8519915278689606, + "learning_rate": 3.949406512539586e-05, + "loss": 3.5905, + "step": 15790 + }, + { + "epoch": 1.6241776315789473, + "grad_norm": 1.032389502535588, + "learning_rate": 3.945485789466751e-05, + "loss": 3.6544, + "step": 15800 + }, + { + "epoch": 1.6252055921052633, + "grad_norm": 0.9947610054816614, + "learning_rate": 3.9415654014469956e-05, + "loss": 3.6206, + "step": 15810 + }, + { + "epoch": 1.626233552631579, + "grad_norm": 0.9628725185587639, + "learning_rate": 3.937645352793875e-05, + "loss": 3.5844, + "step": 15820 + }, + { + "epoch": 1.6272615131578947, + "grad_norm": 1.0652129934561996, + "learning_rate": 3.93372564782057e-05, + "loss": 3.6058, + "step": 15830 + }, + { + "epoch": 1.6282894736842106, + "grad_norm": 1.4024227194480443, + "learning_rate": 3.929806290839877e-05, + "loss": 3.636, + "step": 15840 + }, + { + "epoch": 1.6293174342105263, + "grad_norm": 1.7730295661573898, + "learning_rate": 3.925887286164217e-05, + "loss": 3.6278, + "step": 15850 + }, + { + "epoch": 1.630345394736842, + "grad_norm": 1.415099633015023, + "learning_rate": 3.921968638105621e-05, + "loss": 3.6871, + "step": 15860 + }, + { + "epoch": 1.631373355263158, + "grad_norm": 1.0449461176546735, + "learning_rate": 3.918050350975726e-05, + "loss": 3.592, + "step": 15870 + }, + { + "epoch": 
1.6324013157894737, + "grad_norm": 0.8860541049979074, + "learning_rate": 3.9141324290857754e-05, + "loss": 3.5643, + "step": 15880 + }, + { + "epoch": 1.6334292763157894, + "grad_norm": 0.7101401584258349, + "learning_rate": 3.9102148767466044e-05, + "loss": 3.638, + "step": 15890 + }, + { + "epoch": 1.6344572368421053, + "grad_norm": 1.0162737608286718, + "learning_rate": 3.906297698268649e-05, + "loss": 3.7362, + "step": 15900 + }, + { + "epoch": 1.635485197368421, + "grad_norm": 0.7739200820472332, + "learning_rate": 3.9023808979619285e-05, + "loss": 3.6097, + "step": 15910 + }, + { + "epoch": 1.6365131578947367, + "grad_norm": 1.1243628711056857, + "learning_rate": 3.898464480136049e-05, + "loss": 3.6951, + "step": 15920 + }, + { + "epoch": 1.6375411184210527, + "grad_norm": 1.3845977896709507, + "learning_rate": 3.894548449100197e-05, + "loss": 3.609, + "step": 15930 + }, + { + "epoch": 1.6385690789473686, + "grad_norm": 0.9474067818776408, + "learning_rate": 3.8906328091631266e-05, + "loss": 3.7243, + "step": 15940 + }, + { + "epoch": 1.639597039473684, + "grad_norm": 0.9359981587261446, + "learning_rate": 3.8867175646331687e-05, + "loss": 3.6221, + "step": 15950 + }, + { + "epoch": 1.640625, + "grad_norm": 0.7398121618598976, + "learning_rate": 3.882802719818217e-05, + "loss": 3.6583, + "step": 15960 + }, + { + "epoch": 1.641652960526316, + "grad_norm": 1.0468261479608567, + "learning_rate": 3.878888279025724e-05, + "loss": 3.6131, + "step": 15970 + }, + { + "epoch": 1.6426809210526314, + "grad_norm": 1.1775977718152082, + "learning_rate": 3.8749742465626995e-05, + "loss": 3.6132, + "step": 15980 + }, + { + "epoch": 1.6437088815789473, + "grad_norm": 0.93653460705792, + "learning_rate": 3.871060626735704e-05, + "loss": 3.5558, + "step": 15990 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 1.1574006412449616, + "learning_rate": 3.867147423850842e-05, + "loss": 3.6592, + "step": 16000 + }, + { + "epoch": 1.645764802631579, + "grad_norm": 0.7266875538828019, + "learning_rate": 3.86323464221376e-05, + "loss": 3.6236, + "step": 16010 + }, + { + "epoch": 1.6467927631578947, + "grad_norm": 0.7614350793282285, + "learning_rate": 3.8593222861296415e-05, + "loss": 3.6962, + "step": 16020 + }, + { + "epoch": 1.6478207236842106, + "grad_norm": 0.9900973975792825, + "learning_rate": 3.855410359903203e-05, + "loss": 3.7033, + "step": 16030 + }, + { + "epoch": 1.6488486842105263, + "grad_norm": 0.9569786524429564, + "learning_rate": 3.851498867838688e-05, + "loss": 3.6829, + "step": 16040 + }, + { + "epoch": 1.649876644736842, + "grad_norm": 0.8573569444823289, + "learning_rate": 3.847587814239857e-05, + "loss": 3.659, + "step": 16050 + }, + { + "epoch": 1.650904605263158, + "grad_norm": 1.0128193175154965, + "learning_rate": 3.8436772034099955e-05, + "loss": 3.6745, + "step": 16060 + }, + { + "epoch": 1.6519325657894737, + "grad_norm": 0.9038181891774375, + "learning_rate": 3.839767039651896e-05, + "loss": 3.5973, + "step": 16070 + }, + { + "epoch": 1.6529605263157894, + "grad_norm": 1.0280492992417474, + "learning_rate": 3.8358573272678644e-05, + "loss": 3.6825, + "step": 16080 + }, + { + "epoch": 1.6539884868421053, + "grad_norm": 0.9826465751667849, + "learning_rate": 3.831948070559705e-05, + "loss": 3.6111, + "step": 16090 + }, + { + "epoch": 1.655016447368421, + "grad_norm": 0.9333240965545085, + "learning_rate": 3.828039273828724e-05, + "loss": 3.6669, + "step": 16100 + }, + { + "epoch": 1.6560444078947367, + "grad_norm": 0.9718231861984986, + "learning_rate": 
3.824130941375719e-05, + "loss": 3.6667, + "step": 16110 + }, + { + "epoch": 1.6570723684210527, + "grad_norm": 0.7835417132152357, + "learning_rate": 3.820223077500978e-05, + "loss": 3.7145, + "step": 16120 + }, + { + "epoch": 1.6581003289473686, + "grad_norm": 0.9714285214408093, + "learning_rate": 3.816315686504275e-05, + "loss": 3.721, + "step": 16130 + }, + { + "epoch": 1.659128289473684, + "grad_norm": 1.1929374985514263, + "learning_rate": 3.812408772684862e-05, + "loss": 3.6594, + "step": 16140 + }, + { + "epoch": 1.66015625, + "grad_norm": 0.9569488489916725, + "learning_rate": 3.808502340341466e-05, + "loss": 3.5377, + "step": 16150 + }, + { + "epoch": 1.661184210526316, + "grad_norm": 1.3265585693732487, + "learning_rate": 3.804596393772284e-05, + "loss": 3.6066, + "step": 16160 + }, + { + "epoch": 1.6622121710526314, + "grad_norm": 1.27012136975755, + "learning_rate": 3.8006909372749786e-05, + "loss": 3.6441, + "step": 16170 + }, + { + "epoch": 1.6632401315789473, + "grad_norm": 0.9709071783433202, + "learning_rate": 3.796785975146674e-05, + "loss": 3.6335, + "step": 16180 + }, + { + "epoch": 1.6642680921052633, + "grad_norm": 0.7938718116586911, + "learning_rate": 3.79288151168395e-05, + "loss": 3.6914, + "step": 16190 + }, + { + "epoch": 1.665296052631579, + "grad_norm": 1.0373977863563937, + "learning_rate": 3.788977551182838e-05, + "loss": 3.5589, + "step": 16200 + }, + { + "epoch": 1.6663240131578947, + "grad_norm": 1.3578007231136862, + "learning_rate": 3.785074097938816e-05, + "loss": 3.7087, + "step": 16210 + }, + { + "epoch": 1.6673519736842106, + "grad_norm": 0.9659509830774134, + "learning_rate": 3.7811711562468006e-05, + "loss": 3.6201, + "step": 16220 + }, + { + "epoch": 1.6683799342105263, + "grad_norm": 0.9244728661264267, + "learning_rate": 3.777268730401152e-05, + "loss": 3.5545, + "step": 16230 + }, + { + "epoch": 1.669407894736842, + "grad_norm": 1.0025651932444704, + "learning_rate": 3.773366824695658e-05, + "loss": 3.6198, + "step": 16240 + }, + { + "epoch": 1.670435855263158, + "grad_norm": 0.8218245603219115, + "learning_rate": 3.769465443423537e-05, + "loss": 3.6947, + "step": 16250 + }, + { + "epoch": 1.6714638157894737, + "grad_norm": 0.9658415110299197, + "learning_rate": 3.7655645908774265e-05, + "loss": 3.629, + "step": 16260 + }, + { + "epoch": 1.6724917763157894, + "grad_norm": 1.0490123508842684, + "learning_rate": 3.7616642713493836e-05, + "loss": 3.7005, + "step": 16270 + }, + { + "epoch": 1.6735197368421053, + "grad_norm": 0.8262666327783306, + "learning_rate": 3.757764489130883e-05, + "loss": 3.6259, + "step": 16280 + }, + { + "epoch": 1.674547697368421, + "grad_norm": 0.9908412901802437, + "learning_rate": 3.753865248512804e-05, + "loss": 3.7464, + "step": 16290 + }, + { + "epoch": 1.6755756578947367, + "grad_norm": 0.9453241331787591, + "learning_rate": 3.74996655378543e-05, + "loss": 3.6701, + "step": 16300 + }, + { + "epoch": 1.6766036184210527, + "grad_norm": 0.8975672612183165, + "learning_rate": 3.746068409238447e-05, + "loss": 3.6565, + "step": 16310 + }, + { + "epoch": 1.6776315789473686, + "grad_norm": 0.9830384642098644, + "learning_rate": 3.7421708191609305e-05, + "loss": 3.6631, + "step": 16320 + }, + { + "epoch": 1.678659539473684, + "grad_norm": 0.9794638906329799, + "learning_rate": 3.7382737878413504e-05, + "loss": 3.5572, + "step": 16330 + }, + { + "epoch": 1.6796875, + "grad_norm": 1.354647164805778, + "learning_rate": 3.734377319567561e-05, + "loss": 3.6372, + "step": 16340 + }, + { + "epoch": 1.680715460526316, + 
"grad_norm": 1.1054974339945813, + "learning_rate": 3.730481418626795e-05, + "loss": 3.6695, + "step": 16350 + }, + { + "epoch": 1.6817434210526314, + "grad_norm": 1.0515051408062481, + "learning_rate": 3.726586089305665e-05, + "loss": 3.7198, + "step": 16360 + }, + { + "epoch": 1.6827713815789473, + "grad_norm": 1.0613345317626142, + "learning_rate": 3.722691335890148e-05, + "loss": 3.6744, + "step": 16370 + }, + { + "epoch": 1.6837993421052633, + "grad_norm": 0.8985229783237801, + "learning_rate": 3.7187971626655924e-05, + "loss": 3.6215, + "step": 16380 + }, + { + "epoch": 1.684827302631579, + "grad_norm": 1.2797753291096612, + "learning_rate": 3.714903573916709e-05, + "loss": 3.5486, + "step": 16390 + }, + { + "epoch": 1.6858552631578947, + "grad_norm": 0.9715761062390795, + "learning_rate": 3.711010573927563e-05, + "loss": 3.5673, + "step": 16400 + }, + { + "epoch": 1.6868832236842106, + "grad_norm": 0.7998758174953465, + "learning_rate": 3.7071181669815704e-05, + "loss": 3.5806, + "step": 16410 + }, + { + "epoch": 1.6879111842105263, + "grad_norm": 1.0478544635145943, + "learning_rate": 3.703226357361499e-05, + "loss": 3.6513, + "step": 16420 + }, + { + "epoch": 1.688939144736842, + "grad_norm": 0.939576012074662, + "learning_rate": 3.699335149349455e-05, + "loss": 3.6982, + "step": 16430 + }, + { + "epoch": 1.689967105263158, + "grad_norm": 1.6412451888839736, + "learning_rate": 3.6954445472268854e-05, + "loss": 3.6507, + "step": 16440 + }, + { + "epoch": 1.6909950657894737, + "grad_norm": 1.1945180808639375, + "learning_rate": 3.6915545552745714e-05, + "loss": 3.7085, + "step": 16450 + }, + { + "epoch": 1.6920230263157894, + "grad_norm": 1.1132398631116027, + "learning_rate": 3.687665177772619e-05, + "loss": 3.6378, + "step": 16460 + }, + { + "epoch": 1.6930509868421053, + "grad_norm": 0.7644929886191417, + "learning_rate": 3.683776419000463e-05, + "loss": 3.6527, + "step": 16470 + }, + { + "epoch": 1.694078947368421, + "grad_norm": 1.4262826534347999, + "learning_rate": 3.679888283236851e-05, + "loss": 3.6512, + "step": 16480 + }, + { + "epoch": 1.6951069078947367, + "grad_norm": 0.8983634586508465, + "learning_rate": 3.676000774759852e-05, + "loss": 3.6646, + "step": 16490 + }, + { + "epoch": 1.6961348684210527, + "grad_norm": 1.3681144405844512, + "learning_rate": 3.672113897846841e-05, + "loss": 3.6295, + "step": 16500 + }, + { + "epoch": 1.6971628289473686, + "grad_norm": 0.9103218550091701, + "learning_rate": 3.668227656774498e-05, + "loss": 3.7006, + "step": 16510 + }, + { + "epoch": 1.698190789473684, + "grad_norm": 0.8601499358039845, + "learning_rate": 3.6643420558188075e-05, + "loss": 3.5873, + "step": 16520 + }, + { + "epoch": 1.69921875, + "grad_norm": 0.9254842676178533, + "learning_rate": 3.660457099255041e-05, + "loss": 3.6336, + "step": 16530 + }, + { + "epoch": 1.700246710526316, + "grad_norm": 1.7190116683137873, + "learning_rate": 3.656572791357771e-05, + "loss": 3.7267, + "step": 16540 + }, + { + "epoch": 1.7012746710526314, + "grad_norm": 1.3402299685782406, + "learning_rate": 3.6526891364008485e-05, + "loss": 3.6406, + "step": 16550 + }, + { + "epoch": 1.7023026315789473, + "grad_norm": 1.2939573352183198, + "learning_rate": 3.648806138657411e-05, + "loss": 3.6415, + "step": 16560 + }, + { + "epoch": 1.7033305921052633, + "grad_norm": 1.264911780315608, + "learning_rate": 3.644923802399872e-05, + "loss": 3.6825, + "step": 16570 + }, + { + "epoch": 1.704358552631579, + "grad_norm": 0.8428090464825678, + "learning_rate": 3.641042131899913e-05, + "loss": 3.6802, 
+ "step": 16580 + }, + { + "epoch": 1.7053865131578947, + "grad_norm": 0.7918343533725012, + "learning_rate": 3.63716113142849e-05, + "loss": 3.5661, + "step": 16590 + }, + { + "epoch": 1.7064144736842106, + "grad_norm": 0.6924492894207241, + "learning_rate": 3.633280805255816e-05, + "loss": 3.5436, + "step": 16600 + }, + { + "epoch": 1.7074424342105263, + "grad_norm": 1.5937414730068307, + "learning_rate": 3.6294011576513635e-05, + "loss": 3.6709, + "step": 16610 + }, + { + "epoch": 1.708470394736842, + "grad_norm": 0.9252920669108402, + "learning_rate": 3.6255221928838605e-05, + "loss": 3.6004, + "step": 16620 + }, + { + "epoch": 1.709498355263158, + "grad_norm": 0.7163020616687298, + "learning_rate": 3.621643915221283e-05, + "loss": 3.6145, + "step": 16630 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 1.1544367000363724, + "learning_rate": 3.6177663289308484e-05, + "loss": 3.6649, + "step": 16640 + }, + { + "epoch": 1.7115542763157894, + "grad_norm": 0.897131958148788, + "learning_rate": 3.613889438279016e-05, + "loss": 3.6274, + "step": 16650 + }, + { + "epoch": 1.7125822368421053, + "grad_norm": 0.9103761006329907, + "learning_rate": 3.610013247531478e-05, + "loss": 3.6568, + "step": 16660 + }, + { + "epoch": 1.713610197368421, + "grad_norm": 0.69276800591765, + "learning_rate": 3.606137760953158e-05, + "loss": 3.64, + "step": 16670 + }, + { + "epoch": 1.7146381578947367, + "grad_norm": 1.3468727377321579, + "learning_rate": 3.602262982808205e-05, + "loss": 3.5599, + "step": 16680 + }, + { + "epoch": 1.7156661184210527, + "grad_norm": 0.8769266850960976, + "learning_rate": 3.598388917359986e-05, + "loss": 3.6159, + "step": 16690 + }, + { + "epoch": 1.7166940789473686, + "grad_norm": 1.012704494217658, + "learning_rate": 3.594515568871087e-05, + "loss": 3.6445, + "step": 16700 + }, + { + "epoch": 1.717722039473684, + "grad_norm": 0.7672110886064821, + "learning_rate": 3.590642941603302e-05, + "loss": 3.6311, + "step": 16710 + }, + { + "epoch": 1.71875, + "grad_norm": 0.9264761011198279, + "learning_rate": 3.5867710398176354e-05, + "loss": 3.6477, + "step": 16720 + }, + { + "epoch": 1.719777960526316, + "grad_norm": 1.3304958947773147, + "learning_rate": 3.58289986777429e-05, + "loss": 3.5969, + "step": 16730 + }, + { + "epoch": 1.7208059210526314, + "grad_norm": 0.9801606077817651, + "learning_rate": 3.579029429732667e-05, + "loss": 3.7486, + "step": 16740 + }, + { + "epoch": 1.7218338815789473, + "grad_norm": 1.1592911599992228, + "learning_rate": 3.5751597299513585e-05, + "loss": 3.6561, + "step": 16750 + }, + { + "epoch": 1.7228618421052633, + "grad_norm": 0.8068910445576423, + "learning_rate": 3.571290772688147e-05, + "loss": 3.6141, + "step": 16760 + }, + { + "epoch": 1.723889802631579, + "grad_norm": 0.994970967909173, + "learning_rate": 3.567422562199995e-05, + "loss": 3.6037, + "step": 16770 + }, + { + "epoch": 1.7249177631578947, + "grad_norm": 1.5782565071531356, + "learning_rate": 3.5635551027430465e-05, + "loss": 3.6097, + "step": 16780 + }, + { + "epoch": 1.7259457236842106, + "grad_norm": 1.0333167087591737, + "learning_rate": 3.559688398572618e-05, + "loss": 3.6087, + "step": 16790 + }, + { + "epoch": 1.7269736842105263, + "grad_norm": 0.9150525461544905, + "learning_rate": 3.555822453943193e-05, + "loss": 3.6483, + "step": 16800 + }, + { + "epoch": 1.728001644736842, + "grad_norm": 1.1507602479964432, + "learning_rate": 3.5519572731084214e-05, + "loss": 3.6531, + "step": 16810 + }, + { + "epoch": 1.729029605263158, + "grad_norm": 0.9994036863502945, + 
"learning_rate": 3.5480928603211116e-05, + "loss": 3.5856, + "step": 16820 + }, + { + "epoch": 1.7300575657894737, + "grad_norm": 0.9103512207240126, + "learning_rate": 3.5442292198332286e-05, + "loss": 3.7827, + "step": 16830 + }, + { + "epoch": 1.7310855263157894, + "grad_norm": 1.177034945323293, + "learning_rate": 3.540366355895886e-05, + "loss": 3.6984, + "step": 16840 + }, + { + "epoch": 1.7321134868421053, + "grad_norm": 1.3071338766044136, + "learning_rate": 3.536504272759343e-05, + "loss": 3.6159, + "step": 16850 + }, + { + "epoch": 1.733141447368421, + "grad_norm": 0.789796468977909, + "learning_rate": 3.532642974673e-05, + "loss": 3.6442, + "step": 16860 + }, + { + "epoch": 1.7341694078947367, + "grad_norm": 0.8165981669470558, + "learning_rate": 3.5287824658853936e-05, + "loss": 3.5498, + "step": 16870 + }, + { + "epoch": 1.7351973684210527, + "grad_norm": 0.8632912268428903, + "learning_rate": 3.524922750644193e-05, + "loss": 3.6416, + "step": 16880 + }, + { + "epoch": 1.7362253289473686, + "grad_norm": 0.8148808076794635, + "learning_rate": 3.5210638331961945e-05, + "loss": 3.6037, + "step": 16890 + }, + { + "epoch": 1.737253289473684, + "grad_norm": 1.0630315124384002, + "learning_rate": 3.5172057177873135e-05, + "loss": 3.622, + "step": 16900 + }, + { + "epoch": 1.73828125, + "grad_norm": 1.1618188831552725, + "learning_rate": 3.513348408662586e-05, + "loss": 3.6477, + "step": 16910 + }, + { + "epoch": 1.739309210526316, + "grad_norm": 1.8767988476974904, + "learning_rate": 3.509491910066157e-05, + "loss": 3.5898, + "step": 16920 + }, + { + "epoch": 1.7403371710526314, + "grad_norm": 0.8720548997907493, + "learning_rate": 3.505636226241286e-05, + "loss": 3.6062, + "step": 16930 + }, + { + "epoch": 1.7413651315789473, + "grad_norm": 0.9847591195653888, + "learning_rate": 3.501781361430332e-05, + "loss": 3.6561, + "step": 16940 + }, + { + "epoch": 1.7423930921052633, + "grad_norm": 1.0467558266291, + "learning_rate": 3.4979273198747525e-05, + "loss": 3.6269, + "step": 16950 + }, + { + "epoch": 1.743421052631579, + "grad_norm": 0.9633301820725462, + "learning_rate": 3.4940741058150995e-05, + "loss": 3.5979, + "step": 16960 + }, + { + "epoch": 1.7444490131578947, + "grad_norm": 1.1087687272959936, + "learning_rate": 3.490221723491015e-05, + "loss": 3.6593, + "step": 16970 + }, + { + "epoch": 1.7454769736842106, + "grad_norm": 0.9461616926422551, + "learning_rate": 3.486370177141227e-05, + "loss": 3.6516, + "step": 16980 + }, + { + "epoch": 1.7465049342105263, + "grad_norm": 1.0340312674755074, + "learning_rate": 3.482519471003542e-05, + "loss": 3.6671, + "step": 16990 + }, + { + "epoch": 1.747532894736842, + "grad_norm": 0.8363378403948772, + "learning_rate": 3.478669609314844e-05, + "loss": 3.6006, + "step": 17000 + }, + { + "epoch": 1.748560855263158, + "grad_norm": 1.3815000092033196, + "learning_rate": 3.4748205963110845e-05, + "loss": 3.709, + "step": 17010 + }, + { + "epoch": 1.7495888157894737, + "grad_norm": 0.8158228491738454, + "learning_rate": 3.4709724362272817e-05, + "loss": 3.5986, + "step": 17020 + }, + { + "epoch": 1.7506167763157894, + "grad_norm": 1.7097259133398335, + "learning_rate": 3.4671251332975205e-05, + "loss": 3.6591, + "step": 17030 + }, + { + "epoch": 1.7516447368421053, + "grad_norm": 1.0606254984057035, + "learning_rate": 3.463278691754937e-05, + "loss": 3.6486, + "step": 17040 + }, + { + "epoch": 1.752672697368421, + "grad_norm": 2.209742678165228, + "learning_rate": 3.459433115831723e-05, + "loss": 3.6301, + "step": 17050 + }, + { + "epoch": 
1.7537006578947367, + "grad_norm": 0.8386897475635146, + "learning_rate": 3.455588409759114e-05, + "loss": 3.6322, + "step": 17060 + }, + { + "epoch": 1.7547286184210527, + "grad_norm": 0.8489822002090621, + "learning_rate": 3.451744577767391e-05, + "loss": 3.6424, + "step": 17070 + }, + { + "epoch": 1.7557565789473686, + "grad_norm": 1.1405461427785673, + "learning_rate": 3.447901624085875e-05, + "loss": 3.731, + "step": 17080 + }, + { + "epoch": 1.756784539473684, + "grad_norm": 1.554571176135642, + "learning_rate": 3.444059552942918e-05, + "loss": 3.7083, + "step": 17090 + }, + { + "epoch": 1.7578125, + "grad_norm": 1.0761309995002128, + "learning_rate": 3.4402183685659016e-05, + "loss": 3.6978, + "step": 17100 + }, + { + "epoch": 1.758840460526316, + "grad_norm": 0.9898951624741211, + "learning_rate": 3.436378075181231e-05, + "loss": 3.5522, + "step": 17110 + }, + { + "epoch": 1.7598684210526314, + "grad_norm": 0.8776710137661514, + "learning_rate": 3.432538677014332e-05, + "loss": 3.6409, + "step": 17120 + }, + { + "epoch": 1.7608963815789473, + "grad_norm": 0.9884998848663711, + "learning_rate": 3.4287001782896454e-05, + "loss": 3.6722, + "step": 17130 + }, + { + "epoch": 1.7619243421052633, + "grad_norm": 0.8768492979169663, + "learning_rate": 3.424862583230623e-05, + "loss": 3.7472, + "step": 17140 + }, + { + "epoch": 1.762952302631579, + "grad_norm": 0.6648674518022626, + "learning_rate": 3.421025896059719e-05, + "loss": 3.6952, + "step": 17150 + }, + { + "epoch": 1.7639802631578947, + "grad_norm": 0.8690281827393012, + "learning_rate": 3.417190120998395e-05, + "loss": 3.6749, + "step": 17160 + }, + { + "epoch": 1.7650082236842106, + "grad_norm": 1.0224872283324453, + "learning_rate": 3.4133552622671e-05, + "loss": 3.5739, + "step": 17170 + }, + { + "epoch": 1.7660361842105263, + "grad_norm": 1.1982242797408784, + "learning_rate": 3.4095213240852843e-05, + "loss": 3.6415, + "step": 17180 + }, + { + "epoch": 1.767064144736842, + "grad_norm": 1.061866346911132, + "learning_rate": 3.4056883106713794e-05, + "loss": 3.6448, + "step": 17190 + }, + { + "epoch": 1.768092105263158, + "grad_norm": 1.1261091592601675, + "learning_rate": 3.4018562262427994e-05, + "loss": 3.6759, + "step": 17200 + }, + { + "epoch": 1.7691200657894737, + "grad_norm": 0.9767628273532938, + "learning_rate": 3.3980250750159416e-05, + "loss": 3.608, + "step": 17210 + }, + { + "epoch": 1.7701480263157894, + "grad_norm": 1.052853494391245, + "learning_rate": 3.394194861206168e-05, + "loss": 3.6737, + "step": 17220 + }, + { + "epoch": 1.7711759868421053, + "grad_norm": 0.8337623246982027, + "learning_rate": 3.3903655890278156e-05, + "loss": 3.66, + "step": 17230 + }, + { + "epoch": 1.772203947368421, + "grad_norm": 0.6930277931494238, + "learning_rate": 3.3865372626941844e-05, + "loss": 3.6357, + "step": 17240 + }, + { + "epoch": 1.7732319078947367, + "grad_norm": 0.7500224110115999, + "learning_rate": 3.3827098864175317e-05, + "loss": 3.6877, + "step": 17250 + }, + { + "epoch": 1.7742598684210527, + "grad_norm": 0.9344184776065657, + "learning_rate": 3.378883464409071e-05, + "loss": 3.6332, + "step": 17260 + }, + { + "epoch": 1.7752878289473686, + "grad_norm": 2.3468954722886832, + "learning_rate": 3.375058000878964e-05, + "loss": 3.629, + "step": 17270 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 1.1064901317599398, + "learning_rate": 3.3712335000363216e-05, + "loss": 3.683, + "step": 17280 + }, + { + "epoch": 1.77734375, + "grad_norm": 1.4093148212314779, + "learning_rate": 3.3674099660891915e-05, + 
"loss": 3.5846, + "step": 17290 + }, + { + "epoch": 1.778371710526316, + "grad_norm": 1.0815810496082312, + "learning_rate": 3.3635874032445593e-05, + "loss": 3.5305, + "step": 17300 + }, + { + "epoch": 1.7793996710526314, + "grad_norm": 0.7035393024126594, + "learning_rate": 3.359765815708341e-05, + "loss": 3.6511, + "step": 17310 + }, + { + "epoch": 1.7804276315789473, + "grad_norm": 1.2586415561758153, + "learning_rate": 3.355945207685383e-05, + "loss": 3.6404, + "step": 17320 + }, + { + "epoch": 1.7814555921052633, + "grad_norm": 1.1170307396233774, + "learning_rate": 3.352125583379449e-05, + "loss": 3.6022, + "step": 17330 + }, + { + "epoch": 1.782483552631579, + "grad_norm": 0.9460845190805933, + "learning_rate": 3.348306946993223e-05, + "loss": 3.6064, + "step": 17340 + }, + { + "epoch": 1.7835115131578947, + "grad_norm": 0.8661725058303426, + "learning_rate": 3.344489302728302e-05, + "loss": 3.6369, + "step": 17350 + }, + { + "epoch": 1.7845394736842106, + "grad_norm": 0.756053456074119, + "learning_rate": 3.340672654785191e-05, + "loss": 3.6702, + "step": 17360 + }, + { + "epoch": 1.7855674342105263, + "grad_norm": 1.1158955713416159, + "learning_rate": 3.3368570073633e-05, + "loss": 3.7077, + "step": 17370 + }, + { + "epoch": 1.786595394736842, + "grad_norm": 0.6974812449391417, + "learning_rate": 3.333042364660934e-05, + "loss": 3.6026, + "step": 17380 + }, + { + "epoch": 1.787623355263158, + "grad_norm": 1.3889721681210903, + "learning_rate": 3.329228730875298e-05, + "loss": 3.5922, + "step": 17390 + }, + { + "epoch": 1.7886513157894737, + "grad_norm": 1.0775191774896498, + "learning_rate": 3.325416110202484e-05, + "loss": 3.7008, + "step": 17400 + }, + { + "epoch": 1.7896792763157894, + "grad_norm": 0.7779856576210453, + "learning_rate": 3.3216045068374666e-05, + "loss": 3.5552, + "step": 17410 + }, + { + "epoch": 1.7907072368421053, + "grad_norm": 1.1639949087706711, + "learning_rate": 3.3177939249741105e-05, + "loss": 3.5819, + "step": 17420 + }, + { + "epoch": 1.791735197368421, + "grad_norm": 1.0389202852372523, + "learning_rate": 3.313984368805144e-05, + "loss": 3.7615, + "step": 17430 + }, + { + "epoch": 1.7927631578947367, + "grad_norm": 0.9047643058997156, + "learning_rate": 3.3101758425221755e-05, + "loss": 3.5933, + "step": 17440 + }, + { + "epoch": 1.7937911184210527, + "grad_norm": 0.8243437396550974, + "learning_rate": 3.3063683503156775e-05, + "loss": 3.6553, + "step": 17450 + }, + { + "epoch": 1.7948190789473686, + "grad_norm": 1.3594972402047265, + "learning_rate": 3.302561896374983e-05, + "loss": 3.6343, + "step": 17460 + }, + { + "epoch": 1.795847039473684, + "grad_norm": 1.042250425163433, + "learning_rate": 3.298756484888287e-05, + "loss": 3.6849, + "step": 17470 + }, + { + "epoch": 1.796875, + "grad_norm": 0.7482524619197396, + "learning_rate": 3.294952120042636e-05, + "loss": 3.5814, + "step": 17480 + }, + { + "epoch": 1.797902960526316, + "grad_norm": 0.8551176561303162, + "learning_rate": 3.2911488060239207e-05, + "loss": 3.5868, + "step": 17490 + }, + { + "epoch": 1.7989309210526314, + "grad_norm": 1.8767678509746555, + "learning_rate": 3.287346547016879e-05, + "loss": 3.6293, + "step": 17500 + }, + { + "epoch": 1.7999588815789473, + "grad_norm": 1.251181282725056, + "learning_rate": 3.28354534720509e-05, + "loss": 3.5661, + "step": 17510 + }, + { + "epoch": 1.8009868421052633, + "grad_norm": 1.009571167393295, + "learning_rate": 3.2797452107709625e-05, + "loss": 3.666, + "step": 17520 + }, + { + "epoch": 1.802014802631579, + "grad_norm": 
0.6571381553376556, + "learning_rate": 3.275946141895741e-05, + "loss": 3.6083, + "step": 17530 + }, + { + "epoch": 1.8030427631578947, + "grad_norm": 0.7680812671773339, + "learning_rate": 3.27214814475949e-05, + "loss": 3.6502, + "step": 17540 + }, + { + "epoch": 1.8040707236842106, + "grad_norm": 1.1780953684351263, + "learning_rate": 3.268351223541097e-05, + "loss": 3.6178, + "step": 17550 + }, + { + "epoch": 1.8050986842105263, + "grad_norm": 0.9968723918916552, + "learning_rate": 3.264555382418266e-05, + "loss": 3.6306, + "step": 17560 + }, + { + "epoch": 1.806126644736842, + "grad_norm": 0.9457723901807282, + "learning_rate": 3.2607606255675126e-05, + "loss": 3.6535, + "step": 17570 + }, + { + "epoch": 1.807154605263158, + "grad_norm": 1.1491347291937675, + "learning_rate": 3.256966957164161e-05, + "loss": 3.6049, + "step": 17580 + }, + { + "epoch": 1.8081825657894737, + "grad_norm": 1.0768869450947782, + "learning_rate": 3.2531743813823316e-05, + "loss": 3.6465, + "step": 17590 + }, + { + "epoch": 1.8092105263157894, + "grad_norm": 0.8851317601801659, + "learning_rate": 3.249382902394951e-05, + "loss": 3.6314, + "step": 17600 + }, + { + "epoch": 1.8102384868421053, + "grad_norm": 0.8317252299292665, + "learning_rate": 3.245592524373731e-05, + "loss": 3.6637, + "step": 17610 + }, + { + "epoch": 1.811266447368421, + "grad_norm": 1.0649982141649916, + "learning_rate": 3.241803251489181e-05, + "loss": 3.5535, + "step": 17620 + }, + { + "epoch": 1.8122944078947367, + "grad_norm": 1.140997055200359, + "learning_rate": 3.2380150879105845e-05, + "loss": 3.5462, + "step": 17630 + }, + { + "epoch": 1.8133223684210527, + "grad_norm": 1.4475203687588654, + "learning_rate": 3.2342280378060134e-05, + "loss": 3.6256, + "step": 17640 + }, + { + "epoch": 1.8143503289473686, + "grad_norm": 0.7551611304603948, + "learning_rate": 3.2304421053423065e-05, + "loss": 3.6142, + "step": 17650 + }, + { + "epoch": 1.815378289473684, + "grad_norm": 0.8659172377320311, + "learning_rate": 3.226657294685078e-05, + "loss": 3.6021, + "step": 17660 + }, + { + "epoch": 1.81640625, + "grad_norm": 0.7594918981299555, + "learning_rate": 3.2228736099987086e-05, + "loss": 3.6693, + "step": 17670 + }, + { + "epoch": 1.817434210526316, + "grad_norm": 0.7770764068485727, + "learning_rate": 3.219091055446336e-05, + "loss": 3.5721, + "step": 17680 + }, + { + "epoch": 1.8184621710526314, + "grad_norm": 1.1047010148447844, + "learning_rate": 3.215309635189858e-05, + "loss": 3.6984, + "step": 17690 + }, + { + "epoch": 1.8194901315789473, + "grad_norm": 1.0368174341374001, + "learning_rate": 3.2115293533899214e-05, + "loss": 3.6701, + "step": 17700 + }, + { + "epoch": 1.8205180921052633, + "grad_norm": 1.111769710828401, + "learning_rate": 3.2077502142059214e-05, + "loss": 3.6303, + "step": 17710 + }, + { + "epoch": 1.821546052631579, + "grad_norm": 0.7827655587808062, + "learning_rate": 3.203972221795998e-05, + "loss": 3.5877, + "step": 17720 + }, + { + "epoch": 1.8225740131578947, + "grad_norm": 0.7854775298101982, + "learning_rate": 3.200195380317027e-05, + "loss": 3.5903, + "step": 17730 + }, + { + "epoch": 1.8236019736842106, + "grad_norm": 0.6471914936128494, + "learning_rate": 3.1964196939246196e-05, + "loss": 3.7276, + "step": 17740 + }, + { + "epoch": 1.8246299342105263, + "grad_norm": 0.8560576132276785, + "learning_rate": 3.192645166773113e-05, + "loss": 3.6364, + "step": 17750 + }, + { + "epoch": 1.825657894736842, + "grad_norm": 1.0648339946297167, + "learning_rate": 3.1888718030155716e-05, + "loss": 3.7437, + "step": 
17760 + }, + { + "epoch": 1.826685855263158, + "grad_norm": 1.034139457691027, + "learning_rate": 3.185099606803781e-05, + "loss": 3.6322, + "step": 17770 + }, + { + "epoch": 1.8277138157894737, + "grad_norm": 0.8658685018087152, + "learning_rate": 3.1813285822882386e-05, + "loss": 3.6964, + "step": 17780 + }, + { + "epoch": 1.8287417763157894, + "grad_norm": 1.5466694550790534, + "learning_rate": 3.177558733618155e-05, + "loss": 3.6021, + "step": 17790 + }, + { + "epoch": 1.8297697368421053, + "grad_norm": 0.9842004772669615, + "learning_rate": 3.1737900649414475e-05, + "loss": 3.7061, + "step": 17800 + }, + { + "epoch": 1.830797697368421, + "grad_norm": 1.0337921943955841, + "learning_rate": 3.170022580404731e-05, + "loss": 3.7401, + "step": 17810 + }, + { + "epoch": 1.8318256578947367, + "grad_norm": 1.2115925647546497, + "learning_rate": 3.1662562841533223e-05, + "loss": 3.616, + "step": 17820 + }, + { + "epoch": 1.8328536184210527, + "grad_norm": 1.0645479212555324, + "learning_rate": 3.162491180331229e-05, + "loss": 3.6179, + "step": 17830 + }, + { + "epoch": 1.8338815789473686, + "grad_norm": 1.3054393954489516, + "learning_rate": 3.158727273081146e-05, + "loss": 3.5944, + "step": 17840 + }, + { + "epoch": 1.834909539473684, + "grad_norm": 1.024682023573862, + "learning_rate": 3.154964566544454e-05, + "loss": 3.6115, + "step": 17850 + }, + { + "epoch": 1.8359375, + "grad_norm": 0.9078695033129066, + "learning_rate": 3.151203064861207e-05, + "loss": 3.673, + "step": 17860 + }, + { + "epoch": 1.836965460526316, + "grad_norm": 1.1298669865059536, + "learning_rate": 3.1474427721701404e-05, + "loss": 3.6843, + "step": 17870 + }, + { + "epoch": 1.8379934210526314, + "grad_norm": 1.2247296385294064, + "learning_rate": 3.143683692608654e-05, + "loss": 3.5496, + "step": 17880 + }, + { + "epoch": 1.8390213815789473, + "grad_norm": 1.082085232816862, + "learning_rate": 3.139925830312816e-05, + "loss": 3.7265, + "step": 17890 + }, + { + "epoch": 1.8400493421052633, + "grad_norm": 0.9304138039252905, + "learning_rate": 3.136169189417355e-05, + "loss": 3.6452, + "step": 17900 + }, + { + "epoch": 1.841077302631579, + "grad_norm": 1.576531313690751, + "learning_rate": 3.132413774055653e-05, + "loss": 3.6316, + "step": 17910 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.8752787690913857, + "learning_rate": 3.128659588359745e-05, + "loss": 3.6597, + "step": 17920 + }, + { + "epoch": 1.8431332236842106, + "grad_norm": 0.9902097258801136, + "learning_rate": 3.124906636460315e-05, + "loss": 3.6588, + "step": 17930 + }, + { + "epoch": 1.8441611842105263, + "grad_norm": 0.722164337169706, + "learning_rate": 3.121154922486687e-05, + "loss": 3.5851, + "step": 17940 + }, + { + "epoch": 1.845189144736842, + "grad_norm": 0.8709729713906198, + "learning_rate": 3.1174044505668245e-05, + "loss": 3.5744, + "step": 17950 + }, + { + "epoch": 1.846217105263158, + "grad_norm": 1.1576564070981712, + "learning_rate": 3.1136552248273236e-05, + "loss": 3.5628, + "step": 17960 + }, + { + "epoch": 1.8472450657894737, + "grad_norm": 0.7616471085725848, + "learning_rate": 3.109907249393408e-05, + "loss": 3.596, + "step": 17970 + }, + { + "epoch": 1.8482730263157894, + "grad_norm": 1.0884979032392024, + "learning_rate": 3.106160528388929e-05, + "loss": 3.6646, + "step": 17980 + }, + { + "epoch": 1.8493009868421053, + "grad_norm": 0.9470782537082156, + "learning_rate": 3.102415065936353e-05, + "loss": 3.588, + "step": 17990 + }, + { + "epoch": 1.850328947368421, + "grad_norm": 0.7379091324108359, + "learning_rate": 
3.0986708661567665e-05, + "loss": 3.6364, + "step": 18000 + }, + { + "epoch": 1.8513569078947367, + "grad_norm": 1.0260124718761334, + "learning_rate": 3.094927933169866e-05, + "loss": 3.6439, + "step": 18010 + }, + { + "epoch": 1.8523848684210527, + "grad_norm": 0.8342986039751835, + "learning_rate": 3.091186271093947e-05, + "loss": 3.6428, + "step": 18020 + }, + { + "epoch": 1.8534128289473686, + "grad_norm": 1.140647417658155, + "learning_rate": 3.087445884045918e-05, + "loss": 3.6595, + "step": 18030 + }, + { + "epoch": 1.854440789473684, + "grad_norm": 1.2505188818676312, + "learning_rate": 3.083706776141277e-05, + "loss": 3.5479, + "step": 18040 + }, + { + "epoch": 1.85546875, + "grad_norm": 0.8029960706954344, + "learning_rate": 3.079968951494115e-05, + "loss": 3.682, + "step": 18050 + }, + { + "epoch": 1.856496710526316, + "grad_norm": 1.581377058993193, + "learning_rate": 3.076232414217117e-05, + "loss": 3.6626, + "step": 18060 + }, + { + "epoch": 1.8575246710526314, + "grad_norm": 0.7388749334225264, + "learning_rate": 3.07249716842154e-05, + "loss": 3.7168, + "step": 18070 + }, + { + "epoch": 1.8585526315789473, + "grad_norm": 0.9698260699000874, + "learning_rate": 3.0687632182172326e-05, + "loss": 3.6332, + "step": 18080 + }, + { + "epoch": 1.8595805921052633, + "grad_norm": 1.0704578384513692, + "learning_rate": 3.06503056771261e-05, + "loss": 3.6501, + "step": 18090 + }, + { + "epoch": 1.860608552631579, + "grad_norm": 1.3292574374177561, + "learning_rate": 3.06129922101466e-05, + "loss": 3.5866, + "step": 18100 + }, + { + "epoch": 1.8616365131578947, + "grad_norm": 0.9183159029726292, + "learning_rate": 3.057569182228937e-05, + "loss": 3.6244, + "step": 18110 + }, + { + "epoch": 1.8626644736842106, + "grad_norm": 0.8846450959354459, + "learning_rate": 3.053840455459555e-05, + "loss": 3.5886, + "step": 18120 + }, + { + "epoch": 1.8636924342105263, + "grad_norm": 1.6733153368956084, + "learning_rate": 3.0501130448091817e-05, + "loss": 3.6598, + "step": 18130 + }, + { + "epoch": 1.864720394736842, + "grad_norm": 0.7560682377824178, + "learning_rate": 3.0463869543790405e-05, + "loss": 3.5899, + "step": 18140 + }, + { + "epoch": 1.865748355263158, + "grad_norm": 1.066820839578155, + "learning_rate": 3.0426621882689e-05, + "loss": 3.6134, + "step": 18150 + }, + { + "epoch": 1.8667763157894737, + "grad_norm": 0.767823779373011, + "learning_rate": 3.0389387505770744e-05, + "loss": 3.6536, + "step": 18160 + }, + { + "epoch": 1.8678042763157894, + "grad_norm": 0.720590308645501, + "learning_rate": 3.0352166454004145e-05, + "loss": 3.6736, + "step": 18170 + }, + { + "epoch": 1.8688322368421053, + "grad_norm": 0.9161905475495247, + "learning_rate": 3.0314958768343033e-05, + "loss": 3.6281, + "step": 18180 + }, + { + "epoch": 1.869860197368421, + "grad_norm": 1.1580727103338737, + "learning_rate": 3.027776448972655e-05, + "loss": 3.6375, + "step": 18190 + }, + { + "epoch": 1.8708881578947367, + "grad_norm": 0.9684286507481932, + "learning_rate": 3.0240583659079087e-05, + "loss": 3.5999, + "step": 18200 + }, + { + "epoch": 1.8719161184210527, + "grad_norm": 1.0139868215761678, + "learning_rate": 3.0203416317310245e-05, + "loss": 3.757, + "step": 18210 + }, + { + "epoch": 1.8729440789473686, + "grad_norm": 0.7523851218885523, + "learning_rate": 3.0166262505314773e-05, + "loss": 3.6248, + "step": 18220 + }, + { + "epoch": 1.873972039473684, + "grad_norm": 0.8616194459444779, + "learning_rate": 3.0129122263972515e-05, + "loss": 3.601, + "step": 18230 + }, + { + "epoch": 1.875, + "grad_norm": 
0.8447573970392921, + "learning_rate": 3.009199563414841e-05, + "loss": 3.6647, + "step": 18240 + }, + { + "epoch": 1.876027960526316, + "grad_norm": 0.9897532601607536, + "learning_rate": 3.005488265669242e-05, + "loss": 3.5895, + "step": 18250 + }, + { + "epoch": 1.8770559210526314, + "grad_norm": 1.0608951565771716, + "learning_rate": 3.0017783372439475e-05, + "loss": 3.6153, + "step": 18260 + }, + { + "epoch": 1.8780838815789473, + "grad_norm": 1.2787208250265687, + "learning_rate": 2.9980697822209438e-05, + "loss": 3.5739, + "step": 18270 + }, + { + "epoch": 1.8791118421052633, + "grad_norm": 1.0405901814268996, + "learning_rate": 2.9943626046807085e-05, + "loss": 3.6292, + "step": 18280 + }, + { + "epoch": 1.880139802631579, + "grad_norm": 1.1286106448577193, + "learning_rate": 2.9906568087021988e-05, + "loss": 3.5117, + "step": 18290 + }, + { + "epoch": 1.8811677631578947, + "grad_norm": 0.922785002715047, + "learning_rate": 2.986952398362855e-05, + "loss": 3.6057, + "step": 18300 + }, + { + "epoch": 1.8821957236842106, + "grad_norm": 0.8044805538394884, + "learning_rate": 2.9832493777385924e-05, + "loss": 3.6182, + "step": 18310 + }, + { + "epoch": 1.8832236842105263, + "grad_norm": 0.7771151029077139, + "learning_rate": 2.9795477509037982e-05, + "loss": 3.6541, + "step": 18320 + }, + { + "epoch": 1.884251644736842, + "grad_norm": 0.9593711430475198, + "learning_rate": 2.9758475219313254e-05, + "loss": 3.572, + "step": 18330 + }, + { + "epoch": 1.885279605263158, + "grad_norm": 0.9465924259970048, + "learning_rate": 2.972148694892486e-05, + "loss": 3.5489, + "step": 18340 + }, + { + "epoch": 1.8863075657894737, + "grad_norm": 1.1079718083812984, + "learning_rate": 2.9684512738570537e-05, + "loss": 3.6483, + "step": 18350 + }, + { + "epoch": 1.8873355263157894, + "grad_norm": 0.7024702414389847, + "learning_rate": 2.964755262893252e-05, + "loss": 3.684, + "step": 18360 + }, + { + "epoch": 1.8883634868421053, + "grad_norm": 0.7886422426156036, + "learning_rate": 2.9610606660677564e-05, + "loss": 3.6836, + "step": 18370 + }, + { + "epoch": 1.889391447368421, + "grad_norm": 0.92400724517368, + "learning_rate": 2.9573674874456852e-05, + "loss": 3.6527, + "step": 18380 + }, + { + "epoch": 1.8904194078947367, + "grad_norm": 0.9328565040829044, + "learning_rate": 2.9536757310905914e-05, + "loss": 3.6366, + "step": 18390 + }, + { + "epoch": 1.8914473684210527, + "grad_norm": 0.8840064355758367, + "learning_rate": 2.9499854010644706e-05, + "loss": 3.5621, + "step": 18400 + }, + { + "epoch": 1.8924753289473686, + "grad_norm": 1.0907722264468385, + "learning_rate": 2.9462965014277435e-05, + "loss": 3.677, + "step": 18410 + }, + { + "epoch": 1.893503289473684, + "grad_norm": 0.8805307591560767, + "learning_rate": 2.9426090362392603e-05, + "loss": 3.6427, + "step": 18420 + }, + { + "epoch": 1.89453125, + "grad_norm": 1.4541738683017615, + "learning_rate": 2.938923009556293e-05, + "loss": 3.6063, + "step": 18430 + }, + { + "epoch": 1.895559210526316, + "grad_norm": 1.3258157737970968, + "learning_rate": 2.935238425434527e-05, + "loss": 3.5919, + "step": 18440 + }, + { + "epoch": 1.8965871710526314, + "grad_norm": 1.7944746487650431, + "learning_rate": 2.9315552879280636e-05, + "loss": 3.6258, + "step": 18450 + }, + { + "epoch": 1.8976151315789473, + "grad_norm": 1.5087567471003325, + "learning_rate": 2.9278736010894118e-05, + "loss": 3.635, + "step": 18460 + }, + { + "epoch": 1.8986430921052633, + "grad_norm": 0.8833294932863751, + "learning_rate": 2.924193368969486e-05, + "loss": 3.6114, + 
"step": 18470 + }, + { + "epoch": 1.899671052631579, + "grad_norm": 1.162918706573443, + "learning_rate": 2.920514595617596e-05, + "loss": 3.6311, + "step": 18480 + }, + { + "epoch": 1.9006990131578947, + "grad_norm": 0.8332803868322016, + "learning_rate": 2.9168372850814516e-05, + "loss": 3.6295, + "step": 18490 + }, + { + "epoch": 1.9017269736842106, + "grad_norm": 0.8109775364277695, + "learning_rate": 2.913161441407148e-05, + "loss": 3.5777, + "step": 18500 + }, + { + "epoch": 1.9027549342105263, + "grad_norm": 0.9438704533791414, + "learning_rate": 2.909487068639169e-05, + "loss": 3.6706, + "step": 18510 + }, + { + "epoch": 1.903782894736842, + "grad_norm": 0.689852417905446, + "learning_rate": 2.905814170820382e-05, + "loss": 3.5873, + "step": 18520 + }, + { + "epoch": 1.904810855263158, + "grad_norm": 0.7565156911292952, + "learning_rate": 2.9021427519920275e-05, + "loss": 3.5504, + "step": 18530 + }, + { + "epoch": 1.9058388157894737, + "grad_norm": 0.6197229288489391, + "learning_rate": 2.8984728161937226e-05, + "loss": 3.6638, + "step": 18540 + }, + { + "epoch": 1.9068667763157894, + "grad_norm": 0.6445518721547028, + "learning_rate": 2.8948043674634477e-05, + "loss": 3.7177, + "step": 18550 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 1.1234152015733263, + "learning_rate": 2.89113740983755e-05, + "loss": 3.5636, + "step": 18560 + }, + { + "epoch": 1.908922697368421, + "grad_norm": 0.8913132200249549, + "learning_rate": 2.8874719473507388e-05, + "loss": 3.65, + "step": 18570 + }, + { + "epoch": 1.9099506578947367, + "grad_norm": 1.1306852690921574, + "learning_rate": 2.8838079840360733e-05, + "loss": 3.6334, + "step": 18580 + }, + { + "epoch": 1.9109786184210527, + "grad_norm": 0.8974366957884511, + "learning_rate": 2.8801455239249673e-05, + "loss": 3.5994, + "step": 18590 + }, + { + "epoch": 1.9120065789473686, + "grad_norm": 1.304498213325584, + "learning_rate": 2.876484571047177e-05, + "loss": 3.5419, + "step": 18600 + }, + { + "epoch": 1.913034539473684, + "grad_norm": 1.5088370207537802, + "learning_rate": 2.872825129430801e-05, + "loss": 3.6235, + "step": 18610 + }, + { + "epoch": 1.9140625, + "grad_norm": 0.9038266963784231, + "learning_rate": 2.869167203102278e-05, + "loss": 3.6042, + "step": 18620 + }, + { + "epoch": 1.915090460526316, + "grad_norm": 0.8830132045298555, + "learning_rate": 2.865510796086377e-05, + "loss": 3.623, + "step": 18630 + }, + { + "epoch": 1.9161184210526314, + "grad_norm": 1.030122487368191, + "learning_rate": 2.8618559124061943e-05, + "loss": 3.6531, + "step": 18640 + }, + { + "epoch": 1.9171463815789473, + "grad_norm": 1.1136412456773463, + "learning_rate": 2.858202556083156e-05, + "loss": 3.7011, + "step": 18650 + }, + { + "epoch": 1.9181743421052633, + "grad_norm": 0.9022273199095832, + "learning_rate": 2.854550731136997e-05, + "loss": 3.5333, + "step": 18660 + }, + { + "epoch": 1.919202302631579, + "grad_norm": 0.8918347758770591, + "learning_rate": 2.8509004415857773e-05, + "loss": 3.5753, + "step": 18670 + }, + { + "epoch": 1.9202302631578947, + "grad_norm": 0.9998239719910483, + "learning_rate": 2.8472516914458626e-05, + "loss": 3.6262, + "step": 18680 + }, + { + "epoch": 1.9212582236842106, + "grad_norm": 0.9961441924842778, + "learning_rate": 2.8436044847319255e-05, + "loss": 3.6318, + "step": 18690 + }, + { + "epoch": 1.9222861842105263, + "grad_norm": 0.8681564416540165, + "learning_rate": 2.8399588254569442e-05, + "loss": 3.5255, + "step": 18700 + }, + { + "epoch": 1.923314144736842, + "grad_norm": 1.2617202617057508, + 
"learning_rate": 2.8363147176321853e-05, + "loss": 3.6501, + "step": 18710 + }, + { + "epoch": 1.924342105263158, + "grad_norm": 0.8558695999293832, + "learning_rate": 2.8326721652672173e-05, + "loss": 3.6454, + "step": 18720 + }, + { + "epoch": 1.9253700657894737, + "grad_norm": 0.9953037794560541, + "learning_rate": 2.829031172369893e-05, + "loss": 3.5752, + "step": 18730 + }, + { + "epoch": 1.9263980263157894, + "grad_norm": 1.034189658256497, + "learning_rate": 2.8253917429463506e-05, + "loss": 3.6217, + "step": 18740 + }, + { + "epoch": 1.9274259868421053, + "grad_norm": 0.8300886445220382, + "learning_rate": 2.821753881001009e-05, + "loss": 3.7215, + "step": 18750 + }, + { + "epoch": 1.928453947368421, + "grad_norm": 1.020165724661403, + "learning_rate": 2.8181175905365578e-05, + "loss": 3.6071, + "step": 18760 + }, + { + "epoch": 1.9294819078947367, + "grad_norm": 1.183678954428515, + "learning_rate": 2.8144828755539628e-05, + "loss": 3.6735, + "step": 18770 + }, + { + "epoch": 1.9305098684210527, + "grad_norm": 1.0479146435353421, + "learning_rate": 2.810849740052454e-05, + "loss": 3.6431, + "step": 18780 + }, + { + "epoch": 1.9315378289473686, + "grad_norm": 1.3805071909052975, + "learning_rate": 2.807218188029524e-05, + "loss": 3.6422, + "step": 18790 + }, + { + "epoch": 1.932565789473684, + "grad_norm": 1.0876076020249035, + "learning_rate": 2.803588223480922e-05, + "loss": 3.6075, + "step": 18800 + }, + { + "epoch": 1.93359375, + "grad_norm": 0.9781785944189055, + "learning_rate": 2.799959850400653e-05, + "loss": 3.6314, + "step": 18810 + }, + { + "epoch": 1.934621710526316, + "grad_norm": 1.1705063520550032, + "learning_rate": 2.7963330727809688e-05, + "loss": 3.6226, + "step": 18820 + }, + { + "epoch": 1.9356496710526314, + "grad_norm": 1.2276237416099265, + "learning_rate": 2.7927078946123658e-05, + "loss": 3.6388, + "step": 18830 + }, + { + "epoch": 1.9366776315789473, + "grad_norm": 1.069058539933244, + "learning_rate": 2.7890843198835816e-05, + "loss": 3.5974, + "step": 18840 + }, + { + "epoch": 1.9377055921052633, + "grad_norm": 0.8440052635336088, + "learning_rate": 2.785462352581587e-05, + "loss": 3.6367, + "step": 18850 + }, + { + "epoch": 1.938733552631579, + "grad_norm": 0.7125341909135485, + "learning_rate": 2.7818419966915912e-05, + "loss": 3.5965, + "step": 18860 + }, + { + "epoch": 1.9397615131578947, + "grad_norm": 0.7092138922491994, + "learning_rate": 2.7782232561970196e-05, + "loss": 3.5749, + "step": 18870 + }, + { + "epoch": 1.9407894736842106, + "grad_norm": 0.895458557038944, + "learning_rate": 2.7746061350795285e-05, + "loss": 3.6281, + "step": 18880 + }, + { + "epoch": 1.9418174342105263, + "grad_norm": 0.8160436454698965, + "learning_rate": 2.770990637318991e-05, + "loss": 3.607, + "step": 18890 + }, + { + "epoch": 1.942845394736842, + "grad_norm": 1.041045340566595, + "learning_rate": 2.7673767668934904e-05, + "loss": 3.5939, + "step": 18900 + }, + { + "epoch": 1.943873355263158, + "grad_norm": 0.9746997921270382, + "learning_rate": 2.7637645277793255e-05, + "loss": 3.6421, + "step": 18910 + }, + { + "epoch": 1.9449013157894737, + "grad_norm": 0.8470544744536538, + "learning_rate": 2.7601539239509926e-05, + "loss": 3.594, + "step": 18920 + }, + { + "epoch": 1.9459292763157894, + "grad_norm": 1.4143572758243823, + "learning_rate": 2.7565449593811956e-05, + "loss": 3.6222, + "step": 18930 + }, + { + "epoch": 1.9469572368421053, + "grad_norm": 0.8854261883990354, + "learning_rate": 2.752937638040831e-05, + "loss": 3.6263, + "step": 18940 + }, + { + 
"epoch": 1.947985197368421, + "grad_norm": 0.6992231400847475, + "learning_rate": 2.7493319638989867e-05, + "loss": 3.678, + "step": 18950 + }, + { + "epoch": 1.9490131578947367, + "grad_norm": 0.9048039310358122, + "learning_rate": 2.7457279409229435e-05, + "loss": 3.6574, + "step": 18960 + }, + { + "epoch": 1.9500411184210527, + "grad_norm": 1.4846820714454272, + "learning_rate": 2.742125573078159e-05, + "loss": 3.6306, + "step": 18970 + }, + { + "epoch": 1.9510690789473686, + "grad_norm": 1.3436963713834063, + "learning_rate": 2.738524864328273e-05, + "loss": 3.6006, + "step": 18980 + }, + { + "epoch": 1.952097039473684, + "grad_norm": 0.9702535774166972, + "learning_rate": 2.734925818635099e-05, + "loss": 3.65, + "step": 18990 + }, + { + "epoch": 1.953125, + "grad_norm": 0.9254671682678854, + "learning_rate": 2.7313284399586195e-05, + "loss": 3.6128, + "step": 19000 + }, + { + "epoch": 1.954152960526316, + "grad_norm": 1.1609072839185581, + "learning_rate": 2.727732732256987e-05, + "loss": 3.5562, + "step": 19010 + }, + { + "epoch": 1.9551809210526314, + "grad_norm": 1.1307426221068329, + "learning_rate": 2.724138699486511e-05, + "loss": 3.6333, + "step": 19020 + }, + { + "epoch": 1.9562088815789473, + "grad_norm": 1.0731752077953474, + "learning_rate": 2.7205463456016594e-05, + "loss": 3.6723, + "step": 19030 + }, + { + "epoch": 1.9572368421052633, + "grad_norm": 0.8616411328052668, + "learning_rate": 2.7169556745550538e-05, + "loss": 3.5917, + "step": 19040 + }, + { + "epoch": 1.958264802631579, + "grad_norm": 1.1398953233311209, + "learning_rate": 2.713366690297462e-05, + "loss": 3.517, + "step": 19050 + }, + { + "epoch": 1.9592927631578947, + "grad_norm": 0.8901736470993757, + "learning_rate": 2.7097793967777993e-05, + "loss": 3.628, + "step": 19060 + }, + { + "epoch": 1.9603207236842106, + "grad_norm": 1.3672134832910412, + "learning_rate": 2.706193797943119e-05, + "loss": 3.6539, + "step": 19070 + }, + { + "epoch": 1.9613486842105263, + "grad_norm": 1.2121997284475254, + "learning_rate": 2.7026098977386076e-05, + "loss": 3.7076, + "step": 19080 + }, + { + "epoch": 1.962376644736842, + "grad_norm": 1.583844687891249, + "learning_rate": 2.6990277001075856e-05, + "loss": 3.5584, + "step": 19090 + }, + { + "epoch": 1.963404605263158, + "grad_norm": 1.0545108788563557, + "learning_rate": 2.6954472089914988e-05, + "loss": 3.6647, + "step": 19100 + }, + { + "epoch": 1.9644325657894737, + "grad_norm": 0.9948617833909946, + "learning_rate": 2.6918684283299158e-05, + "loss": 3.6555, + "step": 19110 + }, + { + "epoch": 1.9654605263157894, + "grad_norm": 0.9397780080682352, + "learning_rate": 2.688291362060523e-05, + "loss": 3.5627, + "step": 19120 + }, + { + "epoch": 1.9664884868421053, + "grad_norm": 0.897396380040022, + "learning_rate": 2.6847160141191227e-05, + "loss": 3.6175, + "step": 19130 + }, + { + "epoch": 1.967516447368421, + "grad_norm": 1.2500602230823532, + "learning_rate": 2.6811423884396196e-05, + "loss": 3.5856, + "step": 19140 + }, + { + "epoch": 1.9685444078947367, + "grad_norm": 0.7865325480090485, + "learning_rate": 2.677570488954032e-05, + "loss": 3.5284, + "step": 19150 + }, + { + "epoch": 1.9695723684210527, + "grad_norm": 0.9570041263395698, + "learning_rate": 2.6740003195924744e-05, + "loss": 3.6357, + "step": 19160 + }, + { + "epoch": 1.9706003289473686, + "grad_norm": 1.3056218348017812, + "learning_rate": 2.670431884283158e-05, + "loss": 3.5837, + "step": 19170 + }, + { + "epoch": 1.971628289473684, + "grad_norm": 1.1377193071839322, + "learning_rate": 
2.6668651869523875e-05, + "loss": 3.5808, + "step": 19180 + }, + { + "epoch": 1.97265625, + "grad_norm": 1.4451189066132581, + "learning_rate": 2.663300231524551e-05, + "loss": 3.6509, + "step": 19190 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 1.120580894849022, + "learning_rate": 2.6597370219221266e-05, + "loss": 3.5851, + "step": 19200 + }, + { + "epoch": 1.9747121710526314, + "grad_norm": 0.8795211675394361, + "learning_rate": 2.6561755620656674e-05, + "loss": 3.5993, + "step": 19210 + }, + { + "epoch": 1.9757401315789473, + "grad_norm": 0.7522744025002476, + "learning_rate": 2.6526158558738018e-05, + "loss": 3.5624, + "step": 19220 + }, + { + "epoch": 1.9767680921052633, + "grad_norm": 0.800886454300697, + "learning_rate": 2.6490579072632295e-05, + "loss": 3.576, + "step": 19230 + }, + { + "epoch": 1.977796052631579, + "grad_norm": 1.1308751871060996, + "learning_rate": 2.6455017201487122e-05, + "loss": 3.6578, + "step": 19240 + }, + { + "epoch": 1.9788240131578947, + "grad_norm": 0.8018192541855046, + "learning_rate": 2.6419472984430816e-05, + "loss": 3.6459, + "step": 19250 + }, + { + "epoch": 1.9798519736842106, + "grad_norm": 0.6586267709655756, + "learning_rate": 2.6383946460572202e-05, + "loss": 3.6934, + "step": 19260 + }, + { + "epoch": 1.9808799342105263, + "grad_norm": 1.0085362637401447, + "learning_rate": 2.634843766900066e-05, + "loss": 3.6466, + "step": 19270 + }, + { + "epoch": 1.981907894736842, + "grad_norm": 0.8805108575644305, + "learning_rate": 2.631294664878606e-05, + "loss": 3.6803, + "step": 19280 + }, + { + "epoch": 1.982935855263158, + "grad_norm": 1.1543200598645829, + "learning_rate": 2.627747343897872e-05, + "loss": 3.5145, + "step": 19290 + }, + { + "epoch": 1.9839638157894737, + "grad_norm": 0.7310326587436924, + "learning_rate": 2.624201807860935e-05, + "loss": 3.5698, + "step": 19300 + }, + { + "epoch": 1.9849917763157894, + "grad_norm": 0.7552829091750973, + "learning_rate": 2.6206580606689042e-05, + "loss": 3.6404, + "step": 19310 + }, + { + "epoch": 1.9860197368421053, + "grad_norm": 1.1072934461395711, + "learning_rate": 2.617116106220917e-05, + "loss": 3.6361, + "step": 19320 + }, + { + "epoch": 1.987047697368421, + "grad_norm": 1.8216833102113652, + "learning_rate": 2.6135759484141426e-05, + "loss": 3.6569, + "step": 19330 + }, + { + "epoch": 1.9880756578947367, + "grad_norm": 1.0660907991433597, + "learning_rate": 2.6100375911437694e-05, + "loss": 3.5417, + "step": 19340 + }, + { + "epoch": 1.9891036184210527, + "grad_norm": 0.6735212428871185, + "learning_rate": 2.6065010383030084e-05, + "loss": 3.6746, + "step": 19350 + }, + { + "epoch": 1.9901315789473686, + "grad_norm": 1.1630255157381, + "learning_rate": 2.6029662937830815e-05, + "loss": 3.585, + "step": 19360 + }, + { + "epoch": 1.991159539473684, + "grad_norm": 0.7741162568123258, + "learning_rate": 2.5994333614732247e-05, + "loss": 3.6149, + "step": 19370 + }, + { + "epoch": 1.9921875, + "grad_norm": 1.089468122169124, + "learning_rate": 2.5959022452606767e-05, + "loss": 3.6432, + "step": 19380 + }, + { + "epoch": 1.993215460526316, + "grad_norm": 1.2780050263373164, + "learning_rate": 2.5923729490306793e-05, + "loss": 3.6017, + "step": 19390 + }, + { + "epoch": 1.9942434210526314, + "grad_norm": 0.8795440733071885, + "learning_rate": 2.5888454766664732e-05, + "loss": 3.6143, + "step": 19400 + }, + { + "epoch": 1.9952713815789473, + "grad_norm": 0.6564535778902043, + "learning_rate": 2.5853198320492903e-05, + "loss": 3.6388, + "step": 19410 + }, + { + "epoch": 
1.9962993421052633, + "grad_norm": 0.8083241699787407, + "learning_rate": 2.5817960190583514e-05, + "loss": 3.5623, + "step": 19420 + }, + { + "epoch": 1.997327302631579, + "grad_norm": 1.0359880668070702, + "learning_rate": 2.5782740415708638e-05, + "loss": 3.6424, + "step": 19430 + }, + { + "epoch": 1.9983552631578947, + "grad_norm": 0.7905144881863724, + "learning_rate": 2.5747539034620138e-05, + "loss": 3.6505, + "step": 19440 + }, + { + "epoch": 1.9993832236842106, + "grad_norm": 1.0567220208764518, + "learning_rate": 2.5712356086049644e-05, + "loss": 3.5662, + "step": 19450 + }, + { + "epoch": 2.000411184210526, + "grad_norm": 0.9962745053712834, + "learning_rate": 2.5677191608708502e-05, + "loss": 3.5526, + "step": 19460 + }, + { + "epoch": 2.001439144736842, + "grad_norm": 0.9665403391791724, + "learning_rate": 2.564204564128773e-05, + "loss": 3.5819, + "step": 19470 + }, + { + "epoch": 2.002467105263158, + "grad_norm": 0.8307060469675815, + "learning_rate": 2.5606918222457994e-05, + "loss": 3.5966, + "step": 19480 + }, + { + "epoch": 2.003495065789474, + "grad_norm": 1.0168569282562443, + "learning_rate": 2.557180939086954e-05, + "loss": 3.6733, + "step": 19490 + }, + { + "epoch": 2.0045230263157894, + "grad_norm": 0.6396161719185065, + "learning_rate": 2.553671918515216e-05, + "loss": 3.65, + "step": 19500 + }, + { + "epoch": 2.0055509868421053, + "grad_norm": 0.9619078040524515, + "learning_rate": 2.5501647643915167e-05, + "loss": 3.5539, + "step": 19510 + }, + { + "epoch": 2.0065789473684212, + "grad_norm": 1.0273359755305285, + "learning_rate": 2.5466594805747323e-05, + "loss": 3.5872, + "step": 19520 + }, + { + "epoch": 2.0076069078947367, + "grad_norm": 1.0460232358361155, + "learning_rate": 2.5431560709216814e-05, + "loss": 3.5726, + "step": 19530 + }, + { + "epoch": 2.0086348684210527, + "grad_norm": 0.8863170907219301, + "learning_rate": 2.539654539287121e-05, + "loss": 3.547, + "step": 19540 + }, + { + "epoch": 2.0096628289473686, + "grad_norm": 0.7633089730139027, + "learning_rate": 2.5361548895237408e-05, + "loss": 3.5846, + "step": 19550 + }, + { + "epoch": 2.010690789473684, + "grad_norm": 0.9249515598088686, + "learning_rate": 2.5326571254821622e-05, + "loss": 3.5531, + "step": 19560 + }, + { + "epoch": 2.01171875, + "grad_norm": 0.8442199952156786, + "learning_rate": 2.529161251010929e-05, + "loss": 3.6967, + "step": 19570 + }, + { + "epoch": 2.012746710526316, + "grad_norm": 0.8991819531611759, + "learning_rate": 2.5256672699565062e-05, + "loss": 3.6351, + "step": 19580 + }, + { + "epoch": 2.0137746710526314, + "grad_norm": 1.1875201273768414, + "learning_rate": 2.5221751861632782e-05, + "loss": 3.6684, + "step": 19590 + }, + { + "epoch": 2.0148026315789473, + "grad_norm": 1.2760082110735702, + "learning_rate": 2.5186850034735386e-05, + "loss": 3.5628, + "step": 19600 + }, + { + "epoch": 2.0158305921052633, + "grad_norm": 1.2392761854368435, + "learning_rate": 2.5151967257274916e-05, + "loss": 3.5752, + "step": 19610 + }, + { + "epoch": 2.0168585526315788, + "grad_norm": 0.8844861732402767, + "learning_rate": 2.511710356763244e-05, + "loss": 3.5819, + "step": 19620 + }, + { + "epoch": 2.0178865131578947, + "grad_norm": 0.790741974391003, + "learning_rate": 2.508225900416803e-05, + "loss": 3.5691, + "step": 19630 + }, + { + "epoch": 2.0189144736842106, + "grad_norm": 1.5896067149724558, + "learning_rate": 2.504743360522071e-05, + "loss": 3.5416, + "step": 19640 + }, + { + "epoch": 2.019942434210526, + "grad_norm": 1.0202594363564423, + "learning_rate": 
2.5012627409108422e-05, + "loss": 3.6127, + "step": 19650 + }, + { + "epoch": 2.020970394736842, + "grad_norm": 0.8024601525963606, + "learning_rate": 2.4977840454127983e-05, + "loss": 3.6386, + "step": 19660 + }, + { + "epoch": 2.021998355263158, + "grad_norm": 0.9397516549540584, + "learning_rate": 2.4943072778555014e-05, + "loss": 3.6228, + "step": 19670 + }, + { + "epoch": 2.023026315789474, + "grad_norm": 0.900569343620844, + "learning_rate": 2.490832442064397e-05, + "loss": 3.5596, + "step": 19680 + }, + { + "epoch": 2.0240542763157894, + "grad_norm": 1.1314048176411178, + "learning_rate": 2.4873595418627997e-05, + "loss": 3.5484, + "step": 19690 + }, + { + "epoch": 2.0250822368421053, + "grad_norm": 0.9472293178246595, + "learning_rate": 2.4838885810718994e-05, + "loss": 3.5943, + "step": 19700 + }, + { + "epoch": 2.0261101973684212, + "grad_norm": 0.9211747047368559, + "learning_rate": 2.4804195635107488e-05, + "loss": 3.6842, + "step": 19710 + }, + { + "epoch": 2.0271381578947367, + "grad_norm": 1.3080090939004427, + "learning_rate": 2.476952492996263e-05, + "loss": 3.5619, + "step": 19720 + }, + { + "epoch": 2.0281661184210527, + "grad_norm": 1.095253864193569, + "learning_rate": 2.4734873733432157e-05, + "loss": 3.5872, + "step": 19730 + }, + { + "epoch": 2.0291940789473686, + "grad_norm": 0.7493470449061398, + "learning_rate": 2.470024208364233e-05, + "loss": 3.4416, + "step": 19740 + }, + { + "epoch": 2.030222039473684, + "grad_norm": 1.0809106636079673, + "learning_rate": 2.4665630018697922e-05, + "loss": 3.6186, + "step": 19750 + }, + { + "epoch": 2.03125, + "grad_norm": 1.3453111229866237, + "learning_rate": 2.4631037576682134e-05, + "loss": 3.5656, + "step": 19760 + }, + { + "epoch": 2.032277960526316, + "grad_norm": 0.9563958374713025, + "learning_rate": 2.4596464795656593e-05, + "loss": 3.634, + "step": 19770 + }, + { + "epoch": 2.0333059210526314, + "grad_norm": 1.0434380664806764, + "learning_rate": 2.4561911713661277e-05, + "loss": 3.5977, + "step": 19780 + }, + { + "epoch": 2.0343338815789473, + "grad_norm": 0.8917385969531769, + "learning_rate": 2.4527378368714504e-05, + "loss": 3.6172, + "step": 19790 + }, + { + "epoch": 2.0353618421052633, + "grad_norm": 1.135668659536145, + "learning_rate": 2.4492864798812875e-05, + "loss": 3.5883, + "step": 19800 + }, + { + "epoch": 2.0363898026315788, + "grad_norm": 0.7802037290293807, + "learning_rate": 2.4458371041931207e-05, + "loss": 3.6029, + "step": 19810 + }, + { + "epoch": 2.0374177631578947, + "grad_norm": 0.9208949100107248, + "learning_rate": 2.4423897136022595e-05, + "loss": 3.665, + "step": 19820 + }, + { + "epoch": 2.0384457236842106, + "grad_norm": 0.8436277442239749, + "learning_rate": 2.4389443119018166e-05, + "loss": 3.6394, + "step": 19830 + }, + { + "epoch": 2.039473684210526, + "grad_norm": 0.9962512321895317, + "learning_rate": 2.435500902882728e-05, + "loss": 3.5867, + "step": 19840 + }, + { + "epoch": 2.040501644736842, + "grad_norm": 0.8859359714286306, + "learning_rate": 2.43205949033373e-05, + "loss": 3.616, + "step": 19850 + }, + { + "epoch": 2.041529605263158, + "grad_norm": 0.8626066653319836, + "learning_rate": 2.428620078041364e-05, + "loss": 3.6958, + "step": 19860 + }, + { + "epoch": 2.042557565789474, + "grad_norm": 0.878246347860099, + "learning_rate": 2.425182669789977e-05, + "loss": 3.5663, + "step": 19870 + }, + { + "epoch": 2.0435855263157894, + "grad_norm": 0.7525236665292834, + "learning_rate": 2.4217472693616988e-05, + "loss": 3.6192, + "step": 19880 + }, + { + "epoch": 
2.0446134868421053, + "grad_norm": 0.9287709117501549, + "learning_rate": 2.4183138805364584e-05, + "loss": 3.6203, + "step": 19890 + }, + { + "epoch": 2.0456414473684212, + "grad_norm": 0.9741247344367273, + "learning_rate": 2.4148825070919707e-05, + "loss": 3.5597, + "step": 19900 + }, + { + "epoch": 2.0466694078947367, + "grad_norm": 1.1728393655693858, + "learning_rate": 2.411453152803729e-05, + "loss": 3.6327, + "step": 19910 + }, + { + "epoch": 2.0476973684210527, + "grad_norm": 1.0662224020746, + "learning_rate": 2.408025821445013e-05, + "loss": 3.5592, + "step": 19920 + }, + { + "epoch": 2.0487253289473686, + "grad_norm": 0.8331217179230176, + "learning_rate": 2.4046005167868666e-05, + "loss": 3.6502, + "step": 19930 + }, + { + "epoch": 2.049753289473684, + "grad_norm": 0.892314847018196, + "learning_rate": 2.401177242598111e-05, + "loss": 3.6437, + "step": 19940 + }, + { + "epoch": 2.05078125, + "grad_norm": 0.8862158067399399, + "learning_rate": 2.3977560026453292e-05, + "loss": 3.5805, + "step": 19950 + }, + { + "epoch": 2.051809210526316, + "grad_norm": 1.4505784064965828, + "learning_rate": 2.3943368006928683e-05, + "loss": 3.6597, + "step": 19960 + }, + { + "epoch": 2.0528371710526314, + "grad_norm": 1.1219834575453562, + "learning_rate": 2.3909196405028333e-05, + "loss": 3.6097, + "step": 19970 + }, + { + "epoch": 2.0538651315789473, + "grad_norm": 1.0636727648621702, + "learning_rate": 2.387504525835083e-05, + "loss": 3.5867, + "step": 19980 + }, + { + "epoch": 2.0548930921052633, + "grad_norm": 0.813316558452539, + "learning_rate": 2.3840914604472217e-05, + "loss": 3.6205, + "step": 19990 + }, + { + "epoch": 2.0559210526315788, + "grad_norm": 0.939047013851695, + "learning_rate": 2.380680448094601e-05, + "loss": 3.6202, + "step": 20000 + }, + { + "epoch": 2.0569490131578947, + "grad_norm": 1.15249575889287, + "learning_rate": 2.377271492530316e-05, + "loss": 3.6081, + "step": 20010 + }, + { + "epoch": 2.0579769736842106, + "grad_norm": 0.7173087971156245, + "learning_rate": 2.373864597505195e-05, + "loss": 3.5957, + "step": 20020 + }, + { + "epoch": 2.059004934210526, + "grad_norm": 1.0498207552367764, + "learning_rate": 2.370459766767805e-05, + "loss": 3.5563, + "step": 20030 + }, + { + "epoch": 2.060032894736842, + "grad_norm": 0.8020038111184719, + "learning_rate": 2.3670570040644325e-05, + "loss": 3.5964, + "step": 20040 + }, + { + "epoch": 2.061060855263158, + "grad_norm": 1.1821104014357358, + "learning_rate": 2.3636563131390963e-05, + "loss": 3.6553, + "step": 20050 + }, + { + "epoch": 2.062088815789474, + "grad_norm": 0.7767727542290902, + "learning_rate": 2.3602576977335323e-05, + "loss": 3.6059, + "step": 20060 + }, + { + "epoch": 2.0631167763157894, + "grad_norm": 0.770540147303544, + "learning_rate": 2.356861161587192e-05, + "loss": 3.5235, + "step": 20070 + }, + { + "epoch": 2.0641447368421053, + "grad_norm": 0.8764863673040527, + "learning_rate": 2.353466708437246e-05, + "loss": 3.5672, + "step": 20080 + }, + { + "epoch": 2.0651726973684212, + "grad_norm": 0.8701757888668937, + "learning_rate": 2.3500743420185616e-05, + "loss": 3.6032, + "step": 20090 + }, + { + "epoch": 2.0662006578947367, + "grad_norm": 0.7831885890610648, + "learning_rate": 2.3466840660637187e-05, + "loss": 3.6794, + "step": 20100 + }, + { + "epoch": 2.0672286184210527, + "grad_norm": 0.9753266105376703, + "learning_rate": 2.343295884302994e-05, + "loss": 3.6167, + "step": 20110 + }, + { + "epoch": 2.0682565789473686, + "grad_norm": 1.017744698683318, + "learning_rate": 
2.3399098004643586e-05, + "loss": 3.5717, + "step": 20120 + }, + { + "epoch": 2.069284539473684, + "grad_norm": 0.7745051865524091, + "learning_rate": 2.3365258182734815e-05, + "loss": 3.6487, + "step": 20130 + }, + { + "epoch": 2.0703125, + "grad_norm": 1.170868403134659, + "learning_rate": 2.333143941453713e-05, + "loss": 3.6697, + "step": 20140 + }, + { + "epoch": 2.071340460526316, + "grad_norm": 1.1756428481860846, + "learning_rate": 2.329764173726088e-05, + "loss": 3.6358, + "step": 20150 + }, + { + "epoch": 2.0723684210526314, + "grad_norm": 1.1145200028059896, + "learning_rate": 2.3263865188093204e-05, + "loss": 3.6244, + "step": 20160 + }, + { + "epoch": 2.0733963815789473, + "grad_norm": 1.0118675564356077, + "learning_rate": 2.3230109804198014e-05, + "loss": 3.6669, + "step": 20170 + }, + { + "epoch": 2.0744243421052633, + "grad_norm": 0.7632971427145341, + "learning_rate": 2.319637562271595e-05, + "loss": 3.6579, + "step": 20180 + }, + { + "epoch": 2.0754523026315788, + "grad_norm": 0.902809221176579, + "learning_rate": 2.3162662680764293e-05, + "loss": 3.5746, + "step": 20190 + }, + { + "epoch": 2.0764802631578947, + "grad_norm": 0.7224055087196338, + "learning_rate": 2.312897101543693e-05, + "loss": 3.6028, + "step": 20200 + }, + { + "epoch": 2.0775082236842106, + "grad_norm": 1.355669869664797, + "learning_rate": 2.3095300663804395e-05, + "loss": 3.5615, + "step": 20210 + }, + { + "epoch": 2.078536184210526, + "grad_norm": 1.108567817282518, + "learning_rate": 2.306165166291373e-05, + "loss": 3.5449, + "step": 20220 + }, + { + "epoch": 2.079564144736842, + "grad_norm": 0.919179304859733, + "learning_rate": 2.3028024049788508e-05, + "loss": 3.6123, + "step": 20230 + }, + { + "epoch": 2.080592105263158, + "grad_norm": 0.8090651976008878, + "learning_rate": 2.2994417861428785e-05, + "loss": 3.6314, + "step": 20240 + }, + { + "epoch": 2.081620065789474, + "grad_norm": 0.9498204651388343, + "learning_rate": 2.296083313481097e-05, + "loss": 3.5961, + "step": 20250 + }, + { + "epoch": 2.0826480263157894, + "grad_norm": 0.7050296206711005, + "learning_rate": 2.2927269906887936e-05, + "loss": 3.5798, + "step": 20260 + }, + { + "epoch": 2.0836759868421053, + "grad_norm": 1.1244886083708607, + "learning_rate": 2.2893728214588845e-05, + "loss": 3.6003, + "step": 20270 + }, + { + "epoch": 2.0847039473684212, + "grad_norm": 1.0736005079056439, + "learning_rate": 2.2860208094819233e-05, + "loss": 3.6027, + "step": 20280 + }, + { + "epoch": 2.0857319078947367, + "grad_norm": 0.9393367893571902, + "learning_rate": 2.282670958446084e-05, + "loss": 3.5653, + "step": 20290 + }, + { + "epoch": 2.0867598684210527, + "grad_norm": 0.8286678046752939, + "learning_rate": 2.279323272037166e-05, + "loss": 3.4929, + "step": 20300 + }, + { + "epoch": 2.0877878289473686, + "grad_norm": 0.798281494248107, + "learning_rate": 2.2759777539385823e-05, + "loss": 3.5591, + "step": 20310 + }, + { + "epoch": 2.088815789473684, + "grad_norm": 0.7538780246649274, + "learning_rate": 2.2726344078313644e-05, + "loss": 3.594, + "step": 20320 + }, + { + "epoch": 2.08984375, + "grad_norm": 1.235433679799125, + "learning_rate": 2.2692932373941555e-05, + "loss": 3.7131, + "step": 20330 + }, + { + "epoch": 2.090871710526316, + "grad_norm": 0.8333660397469174, + "learning_rate": 2.2659542463032018e-05, + "loss": 3.5819, + "step": 20340 + }, + { + "epoch": 2.0918996710526314, + "grad_norm": 0.880649279387646, + "learning_rate": 2.262617438232353e-05, + "loss": 3.636, + "step": 20350 + }, + { + "epoch": 2.0929276315789473, + 
"grad_norm": 0.8797797376610004, + "learning_rate": 2.2592828168530545e-05, + "loss": 3.6091, + "step": 20360 + }, + { + "epoch": 2.0939555921052633, + "grad_norm": 1.005684076237115, + "learning_rate": 2.2559503858343457e-05, + "loss": 3.6117, + "step": 20370 + }, + { + "epoch": 2.0949835526315788, + "grad_norm": 1.118074238596709, + "learning_rate": 2.252620148842862e-05, + "loss": 3.6115, + "step": 20380 + }, + { + "epoch": 2.0960115131578947, + "grad_norm": 1.0264868698246579, + "learning_rate": 2.2492921095428192e-05, + "loss": 3.6363, + "step": 20390 + }, + { + "epoch": 2.0970394736842106, + "grad_norm": 0.8738929693780616, + "learning_rate": 2.2459662715960162e-05, + "loss": 3.5051, + "step": 20400 + }, + { + "epoch": 2.098067434210526, + "grad_norm": 0.948182313327183, + "learning_rate": 2.2426426386618286e-05, + "loss": 3.6599, + "step": 20410 + }, + { + "epoch": 2.099095394736842, + "grad_norm": 0.7183624134756592, + "learning_rate": 2.2393212143972064e-05, + "loss": 3.5998, + "step": 20420 + }, + { + "epoch": 2.100123355263158, + "grad_norm": 1.0503599889450512, + "learning_rate": 2.2360020024566724e-05, + "loss": 3.5836, + "step": 20430 + }, + { + "epoch": 2.101151315789474, + "grad_norm": 0.907217265924263, + "learning_rate": 2.232685006492313e-05, + "loss": 3.5612, + "step": 20440 + }, + { + "epoch": 2.1021792763157894, + "grad_norm": 1.2552884763508558, + "learning_rate": 2.2293702301537748e-05, + "loss": 3.6623, + "step": 20450 + }, + { + "epoch": 2.1032072368421053, + "grad_norm": 0.8628840849021124, + "learning_rate": 2.2260576770882655e-05, + "loss": 3.5882, + "step": 20460 + }, + { + "epoch": 2.1042351973684212, + "grad_norm": 1.117449056245034, + "learning_rate": 2.2227473509405418e-05, + "loss": 3.5874, + "step": 20470 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 1.0655500726842737, + "learning_rate": 2.2194392553529157e-05, + "loss": 3.6594, + "step": 20480 + }, + { + "epoch": 2.1062911184210527, + "grad_norm": 0.8899976294196629, + "learning_rate": 2.216133393965242e-05, + "loss": 3.6782, + "step": 20490 + }, + { + "epoch": 2.1073190789473686, + "grad_norm": 0.7491980477218528, + "learning_rate": 2.2128297704149166e-05, + "loss": 3.6377, + "step": 20500 + }, + { + "epoch": 2.108347039473684, + "grad_norm": 1.3945024417925576, + "learning_rate": 2.2095283883368762e-05, + "loss": 3.6378, + "step": 20510 + }, + { + "epoch": 2.109375, + "grad_norm": 1.110821707441233, + "learning_rate": 2.2062292513635858e-05, + "loss": 3.6019, + "step": 20520 + }, + { + "epoch": 2.110402960526316, + "grad_norm": 1.0524732163163262, + "learning_rate": 2.202932363125046e-05, + "loss": 3.6326, + "step": 20530 + }, + { + "epoch": 2.1114309210526314, + "grad_norm": 1.1418835737870103, + "learning_rate": 2.19963772724878e-05, + "loss": 3.5658, + "step": 20540 + }, + { + "epoch": 2.1124588815789473, + "grad_norm": 0.7161987470094425, + "learning_rate": 2.196345347359834e-05, + "loss": 3.6054, + "step": 20550 + }, + { + "epoch": 2.1134868421052633, + "grad_norm": 0.8741267819816704, + "learning_rate": 2.1930552270807724e-05, + "loss": 3.5423, + "step": 20560 + }, + { + "epoch": 2.1145148026315788, + "grad_norm": 0.8262470833384965, + "learning_rate": 2.1897673700316685e-05, + "loss": 3.6426, + "step": 20570 + }, + { + "epoch": 2.1155427631578947, + "grad_norm": 0.974070367472319, + "learning_rate": 2.186481779830111e-05, + "loss": 3.5818, + "step": 20580 + }, + { + "epoch": 2.1165707236842106, + "grad_norm": 1.046174227615251, + "learning_rate": 2.1831984600911942e-05, + "loss": 
3.6205, + "step": 20590 + }, + { + "epoch": 2.117598684210526, + "grad_norm": 0.6740859934214798, + "learning_rate": 2.1799174144275127e-05, + "loss": 3.5517, + "step": 20600 + }, + { + "epoch": 2.118626644736842, + "grad_norm": 1.1779059289893306, + "learning_rate": 2.176638646449157e-05, + "loss": 3.6373, + "step": 20610 + }, + { + "epoch": 2.119654605263158, + "grad_norm": 0.8401979985647102, + "learning_rate": 2.1733621597637163e-05, + "loss": 3.5295, + "step": 20620 + }, + { + "epoch": 2.120682565789474, + "grad_norm": 0.7926145993596974, + "learning_rate": 2.1700879579762617e-05, + "loss": 3.612, + "step": 20630 + }, + { + "epoch": 2.1217105263157894, + "grad_norm": 0.9774813487329664, + "learning_rate": 2.1668160446893608e-05, + "loss": 3.5316, + "step": 20640 + }, + { + "epoch": 2.1227384868421053, + "grad_norm": 0.9449687994998556, + "learning_rate": 2.163546423503056e-05, + "loss": 3.6155, + "step": 20650 + }, + { + "epoch": 2.1237664473684212, + "grad_norm": 1.0220547719189848, + "learning_rate": 2.1602790980148693e-05, + "loss": 3.6252, + "step": 20660 + }, + { + "epoch": 2.1247944078947367, + "grad_norm": 1.2496189490306593, + "learning_rate": 2.1570140718197984e-05, + "loss": 3.639, + "step": 20670 + }, + { + "epoch": 2.1258223684210527, + "grad_norm": 0.8046679448094155, + "learning_rate": 2.1537513485103057e-05, + "loss": 3.6061, + "step": 20680 + }, + { + "epoch": 2.1268503289473686, + "grad_norm": 1.0805988953296155, + "learning_rate": 2.1504909316763288e-05, + "loss": 3.6684, + "step": 20690 + }, + { + "epoch": 2.127878289473684, + "grad_norm": 2.7567819717752045, + "learning_rate": 2.1472328249052597e-05, + "loss": 3.5991, + "step": 20700 + }, + { + "epoch": 2.12890625, + "grad_norm": 0.9122556777363432, + "learning_rate": 2.1439770317819524e-05, + "loss": 3.6036, + "step": 20710 + }, + { + "epoch": 2.129934210526316, + "grad_norm": 1.2283532743418335, + "learning_rate": 2.1407235558887155e-05, + "loss": 3.5934, + "step": 20720 + }, + { + "epoch": 2.1309621710526314, + "grad_norm": 1.090717908514378, + "learning_rate": 2.1374724008053026e-05, + "loss": 3.5882, + "step": 20730 + }, + { + "epoch": 2.1319901315789473, + "grad_norm": 0.8697645712342146, + "learning_rate": 2.1342235701089224e-05, + "loss": 3.659, + "step": 20740 + }, + { + "epoch": 2.1330180921052633, + "grad_norm": 1.083935985687963, + "learning_rate": 2.13097706737422e-05, + "loss": 3.683, + "step": 20750 + }, + { + "epoch": 2.1340460526315788, + "grad_norm": 1.203979535505566, + "learning_rate": 2.1277328961732806e-05, + "loss": 3.5601, + "step": 20760 + }, + { + "epoch": 2.1350740131578947, + "grad_norm": 1.0663731049357719, + "learning_rate": 2.124491060075625e-05, + "loss": 3.6083, + "step": 20770 + }, + { + "epoch": 2.1361019736842106, + "grad_norm": 1.4757208468998646, + "learning_rate": 2.121251562648203e-05, + "loss": 3.6122, + "step": 20780 + }, + { + "epoch": 2.1371299342105265, + "grad_norm": 0.6440915544185684, + "learning_rate": 2.118014407455392e-05, + "loss": 3.6036, + "step": 20790 + }, + { + "epoch": 2.138157894736842, + "grad_norm": 1.175835744412386, + "learning_rate": 2.114779598058993e-05, + "loss": 3.687, + "step": 20800 + }, + { + "epoch": 2.139185855263158, + "grad_norm": 1.1201064916630519, + "learning_rate": 2.1115471380182243e-05, + "loss": 3.611, + "step": 20810 + }, + { + "epoch": 2.1402138157894735, + "grad_norm": 0.842470222818532, + "learning_rate": 2.1083170308897215e-05, + "loss": 3.6216, + "step": 20820 + }, + { + "epoch": 2.1412417763157894, + "grad_norm": 
0.917475498870171, + "learning_rate": 2.10508928022753e-05, + "loss": 3.6201, + "step": 20830 + }, + { + "epoch": 2.1422697368421053, + "grad_norm": 0.7846973273642917, + "learning_rate": 2.1018638895831013e-05, + "loss": 3.5944, + "step": 20840 + }, + { + "epoch": 2.1432976973684212, + "grad_norm": 1.2085312593526318, + "learning_rate": 2.0986408625052918e-05, + "loss": 3.5799, + "step": 20850 + }, + { + "epoch": 2.1443256578947367, + "grad_norm": 1.0425143162684307, + "learning_rate": 2.0954202025403577e-05, + "loss": 3.6492, + "step": 20860 + }, + { + "epoch": 2.1453536184210527, + "grad_norm": 0.8744523173594062, + "learning_rate": 2.092201913231949e-05, + "loss": 3.5844, + "step": 20870 + }, + { + "epoch": 2.1463815789473686, + "grad_norm": 0.9222338349850847, + "learning_rate": 2.088985998121109e-05, + "loss": 3.5376, + "step": 20880 + }, + { + "epoch": 2.147409539473684, + "grad_norm": 1.318219484042214, + "learning_rate": 2.085772460746267e-05, + "loss": 3.5819, + "step": 20890 + }, + { + "epoch": 2.1484375, + "grad_norm": 1.1974059080297685, + "learning_rate": 2.0825613046432366e-05, + "loss": 3.5721, + "step": 20900 + }, + { + "epoch": 2.149465460526316, + "grad_norm": 1.3235563799036638, + "learning_rate": 2.0793525333452136e-05, + "loss": 3.5271, + "step": 20910 + }, + { + "epoch": 2.1504934210526314, + "grad_norm": 0.9147325651785811, + "learning_rate": 2.0761461503827656e-05, + "loss": 3.6204, + "step": 20920 + }, + { + "epoch": 2.1515213815789473, + "grad_norm": 0.9857470018523613, + "learning_rate": 2.072942159283836e-05, + "loss": 3.5549, + "step": 20930 + }, + { + "epoch": 2.1525493421052633, + "grad_norm": 1.130938802245614, + "learning_rate": 2.069740563573735e-05, + "loss": 3.5786, + "step": 20940 + }, + { + "epoch": 2.1535773026315788, + "grad_norm": 1.0526913437691896, + "learning_rate": 2.066541366775137e-05, + "loss": 3.5879, + "step": 20950 + }, + { + "epoch": 2.1546052631578947, + "grad_norm": 0.9071955187919821, + "learning_rate": 2.0633445724080773e-05, + "loss": 3.6184, + "step": 20960 + }, + { + "epoch": 2.1556332236842106, + "grad_norm": 0.9331695358982615, + "learning_rate": 2.060150183989948e-05, + "loss": 3.5844, + "step": 20970 + }, + { + "epoch": 2.1566611842105265, + "grad_norm": 0.6358806266003307, + "learning_rate": 2.0569582050354944e-05, + "loss": 3.5232, + "step": 20980 + }, + { + "epoch": 2.157689144736842, + "grad_norm": 1.1697970978137215, + "learning_rate": 2.0537686390568085e-05, + "loss": 3.6444, + "step": 20990 + }, + { + "epoch": 2.158717105263158, + "grad_norm": 0.7995298330217292, + "learning_rate": 2.050581489563329e-05, + "loss": 3.5997, + "step": 21000 + }, + { + "epoch": 2.1597450657894735, + "grad_norm": 0.8892812128002329, + "learning_rate": 2.0473967600618368e-05, + "loss": 3.5501, + "step": 21010 + }, + { + "epoch": 2.1607730263157894, + "grad_norm": 1.3530114405952571, + "learning_rate": 2.044214454056448e-05, + "loss": 3.5565, + "step": 21020 + }, + { + "epoch": 2.1618009868421053, + "grad_norm": 0.917498204180132, + "learning_rate": 2.0410345750486136e-05, + "loss": 3.5856, + "step": 21030 + }, + { + "epoch": 2.1628289473684212, + "grad_norm": 0.8049075372087812, + "learning_rate": 2.0378571265371132e-05, + "loss": 3.6692, + "step": 21040 + }, + { + "epoch": 2.1638569078947367, + "grad_norm": 0.8957983313235569, + "learning_rate": 2.034682112018053e-05, + "loss": 3.5772, + "step": 21050 + }, + { + "epoch": 2.1648848684210527, + "grad_norm": 0.8287497269297707, + "learning_rate": 2.031509534984861e-05, + "loss": 3.4815, + 
"step": 21060 + }, + { + "epoch": 2.1659128289473686, + "grad_norm": 1.2743408649806405, + "learning_rate": 2.0283393989282824e-05, + "loss": 3.5634, + "step": 21070 + }, + { + "epoch": 2.166940789473684, + "grad_norm": 0.7989689858997578, + "learning_rate": 2.0251717073363774e-05, + "loss": 3.5224, + "step": 21080 + }, + { + "epoch": 2.16796875, + "grad_norm": 0.7703823282344188, + "learning_rate": 2.0220064636945173e-05, + "loss": 3.5791, + "step": 21090 + }, + { + "epoch": 2.168996710526316, + "grad_norm": 1.2470754744289212, + "learning_rate": 2.018843671485378e-05, + "loss": 3.5976, + "step": 21100 + }, + { + "epoch": 2.1700246710526314, + "grad_norm": 1.3065733386221774, + "learning_rate": 2.01568333418894e-05, + "loss": 3.5441, + "step": 21110 + }, + { + "epoch": 2.1710526315789473, + "grad_norm": 0.6869471017345049, + "learning_rate": 2.0125254552824814e-05, + "loss": 3.5916, + "step": 21120 + }, + { + "epoch": 2.1720805921052633, + "grad_norm": 1.6553311948348903, + "learning_rate": 2.0093700382405757e-05, + "loss": 3.5741, + "step": 21130 + }, + { + "epoch": 2.1731085526315788, + "grad_norm": 0.9176531309974079, + "learning_rate": 2.0062170865350886e-05, + "loss": 3.5806, + "step": 21140 + }, + { + "epoch": 2.1741365131578947, + "grad_norm": 0.7228646879903611, + "learning_rate": 2.0030666036351718e-05, + "loss": 3.628, + "step": 21150 + }, + { + "epoch": 2.1751644736842106, + "grad_norm": 1.0870613485043001, + "learning_rate": 1.9999185930072614e-05, + "loss": 3.556, + "step": 21160 + }, + { + "epoch": 2.1761924342105265, + "grad_norm": 1.0264621912054848, + "learning_rate": 1.996773058115073e-05, + "loss": 3.6567, + "step": 21170 + }, + { + "epoch": 2.177220394736842, + "grad_norm": 0.8969297611651514, + "learning_rate": 1.9936300024195988e-05, + "loss": 3.5428, + "step": 21180 + }, + { + "epoch": 2.178248355263158, + "grad_norm": 1.0044767784673894, + "learning_rate": 1.990489429379102e-05, + "loss": 3.5093, + "step": 21190 + }, + { + "epoch": 2.1792763157894735, + "grad_norm": 1.269987288509042, + "learning_rate": 1.9873513424491152e-05, + "loss": 3.5271, + "step": 21200 + }, + { + "epoch": 2.1803042763157894, + "grad_norm": 0.9328543316584498, + "learning_rate": 1.9842157450824352e-05, + "loss": 3.5576, + "step": 21210 + }, + { + "epoch": 2.1813322368421053, + "grad_norm": 0.8562825134118782, + "learning_rate": 1.9810826407291203e-05, + "loss": 3.5232, + "step": 21220 + }, + { + "epoch": 2.1823601973684212, + "grad_norm": 0.7527926823789403, + "learning_rate": 1.9779520328364835e-05, + "loss": 3.6963, + "step": 21230 + }, + { + "epoch": 2.1833881578947367, + "grad_norm": 0.7506952242435997, + "learning_rate": 1.9748239248490934e-05, + "loss": 3.6006, + "step": 21240 + }, + { + "epoch": 2.1844161184210527, + "grad_norm": 0.8803987842506358, + "learning_rate": 1.9716983202087673e-05, + "loss": 3.6116, + "step": 21250 + }, + { + "epoch": 2.1854440789473686, + "grad_norm": 0.9085783808712964, + "learning_rate": 1.968575222354568e-05, + "loss": 3.6312, + "step": 21260 + }, + { + "epoch": 2.186472039473684, + "grad_norm": 0.788214312642359, + "learning_rate": 1.9654546347227986e-05, + "loss": 3.5824, + "step": 21270 + }, + { + "epoch": 2.1875, + "grad_norm": 1.1577435203936002, + "learning_rate": 1.9623365607470038e-05, + "loss": 3.6828, + "step": 21280 + }, + { + "epoch": 2.188527960526316, + "grad_norm": 0.8266554695442676, + "learning_rate": 1.9592210038579592e-05, + "loss": 3.5259, + "step": 21290 + }, + { + "epoch": 2.1895559210526314, + "grad_norm": 1.1949737168458274, + 
"learning_rate": 1.9561079674836706e-05, + "loss": 3.5785, + "step": 21300 + }, + { + "epoch": 2.1905838815789473, + "grad_norm": 0.9449165082308387, + "learning_rate": 1.952997455049377e-05, + "loss": 3.6507, + "step": 21310 + }, + { + "epoch": 2.1916118421052633, + "grad_norm": 1.3411644746046585, + "learning_rate": 1.9498894699775298e-05, + "loss": 3.5837, + "step": 21320 + }, + { + "epoch": 2.1926398026315788, + "grad_norm": 1.04192188315352, + "learning_rate": 1.946784015687808e-05, + "loss": 3.6129, + "step": 21330 + }, + { + "epoch": 2.1936677631578947, + "grad_norm": 1.0296244091103097, + "learning_rate": 1.943681095597102e-05, + "loss": 3.5491, + "step": 21340 + }, + { + "epoch": 2.1946957236842106, + "grad_norm": 0.9891842723448389, + "learning_rate": 1.9405807131195147e-05, + "loss": 3.6138, + "step": 21350 + }, + { + "epoch": 2.1957236842105265, + "grad_norm": 0.9512513615100957, + "learning_rate": 1.9374828716663602e-05, + "loss": 3.5361, + "step": 21360 + }, + { + "epoch": 2.196751644736842, + "grad_norm": 0.7689946428888687, + "learning_rate": 1.9343875746461505e-05, + "loss": 3.629, + "step": 21370 + }, + { + "epoch": 2.197779605263158, + "grad_norm": 0.7735284694001667, + "learning_rate": 1.9312948254646023e-05, + "loss": 3.5309, + "step": 21380 + }, + { + "epoch": 2.1988075657894735, + "grad_norm": 1.1430240119721393, + "learning_rate": 1.9282046275246275e-05, + "loss": 3.5727, + "step": 21390 + }, + { + "epoch": 2.1998355263157894, + "grad_norm": 0.8681321025582558, + "learning_rate": 1.9251169842263303e-05, + "loss": 3.6034, + "step": 21400 + }, + { + "epoch": 2.2008634868421053, + "grad_norm": 0.7458445747068208, + "learning_rate": 1.9220318989670093e-05, + "loss": 3.67, + "step": 21410 + }, + { + "epoch": 2.2018914473684212, + "grad_norm": 0.9352249196780474, + "learning_rate": 1.9189493751411392e-05, + "loss": 3.6181, + "step": 21420 + }, + { + "epoch": 2.2029194078947367, + "grad_norm": 0.8452619498716104, + "learning_rate": 1.915869416140384e-05, + "loss": 3.5757, + "step": 21430 + }, + { + "epoch": 2.2039473684210527, + "grad_norm": 0.9736451786216319, + "learning_rate": 1.9127920253535825e-05, + "loss": 3.653, + "step": 21440 + }, + { + "epoch": 2.2049753289473686, + "grad_norm": 1.260796271714638, + "learning_rate": 1.9097172061667472e-05, + "loss": 3.5592, + "step": 21450 + }, + { + "epoch": 2.206003289473684, + "grad_norm": 1.1458112309954007, + "learning_rate": 1.9066449619630647e-05, + "loss": 3.5631, + "step": 21460 + }, + { + "epoch": 2.20703125, + "grad_norm": 0.8317781161803787, + "learning_rate": 1.903575296122886e-05, + "loss": 3.626, + "step": 21470 + }, + { + "epoch": 2.208059210526316, + "grad_norm": 1.493332747916266, + "learning_rate": 1.900508212023722e-05, + "loss": 3.7293, + "step": 21480 + }, + { + "epoch": 2.2090871710526314, + "grad_norm": 1.0375457248033646, + "learning_rate": 1.8974437130402484e-05, + "loss": 3.6129, + "step": 21490 + }, + { + "epoch": 2.2101151315789473, + "grad_norm": 1.16450256631715, + "learning_rate": 1.894381802544293e-05, + "loss": 3.5807, + "step": 21500 + }, + { + "epoch": 2.2111430921052633, + "grad_norm": 0.869346577106367, + "learning_rate": 1.8913224839048363e-05, + "loss": 3.6373, + "step": 21510 + }, + { + "epoch": 2.2121710526315788, + "grad_norm": 0.9260914600717942, + "learning_rate": 1.8882657604880102e-05, + "loss": 3.5619, + "step": 21520 + }, + { + "epoch": 2.2131990131578947, + "grad_norm": 1.2469361904493623, + "learning_rate": 1.8852116356570854e-05, + "loss": 3.482, + "step": 21530 + }, + { + 
"epoch": 2.2142269736842106, + "grad_norm": 1.1448193600850953, + "learning_rate": 1.8821601127724768e-05, + "loss": 3.6046, + "step": 21540 + }, + { + "epoch": 2.2152549342105265, + "grad_norm": 0.8365396844077511, + "learning_rate": 1.8791111951917368e-05, + "loss": 3.5455, + "step": 21550 + }, + { + "epoch": 2.216282894736842, + "grad_norm": 0.7783857866075559, + "learning_rate": 1.8760648862695484e-05, + "loss": 3.5336, + "step": 21560 + }, + { + "epoch": 2.217310855263158, + "grad_norm": 0.8431589741008763, + "learning_rate": 1.87302118935773e-05, + "loss": 3.5975, + "step": 21570 + }, + { + "epoch": 2.2183388157894735, + "grad_norm": 0.7449662641913207, + "learning_rate": 1.8699801078052188e-05, + "loss": 3.5962, + "step": 21580 + }, + { + "epoch": 2.2193667763157894, + "grad_norm": 0.9782388566023696, + "learning_rate": 1.8669416449580787e-05, + "loss": 3.5027, + "step": 21590 + }, + { + "epoch": 2.2203947368421053, + "grad_norm": 0.9579467753993359, + "learning_rate": 1.8639058041594916e-05, + "loss": 3.5153, + "step": 21600 + }, + { + "epoch": 2.2214226973684212, + "grad_norm": 1.4533560784481996, + "learning_rate": 1.8608725887497536e-05, + "loss": 3.6156, + "step": 21610 + }, + { + "epoch": 2.2224506578947367, + "grad_norm": 1.036739707338245, + "learning_rate": 1.857842002066274e-05, + "loss": 3.514, + "step": 21620 + }, + { + "epoch": 2.2234786184210527, + "grad_norm": 1.0151131734094514, + "learning_rate": 1.85481404744357e-05, + "loss": 3.6723, + "step": 21630 + }, + { + "epoch": 2.2245065789473686, + "grad_norm": 1.1348342743427708, + "learning_rate": 1.8517887282132575e-05, + "loss": 3.534, + "step": 21640 + }, + { + "epoch": 2.225534539473684, + "grad_norm": 0.8120721277279681, + "learning_rate": 1.848766047704058e-05, + "loss": 3.6643, + "step": 21650 + }, + { + "epoch": 2.2265625, + "grad_norm": 1.0186137898196101, + "learning_rate": 1.8457460092417865e-05, + "loss": 3.527, + "step": 21660 + }, + { + "epoch": 2.227590460526316, + "grad_norm": 1.3887041520070997, + "learning_rate": 1.8427286161493564e-05, + "loss": 3.5612, + "step": 21670 + }, + { + "epoch": 2.2286184210526314, + "grad_norm": 0.9897997201281321, + "learning_rate": 1.8397138717467655e-05, + "loss": 3.592, + "step": 21680 + }, + { + "epoch": 2.2296463815789473, + "grad_norm": 1.0613580064213959, + "learning_rate": 1.836701779351096e-05, + "loss": 3.5481, + "step": 21690 + }, + { + "epoch": 2.2306743421052633, + "grad_norm": 1.0693185465195412, + "learning_rate": 1.8336923422765162e-05, + "loss": 3.5747, + "step": 21700 + }, + { + "epoch": 2.2317023026315788, + "grad_norm": 0.6526359771582743, + "learning_rate": 1.8306855638342697e-05, + "loss": 3.5895, + "step": 21710 + }, + { + "epoch": 2.2327302631578947, + "grad_norm": 0.6993705542101787, + "learning_rate": 1.827681447332678e-05, + "loss": 3.5827, + "step": 21720 + }, + { + "epoch": 2.2337582236842106, + "grad_norm": 0.816816892884859, + "learning_rate": 1.8246799960771336e-05, + "loss": 3.6218, + "step": 21730 + }, + { + "epoch": 2.2347861842105265, + "grad_norm": 0.6496415965793717, + "learning_rate": 1.8216812133700913e-05, + "loss": 3.5766, + "step": 21740 + }, + { + "epoch": 2.235814144736842, + "grad_norm": 1.0403845271497363, + "learning_rate": 1.8186851025110745e-05, + "loss": 3.5942, + "step": 21750 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 1.2019582763799517, + "learning_rate": 1.8156916667966644e-05, + "loss": 3.64, + "step": 21760 + }, + { + "epoch": 2.2378700657894735, + "grad_norm": 0.741225653340207, + "learning_rate": 
1.8127009095205025e-05, + "loss": 3.4611, + "step": 21770 + }, + { + "epoch": 2.2388980263157894, + "grad_norm": 1.3989958261259912, + "learning_rate": 1.8097128339732796e-05, + "loss": 3.5797, + "step": 21780 + }, + { + "epoch": 2.2399259868421053, + "grad_norm": 1.05245435738079, + "learning_rate": 1.8067274434427377e-05, + "loss": 3.5206, + "step": 21790 + }, + { + "epoch": 2.2409539473684212, + "grad_norm": 0.9455684283896018, + "learning_rate": 1.8037447412136616e-05, + "loss": 3.5465, + "step": 21800 + }, + { + "epoch": 2.2419819078947367, + "grad_norm": 0.9244641298315883, + "learning_rate": 1.8007647305678807e-05, + "loss": 3.6203, + "step": 21810 + }, + { + "epoch": 2.2430098684210527, + "grad_norm": 0.7207616353255234, + "learning_rate": 1.7977874147842637e-05, + "loss": 3.5609, + "step": 21820 + }, + { + "epoch": 2.2440378289473686, + "grad_norm": 0.7726308295809647, + "learning_rate": 1.7948127971387134e-05, + "loss": 3.4969, + "step": 21830 + }, + { + "epoch": 2.245065789473684, + "grad_norm": 1.1893771042356656, + "learning_rate": 1.7918408809041632e-05, + "loss": 3.6083, + "step": 21840 + }, + { + "epoch": 2.24609375, + "grad_norm": 0.928374862432566, + "learning_rate": 1.788871669350573e-05, + "loss": 3.5599, + "step": 21850 + }, + { + "epoch": 2.247121710526316, + "grad_norm": 1.2869650220022923, + "learning_rate": 1.7859051657449277e-05, + "loss": 3.5371, + "step": 21860 + }, + { + "epoch": 2.2481496710526314, + "grad_norm": 0.7173899761603554, + "learning_rate": 1.782941373351236e-05, + "loss": 3.5457, + "step": 21870 + }, + { + "epoch": 2.2491776315789473, + "grad_norm": 0.7302492597341785, + "learning_rate": 1.7799802954305188e-05, + "loss": 3.6247, + "step": 21880 + }, + { + "epoch": 2.2502055921052633, + "grad_norm": 0.7289790093679588, + "learning_rate": 1.777021935240814e-05, + "loss": 3.5148, + "step": 21890 + }, + { + "epoch": 2.2512335526315788, + "grad_norm": 1.0198267366005078, + "learning_rate": 1.7740662960371645e-05, + "loss": 3.5299, + "step": 21900 + }, + { + "epoch": 2.2522615131578947, + "grad_norm": 0.8220927222041696, + "learning_rate": 1.771113381071623e-05, + "loss": 3.543, + "step": 21910 + }, + { + "epoch": 2.2532894736842106, + "grad_norm": 1.0619004745975207, + "learning_rate": 1.7681631935932455e-05, + "loss": 3.6146, + "step": 21920 + }, + { + "epoch": 2.2543174342105265, + "grad_norm": 1.129079204892921, + "learning_rate": 1.765215736848085e-05, + "loss": 3.5539, + "step": 21930 + }, + { + "epoch": 2.255345394736842, + "grad_norm": 0.7642204480258606, + "learning_rate": 1.7622710140791896e-05, + "loss": 3.5487, + "step": 21940 + }, + { + "epoch": 2.256373355263158, + "grad_norm": 0.7944847235846445, + "learning_rate": 1.7593290285266015e-05, + "loss": 3.5503, + "step": 21950 + }, + { + "epoch": 2.2574013157894735, + "grad_norm": 1.108855327260975, + "learning_rate": 1.7563897834273465e-05, + "loss": 3.6168, + "step": 21960 + }, + { + "epoch": 2.2584292763157894, + "grad_norm": 0.937059744140383, + "learning_rate": 1.7534532820154414e-05, + "loss": 3.5941, + "step": 21970 + }, + { + "epoch": 2.2594572368421053, + "grad_norm": 0.6904472553872391, + "learning_rate": 1.7505195275218802e-05, + "loss": 3.5726, + "step": 21980 + }, + { + "epoch": 2.2604851973684212, + "grad_norm": 0.7373174198895459, + "learning_rate": 1.7475885231746346e-05, + "loss": 3.4501, + "step": 21990 + }, + { + "epoch": 2.2615131578947367, + "grad_norm": 0.9556271899680373, + "learning_rate": 1.7446602721986536e-05, + "loss": 3.5369, + "step": 22000 + }, + { + "epoch": 
2.2625411184210527, + "grad_norm": 0.8695849707940962, + "learning_rate": 1.7417347778158495e-05, + "loss": 3.6224, + "step": 22010 + }, + { + "epoch": 2.2635690789473686, + "grad_norm": 1.0675434016175447, + "learning_rate": 1.73881204324511e-05, + "loss": 3.5443, + "step": 22020 + }, + { + "epoch": 2.264597039473684, + "grad_norm": 0.7736582200275314, + "learning_rate": 1.735892071702282e-05, + "loss": 3.5415, + "step": 22030 + }, + { + "epoch": 2.265625, + "grad_norm": 0.8152822029915034, + "learning_rate": 1.732974866400172e-05, + "loss": 3.5491, + "step": 22040 + }, + { + "epoch": 2.266652960526316, + "grad_norm": 0.691984549019151, + "learning_rate": 1.730060430548545e-05, + "loss": 3.6144, + "step": 22050 + }, + { + "epoch": 2.2676809210526314, + "grad_norm": 0.9081719505948359, + "learning_rate": 1.7271487673541152e-05, + "loss": 3.6217, + "step": 22060 + }, + { + "epoch": 2.2687088815789473, + "grad_norm": 1.1287671171730227, + "learning_rate": 1.7242398800205484e-05, + "loss": 3.6213, + "step": 22070 + }, + { + "epoch": 2.2697368421052633, + "grad_norm": 1.2052849541093662, + "learning_rate": 1.7213337717484572e-05, + "loss": 3.564, + "step": 22080 + }, + { + "epoch": 2.2707648026315788, + "grad_norm": 1.2500509728529192, + "learning_rate": 1.718430445735396e-05, + "loss": 3.4831, + "step": 22090 + }, + { + "epoch": 2.2717927631578947, + "grad_norm": 0.9901005581156741, + "learning_rate": 1.715529905175856e-05, + "loss": 3.6239, + "step": 22100 + }, + { + "epoch": 2.2728207236842106, + "grad_norm": 1.158194118934341, + "learning_rate": 1.712632153261266e-05, + "loss": 3.5409, + "step": 22110 + }, + { + "epoch": 2.2738486842105265, + "grad_norm": 1.134073279322474, + "learning_rate": 1.709737193179982e-05, + "loss": 3.5007, + "step": 22120 + }, + { + "epoch": 2.274876644736842, + "grad_norm": 0.7838320977197677, + "learning_rate": 1.7068450281172954e-05, + "loss": 3.6415, + "step": 22130 + }, + { + "epoch": 2.275904605263158, + "grad_norm": 0.8712670192397104, + "learning_rate": 1.7039556612554158e-05, + "loss": 3.6427, + "step": 22140 + }, + { + "epoch": 2.2769325657894735, + "grad_norm": 0.9632727307811517, + "learning_rate": 1.7010690957734775e-05, + "loss": 3.5679, + "step": 22150 + }, + { + "epoch": 2.2779605263157894, + "grad_norm": 0.9503003774896495, + "learning_rate": 1.6981853348475333e-05, + "loss": 3.6411, + "step": 22160 + }, + { + "epoch": 2.2789884868421053, + "grad_norm": 0.8939500944856873, + "learning_rate": 1.6953043816505435e-05, + "loss": 3.5758, + "step": 22170 + }, + { + "epoch": 2.2800164473684212, + "grad_norm": 0.8310966077691961, + "learning_rate": 1.6924262393523883e-05, + "loss": 3.581, + "step": 22180 + }, + { + "epoch": 2.2810444078947367, + "grad_norm": 1.1052277642692296, + "learning_rate": 1.6895509111198495e-05, + "loss": 3.5753, + "step": 22190 + }, + { + "epoch": 2.2820723684210527, + "grad_norm": 0.9907470218826417, + "learning_rate": 1.6866784001166133e-05, + "loss": 3.6348, + "step": 22200 + }, + { + "epoch": 2.2831003289473686, + "grad_norm": 1.1227430911313445, + "learning_rate": 1.683808709503269e-05, + "loss": 3.5674, + "step": 22210 + }, + { + "epoch": 2.284128289473684, + "grad_norm": 0.8100190799726502, + "learning_rate": 1.6809418424372957e-05, + "loss": 3.5198, + "step": 22220 + }, + { + "epoch": 2.28515625, + "grad_norm": 0.8501814045591162, + "learning_rate": 1.678077802073075e-05, + "loss": 3.5809, + "step": 22230 + }, + { + "epoch": 2.286184210526316, + "grad_norm": 1.0760217136085328, + "learning_rate": 1.6752165915618716e-05, 
+ "loss": 3.5025, + "step": 22240 + }, + { + "epoch": 2.2872121710526314, + "grad_norm": 0.6308127701967674, + "learning_rate": 1.67235821405184e-05, + "loss": 3.6115, + "step": 22250 + }, + { + "epoch": 2.2882401315789473, + "grad_norm": 0.9228338976587278, + "learning_rate": 1.6695026726880165e-05, + "loss": 3.604, + "step": 22260 + }, + { + "epoch": 2.2892680921052633, + "grad_norm": 1.029821273148655, + "learning_rate": 1.6666499706123132e-05, + "loss": 3.5821, + "step": 22270 + }, + { + "epoch": 2.2902960526315788, + "grad_norm": 1.2395427066759677, + "learning_rate": 1.663800110963525e-05, + "loss": 3.576, + "step": 22280 + }, + { + "epoch": 2.2913240131578947, + "grad_norm": 0.9031739482352633, + "learning_rate": 1.660953096877316e-05, + "loss": 3.579, + "step": 22290 + }, + { + "epoch": 2.2923519736842106, + "grad_norm": 0.7982160839781067, + "learning_rate": 1.658108931486218e-05, + "loss": 3.5077, + "step": 22300 + }, + { + "epoch": 2.2933799342105265, + "grad_norm": 0.9199202875662226, + "learning_rate": 1.65526761791963e-05, + "loss": 3.5626, + "step": 22310 + }, + { + "epoch": 2.294407894736842, + "grad_norm": 1.1606461746018024, + "learning_rate": 1.652429159303813e-05, + "loss": 3.569, + "step": 22320 + }, + { + "epoch": 2.295435855263158, + "grad_norm": 1.2405558972847717, + "learning_rate": 1.649593558761887e-05, + "loss": 3.4476, + "step": 22330 + }, + { + "epoch": 2.2964638157894735, + "grad_norm": 0.951529214461631, + "learning_rate": 1.646760819413826e-05, + "loss": 3.6215, + "step": 22340 + }, + { + "epoch": 2.2974917763157894, + "grad_norm": 1.1178528198444133, + "learning_rate": 1.643930944376458e-05, + "loss": 3.613, + "step": 22350 + }, + { + "epoch": 2.2985197368421053, + "grad_norm": 0.8480657827552888, + "learning_rate": 1.6411039367634572e-05, + "loss": 3.5939, + "step": 22360 + }, + { + "epoch": 2.2995476973684212, + "grad_norm": 0.807095784859491, + "learning_rate": 1.638279799685344e-05, + "loss": 3.6108, + "step": 22370 + }, + { + "epoch": 2.3005756578947367, + "grad_norm": 0.9732047070523066, + "learning_rate": 1.6354585362494807e-05, + "loss": 3.5679, + "step": 22380 + }, + { + "epoch": 2.3016036184210527, + "grad_norm": 0.7990359756429645, + "learning_rate": 1.6326401495600668e-05, + "loss": 3.5757, + "step": 22390 + }, + { + "epoch": 2.3026315789473686, + "grad_norm": 0.7501669538958285, + "learning_rate": 1.6298246427181374e-05, + "loss": 3.6191, + "step": 22400 + }, + { + "epoch": 2.303659539473684, + "grad_norm": 0.7435685409245356, + "learning_rate": 1.6270120188215586e-05, + "loss": 3.6013, + "step": 22410 + }, + { + "epoch": 2.3046875, + "grad_norm": 1.4280545440589778, + "learning_rate": 1.6242022809650234e-05, + "loss": 3.5624, + "step": 22420 + }, + { + "epoch": 2.305715460526316, + "grad_norm": 0.9299276346334199, + "learning_rate": 1.6213954322400518e-05, + "loss": 3.6202, + "step": 22430 + }, + { + "epoch": 2.3067434210526314, + "grad_norm": 1.486667509092996, + "learning_rate": 1.618591475734983e-05, + "loss": 3.6129, + "step": 22440 + }, + { + "epoch": 2.3077713815789473, + "grad_norm": 0.9769527113470142, + "learning_rate": 1.6157904145349737e-05, + "loss": 3.5593, + "step": 22450 + }, + { + "epoch": 2.3087993421052633, + "grad_norm": 1.0406206537562614, + "learning_rate": 1.6129922517219964e-05, + "loss": 3.6434, + "step": 22460 + }, + { + "epoch": 2.3098273026315788, + "grad_norm": 1.1062659095439118, + "learning_rate": 1.6101969903748333e-05, + "loss": 3.6159, + "step": 22470 + }, + { + "epoch": 2.3108552631578947, + "grad_norm": 
0.7653374423864868, + "learning_rate": 1.607404633569075e-05, + "loss": 3.5938, + "step": 22480 + }, + { + "epoch": 2.3118832236842106, + "grad_norm": 0.7723357319787054, + "learning_rate": 1.6046151843771157e-05, + "loss": 3.6185, + "step": 22490 + }, + { + "epoch": 2.3129111842105265, + "grad_norm": 0.7556220182197496, + "learning_rate": 1.60182864586815e-05, + "loss": 3.6096, + "step": 22500 + }, + { + "epoch": 2.313939144736842, + "grad_norm": 0.9842524376515149, + "learning_rate": 1.5990450211081717e-05, + "loss": 3.5359, + "step": 22510 + }, + { + "epoch": 2.314967105263158, + "grad_norm": 0.9248818966499747, + "learning_rate": 1.596264313159967e-05, + "loss": 3.5851, + "step": 22520 + }, + { + "epoch": 2.3159950657894735, + "grad_norm": 0.7713418563351558, + "learning_rate": 1.593486525083112e-05, + "loss": 3.6864, + "step": 22530 + }, + { + "epoch": 2.3170230263157894, + "grad_norm": 1.1888674843030884, + "learning_rate": 1.5907116599339733e-05, + "loss": 3.5938, + "step": 22540 + }, + { + "epoch": 2.3180509868421053, + "grad_norm": 1.0306571065275087, + "learning_rate": 1.5879397207656982e-05, + "loss": 3.642, + "step": 22550 + }, + { + "epoch": 2.3190789473684212, + "grad_norm": 0.9420354805903712, + "learning_rate": 1.585170710628216e-05, + "loss": 3.578, + "step": 22560 + }, + { + "epoch": 2.3201069078947367, + "grad_norm": 1.0245067456322619, + "learning_rate": 1.5824046325682334e-05, + "loss": 3.6082, + "step": 22570 + }, + { + "epoch": 2.3211348684210527, + "grad_norm": 0.9478161971676966, + "learning_rate": 1.5796414896292302e-05, + "loss": 3.7109, + "step": 22580 + }, + { + "epoch": 2.3221628289473686, + "grad_norm": 0.8930179839357777, + "learning_rate": 1.5768812848514574e-05, + "loss": 3.6077, + "step": 22590 + }, + { + "epoch": 2.323190789473684, + "grad_norm": 0.8977431226215687, + "learning_rate": 1.574124021271932e-05, + "loss": 3.6158, + "step": 22600 + }, + { + "epoch": 2.32421875, + "grad_norm": 1.5927223931792442, + "learning_rate": 1.5713697019244382e-05, + "loss": 3.6104, + "step": 22610 + }, + { + "epoch": 2.325246710526316, + "grad_norm": 1.411320171655818, + "learning_rate": 1.5686183298395154e-05, + "loss": 3.6324, + "step": 22620 + }, + { + "epoch": 2.3262746710526314, + "grad_norm": 1.048537116285597, + "learning_rate": 1.5658699080444647e-05, + "loss": 3.6598, + "step": 22630 + }, + { + "epoch": 2.3273026315789473, + "grad_norm": 0.8715284827956902, + "learning_rate": 1.5631244395633386e-05, + "loss": 3.6737, + "step": 22640 + }, + { + "epoch": 2.3283305921052633, + "grad_norm": 1.0892334463189395, + "learning_rate": 1.5603819274169417e-05, + "loss": 3.6209, + "step": 22650 + }, + { + "epoch": 2.3293585526315788, + "grad_norm": 0.939200892858552, + "learning_rate": 1.5576423746228246e-05, + "loss": 3.5489, + "step": 22660 + }, + { + "epoch": 2.3303865131578947, + "grad_norm": 1.0029885220008765, + "learning_rate": 1.5549057841952812e-05, + "loss": 3.577, + "step": 22670 + }, + { + "epoch": 2.3314144736842106, + "grad_norm": 0.7869940737194592, + "learning_rate": 1.5521721591453486e-05, + "loss": 3.5718, + "step": 22680 + }, + { + "epoch": 2.3324424342105265, + "grad_norm": 0.8588308518952106, + "learning_rate": 1.5494415024807975e-05, + "loss": 3.5451, + "step": 22690 + }, + { + "epoch": 2.333470394736842, + "grad_norm": 0.8125516068134989, + "learning_rate": 1.5467138172061364e-05, + "loss": 3.6413, + "step": 22700 + }, + { + "epoch": 2.334498355263158, + "grad_norm": 0.9769063115486455, + "learning_rate": 1.5439891063226e-05, + "loss": 3.4809, + 
"step": 22710 + }, + { + "epoch": 2.3355263157894735, + "grad_norm": 0.7538031472974149, + "learning_rate": 1.5412673728281552e-05, + "loss": 3.5387, + "step": 22720 + }, + { + "epoch": 2.3365542763157894, + "grad_norm": 0.8734818640072458, + "learning_rate": 1.5385486197174896e-05, + "loss": 3.5896, + "step": 22730 + }, + { + "epoch": 2.3375822368421053, + "grad_norm": 0.9521377328124215, + "learning_rate": 1.5358328499820114e-05, + "loss": 3.6884, + "step": 22740 + }, + { + "epoch": 2.3386101973684212, + "grad_norm": 1.6594478756406854, + "learning_rate": 1.5331200666098483e-05, + "loss": 3.5489, + "step": 22750 + }, + { + "epoch": 2.3396381578947367, + "grad_norm": 1.549621532749249, + "learning_rate": 1.5304102725858406e-05, + "loss": 3.6243, + "step": 22760 + }, + { + "epoch": 2.3406661184210527, + "grad_norm": 0.7801589213378449, + "learning_rate": 1.5277034708915405e-05, + "loss": 3.6569, + "step": 22770 + }, + { + "epoch": 2.3416940789473686, + "grad_norm": 0.7853567901642258, + "learning_rate": 1.5249996645052065e-05, + "loss": 3.6221, + "step": 22780 + }, + { + "epoch": 2.342722039473684, + "grad_norm": 1.3122563817222797, + "learning_rate": 1.5222988564018013e-05, + "loss": 3.5103, + "step": 22790 + }, + { + "epoch": 2.34375, + "grad_norm": 1.148027411701828, + "learning_rate": 1.519601049552993e-05, + "loss": 3.513, + "step": 22800 + }, + { + "epoch": 2.344777960526316, + "grad_norm": 0.7933402483168598, + "learning_rate": 1.51690624692714e-05, + "loss": 3.5869, + "step": 22810 + }, + { + "epoch": 2.3458059210526314, + "grad_norm": 0.9111854726082108, + "learning_rate": 1.5142144514893003e-05, + "loss": 3.6679, + "step": 22820 + }, + { + "epoch": 2.3468338815789473, + "grad_norm": 0.9798196110332007, + "learning_rate": 1.5115256662012226e-05, + "loss": 3.6155, + "step": 22830 + }, + { + "epoch": 2.3478618421052633, + "grad_norm": 0.7910887131068202, + "learning_rate": 1.5088398940213411e-05, + "loss": 3.5954, + "step": 22840 + }, + { + "epoch": 2.3488898026315788, + "grad_norm": 0.7128922135844886, + "learning_rate": 1.5061571379047795e-05, + "loss": 3.6469, + "step": 22850 + }, + { + "epoch": 2.3499177631578947, + "grad_norm": 0.6970103675503592, + "learning_rate": 1.5034774008033375e-05, + "loss": 3.4428, + "step": 22860 + }, + { + "epoch": 2.3509457236842106, + "grad_norm": 1.057761509172058, + "learning_rate": 1.5008006856654963e-05, + "loss": 3.619, + "step": 22870 + }, + { + "epoch": 2.3519736842105265, + "grad_norm": 1.1068302033900017, + "learning_rate": 1.4981269954364112e-05, + "loss": 3.5884, + "step": 22880 + }, + { + "epoch": 2.353001644736842, + "grad_norm": 0.9961531378301941, + "learning_rate": 1.4954563330579078e-05, + "loss": 3.6529, + "step": 22890 + }, + { + "epoch": 2.354029605263158, + "grad_norm": 0.7821609526703971, + "learning_rate": 1.4927887014684852e-05, + "loss": 3.6408, + "step": 22900 + }, + { + "epoch": 2.3550575657894735, + "grad_norm": 1.4719388717107864, + "learning_rate": 1.4901241036033016e-05, + "loss": 3.5754, + "step": 22910 + }, + { + "epoch": 2.3560855263157894, + "grad_norm": 1.0905833588878502, + "learning_rate": 1.48746254239418e-05, + "loss": 3.5675, + "step": 22920 + }, + { + "epoch": 2.3571134868421053, + "grad_norm": 1.0898567663803238, + "learning_rate": 1.4848040207696021e-05, + "loss": 3.5577, + "step": 22930 + }, + { + "epoch": 2.3581414473684212, + "grad_norm": 0.758083924126567, + "learning_rate": 1.482148541654705e-05, + "loss": 3.5806, + "step": 22940 + }, + { + "epoch": 2.3591694078947367, + "grad_norm": 
0.8288621320857512, + "learning_rate": 1.4794961079712795e-05, + "loss": 3.549, + "step": 22950 + }, + { + "epoch": 2.3601973684210527, + "grad_norm": 1.0817425063710988, + "learning_rate": 1.4768467226377654e-05, + "loss": 3.5671, + "step": 22960 + }, + { + "epoch": 2.3612253289473686, + "grad_norm": 0.9625854937247598, + "learning_rate": 1.4742003885692443e-05, + "loss": 3.5494, + "step": 22970 + }, + { + "epoch": 2.362253289473684, + "grad_norm": 0.8181000801761571, + "learning_rate": 1.4715571086774447e-05, + "loss": 3.5688, + "step": 22980 + }, + { + "epoch": 2.36328125, + "grad_norm": 0.9348030717285903, + "learning_rate": 1.4689168858707344e-05, + "loss": 3.5764, + "step": 22990 + }, + { + "epoch": 2.364309210526316, + "grad_norm": 0.9661338613476137, + "learning_rate": 1.466279723054115e-05, + "loss": 3.6282, + "step": 23000 + }, + { + "epoch": 2.3653371710526314, + "grad_norm": 0.8462017958983313, + "learning_rate": 1.4636456231292262e-05, + "loss": 3.5785, + "step": 23010 + }, + { + "epoch": 2.3663651315789473, + "grad_norm": 1.1890670575770133, + "learning_rate": 1.4610145889943312e-05, + "loss": 3.5906, + "step": 23020 + }, + { + "epoch": 2.3673930921052633, + "grad_norm": 1.1746342028729975, + "learning_rate": 1.458386623544325e-05, + "loss": 3.5814, + "step": 23030 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 1.3148085863167853, + "learning_rate": 1.455761729670723e-05, + "loss": 3.5781, + "step": 23040 + }, + { + "epoch": 2.3694490131578947, + "grad_norm": 1.0955130944385671, + "learning_rate": 1.4531399102616617e-05, + "loss": 3.496, + "step": 23050 + }, + { + "epoch": 2.3704769736842106, + "grad_norm": 1.4879532099565973, + "learning_rate": 1.4505211682018989e-05, + "loss": 3.5707, + "step": 23060 + }, + { + "epoch": 2.3715049342105265, + "grad_norm": 0.8423583598737103, + "learning_rate": 1.447905506372799e-05, + "loss": 3.6073, + "step": 23070 + }, + { + "epoch": 2.372532894736842, + "grad_norm": 0.777230527673766, + "learning_rate": 1.4452929276523429e-05, + "loss": 3.5455, + "step": 23080 + }, + { + "epoch": 2.373560855263158, + "grad_norm": 1.4080780862645483, + "learning_rate": 1.4426834349151176e-05, + "loss": 3.5303, + "step": 23090 + }, + { + "epoch": 2.3745888157894735, + "grad_norm": 0.9036562733042554, + "learning_rate": 1.4400770310323126e-05, + "loss": 3.5989, + "step": 23100 + }, + { + "epoch": 2.3756167763157894, + "grad_norm": 0.9631699781665114, + "learning_rate": 1.437473718871724e-05, + "loss": 3.6438, + "step": 23110 + }, + { + "epoch": 2.3766447368421053, + "grad_norm": 1.0020794705153238, + "learning_rate": 1.4348735012977415e-05, + "loss": 3.6529, + "step": 23120 + }, + { + "epoch": 2.3776726973684212, + "grad_norm": 1.2529655088684124, + "learning_rate": 1.4322763811713498e-05, + "loss": 3.5855, + "step": 23130 + }, + { + "epoch": 2.3787006578947367, + "grad_norm": 0.9756380512114832, + "learning_rate": 1.4296823613501278e-05, + "loss": 3.6631, + "step": 23140 + }, + { + "epoch": 2.3797286184210527, + "grad_norm": 0.72862627349915, + "learning_rate": 1.4270914446882414e-05, + "loss": 3.6117, + "step": 23150 + }, + { + "epoch": 2.3807565789473686, + "grad_norm": 0.7645851001840354, + "learning_rate": 1.4245036340364444e-05, + "loss": 3.6064, + "step": 23160 + }, + { + "epoch": 2.381784539473684, + "grad_norm": 1.0744810581729949, + "learning_rate": 1.4219189322420724e-05, + "loss": 3.5276, + "step": 23170 + }, + { + "epoch": 2.3828125, + "grad_norm": 0.988748914715621, + "learning_rate": 1.4193373421490376e-05, + "loss": 3.5202, + "step": 
23180 + }, + { + "epoch": 2.383840460526316, + "grad_norm": 1.0468242120522053, + "learning_rate": 1.4167588665978305e-05, + "loss": 3.6686, + "step": 23190 + }, + { + "epoch": 2.3848684210526314, + "grad_norm": 0.7289037820952117, + "learning_rate": 1.4141835084255146e-05, + "loss": 3.5996, + "step": 23200 + }, + { + "epoch": 2.3858963815789473, + "grad_norm": 0.6350172981774621, + "learning_rate": 1.4116112704657256e-05, + "loss": 3.6616, + "step": 23210 + }, + { + "epoch": 2.3869243421052633, + "grad_norm": 0.996223471179438, + "learning_rate": 1.4090421555486633e-05, + "loss": 3.6191, + "step": 23220 + }, + { + "epoch": 2.3879523026315788, + "grad_norm": 1.1395968676407875, + "learning_rate": 1.4064761665010904e-05, + "loss": 3.5597, + "step": 23230 + }, + { + "epoch": 2.3889802631578947, + "grad_norm": 0.8929218323297528, + "learning_rate": 1.4039133061463326e-05, + "loss": 3.5541, + "step": 23240 + }, + { + "epoch": 2.3900082236842106, + "grad_norm": 0.8035059652457195, + "learning_rate": 1.401353577304271e-05, + "loss": 3.5942, + "step": 23250 + }, + { + "epoch": 2.3910361842105265, + "grad_norm": 0.6691304417073021, + "learning_rate": 1.3987969827913448e-05, + "loss": 3.6247, + "step": 23260 + }, + { + "epoch": 2.392064144736842, + "grad_norm": 0.6487550762102868, + "learning_rate": 1.3962435254205409e-05, + "loss": 3.5793, + "step": 23270 + }, + { + "epoch": 2.393092105263158, + "grad_norm": 1.145906474929068, + "learning_rate": 1.3936932080013974e-05, + "loss": 3.5278, + "step": 23280 + }, + { + "epoch": 2.3941200657894735, + "grad_norm": 0.9316673354952282, + "learning_rate": 1.3911460333399928e-05, + "loss": 3.5973, + "step": 23290 + }, + { + "epoch": 2.3951480263157894, + "grad_norm": 0.9721513226557829, + "learning_rate": 1.388602004238951e-05, + "loss": 3.5282, + "step": 23300 + }, + { + "epoch": 2.3961759868421053, + "grad_norm": 1.4571062738570577, + "learning_rate": 1.386061123497437e-05, + "loss": 3.594, + "step": 23310 + }, + { + "epoch": 2.3972039473684212, + "grad_norm": 1.3702145444684657, + "learning_rate": 1.3835233939111486e-05, + "loss": 3.6222, + "step": 23320 + }, + { + "epoch": 2.3982319078947367, + "grad_norm": 1.2556130744667768, + "learning_rate": 1.380988818272317e-05, + "loss": 3.5871, + "step": 23330 + }, + { + "epoch": 2.3992598684210527, + "grad_norm": 1.229773333994086, + "learning_rate": 1.3784573993697024e-05, + "loss": 3.5185, + "step": 23340 + }, + { + "epoch": 2.4002878289473686, + "grad_norm": 0.9168753422565511, + "learning_rate": 1.375929139988592e-05, + "loss": 3.6092, + "step": 23350 + }, + { + "epoch": 2.401315789473684, + "grad_norm": 0.8066068184998322, + "learning_rate": 1.3734040429107993e-05, + "loss": 3.6231, + "step": 23360 + }, + { + "epoch": 2.40234375, + "grad_norm": 0.9292851386786316, + "learning_rate": 1.3708821109146549e-05, + "loss": 3.5725, + "step": 23370 + }, + { + "epoch": 2.403371710526316, + "grad_norm": 1.0081256705161867, + "learning_rate": 1.3683633467750102e-05, + "loss": 3.5345, + "step": 23380 + }, + { + "epoch": 2.4043996710526314, + "grad_norm": 0.7633356002776732, + "learning_rate": 1.3658477532632258e-05, + "loss": 3.6898, + "step": 23390 + }, + { + "epoch": 2.4054276315789473, + "grad_norm": 0.7439052980637624, + "learning_rate": 1.3633353331471779e-05, + "loss": 3.491, + "step": 23400 + }, + { + "epoch": 2.4064555921052633, + "grad_norm": 0.9426738704613423, + "learning_rate": 1.3608260891912517e-05, + "loss": 3.5393, + "step": 23410 + }, + { + "epoch": 2.4074835526315788, + "grad_norm": 0.9104552862383968, 
+ "learning_rate": 1.3583200241563357e-05, + "loss": 3.6633, + "step": 23420 + }, + { + "epoch": 2.4085115131578947, + "grad_norm": 0.7531632235629055, + "learning_rate": 1.3558171407998202e-05, + "loss": 3.5034, + "step": 23430 + }, + { + "epoch": 2.4095394736842106, + "grad_norm": 0.8052445492777586, + "learning_rate": 1.3533174418755977e-05, + "loss": 3.5018, + "step": 23440 + }, + { + "epoch": 2.4105674342105265, + "grad_norm": 0.6997644862712672, + "learning_rate": 1.350820930134051e-05, + "loss": 3.6045, + "step": 23450 + }, + { + "epoch": 2.411595394736842, + "grad_norm": 1.3240662883662033, + "learning_rate": 1.3483276083220632e-05, + "loss": 3.5843, + "step": 23460 + }, + { + "epoch": 2.412623355263158, + "grad_norm": 0.9867340763257887, + "learning_rate": 1.345837479183004e-05, + "loss": 3.5284, + "step": 23470 + }, + { + "epoch": 2.4136513157894735, + "grad_norm": 0.7018665071348953, + "learning_rate": 1.3433505454567297e-05, + "loss": 3.5984, + "step": 23480 + }, + { + "epoch": 2.4146792763157894, + "grad_norm": 1.2870346765448262, + "learning_rate": 1.3408668098795833e-05, + "loss": 3.5609, + "step": 23490 + }, + { + "epoch": 2.4157072368421053, + "grad_norm": 0.8532717637919786, + "learning_rate": 1.3383862751843835e-05, + "loss": 3.5423, + "step": 23500 + }, + { + "epoch": 2.4167351973684212, + "grad_norm": 0.7646956739779944, + "learning_rate": 1.3359089441004338e-05, + "loss": 3.5427, + "step": 23510 + }, + { + "epoch": 2.4177631578947367, + "grad_norm": 0.9183910940413021, + "learning_rate": 1.3334348193535095e-05, + "loss": 3.5994, + "step": 23520 + }, + { + "epoch": 2.4187911184210527, + "grad_norm": 0.9524207092683554, + "learning_rate": 1.3309639036658584e-05, + "loss": 3.5943, + "step": 23530 + }, + { + "epoch": 2.4198190789473686, + "grad_norm": 0.9240683309856788, + "learning_rate": 1.3284961997561973e-05, + "loss": 3.4962, + "step": 23540 + }, + { + "epoch": 2.420847039473684, + "grad_norm": 0.8181139229650412, + "learning_rate": 1.3260317103397084e-05, + "loss": 3.6052, + "step": 23550 + }, + { + "epoch": 2.421875, + "grad_norm": 0.9115051947763203, + "learning_rate": 1.3235704381280372e-05, + "loss": 3.5424, + "step": 23560 + }, + { + "epoch": 2.422902960526316, + "grad_norm": 0.9109932648486202, + "learning_rate": 1.3211123858292916e-05, + "loss": 3.5407, + "step": 23570 + }, + { + "epoch": 2.4239309210526314, + "grad_norm": 0.7406334518401047, + "learning_rate": 1.3186575561480348e-05, + "loss": 3.5985, + "step": 23580 + }, + { + "epoch": 2.4249588815789473, + "grad_norm": 0.7925734638736962, + "learning_rate": 1.316205951785285e-05, + "loss": 3.4995, + "step": 23590 + }, + { + "epoch": 2.4259868421052633, + "grad_norm": 1.0019365870726402, + "learning_rate": 1.3137575754385091e-05, + "loss": 3.5764, + "step": 23600 + }, + { + "epoch": 2.4270148026315788, + "grad_norm": 0.7802769514059644, + "learning_rate": 1.311312429801624e-05, + "loss": 3.5632, + "step": 23610 + }, + { + "epoch": 2.4280427631578947, + "grad_norm": 1.3006588531737184, + "learning_rate": 1.308870517564994e-05, + "loss": 3.616, + "step": 23620 + }, + { + "epoch": 2.4290707236842106, + "grad_norm": 1.0016828324848321, + "learning_rate": 1.3064318414154236e-05, + "loss": 3.5964, + "step": 23630 + }, + { + "epoch": 2.4300986842105265, + "grad_norm": 1.0815026269282069, + "learning_rate": 1.3039964040361564e-05, + "loss": 3.5963, + "step": 23640 + }, + { + "epoch": 2.431126644736842, + "grad_norm": 1.3160606406017965, + "learning_rate": 1.301564208106874e-05, + "loss": 3.6309, + "step": 23650 + 
}, + { + "epoch": 2.432154605263158, + "grad_norm": 1.0040190518307952, + "learning_rate": 1.299135256303687e-05, + "loss": 3.5988, + "step": 23660 + }, + { + "epoch": 2.4331825657894735, + "grad_norm": 1.1335242304658342, + "learning_rate": 1.2967095512991437e-05, + "loss": 3.5466, + "step": 23670 + }, + { + "epoch": 2.4342105263157894, + "grad_norm": 0.9732666857700366, + "learning_rate": 1.294287095762214e-05, + "loss": 3.542, + "step": 23680 + }, + { + "epoch": 2.4352384868421053, + "grad_norm": 1.1701602342710693, + "learning_rate": 1.2918678923582959e-05, + "loss": 3.578, + "step": 23690 + }, + { + "epoch": 2.4362664473684212, + "grad_norm": 0.6958627987712414, + "learning_rate": 1.2894519437492087e-05, + "loss": 3.6077, + "step": 23700 + }, + { + "epoch": 2.4372944078947367, + "grad_norm": 0.6329721555521703, + "learning_rate": 1.2870392525931864e-05, + "loss": 3.5403, + "step": 23710 + }, + { + "epoch": 2.4383223684210527, + "grad_norm": 0.6378533020126637, + "learning_rate": 1.284629821544885e-05, + "loss": 3.5096, + "step": 23720 + }, + { + "epoch": 2.4393503289473686, + "grad_norm": 0.9200573475213196, + "learning_rate": 1.2822236532553696e-05, + "loss": 3.5184, + "step": 23730 + }, + { + "epoch": 2.440378289473684, + "grad_norm": 1.063560237342538, + "learning_rate": 1.2798207503721163e-05, + "loss": 3.56, + "step": 23740 + }, + { + "epoch": 2.44140625, + "grad_norm": 0.6008247954978432, + "learning_rate": 1.2774211155390099e-05, + "loss": 3.5827, + "step": 23750 + }, + { + "epoch": 2.442434210526316, + "grad_norm": 1.1065333870885408, + "learning_rate": 1.2750247513963338e-05, + "loss": 3.6167, + "step": 23760 + }, + { + "epoch": 2.4434621710526314, + "grad_norm": 0.7634893721244886, + "learning_rate": 1.2726316605807809e-05, + "loss": 3.5422, + "step": 23770 + }, + { + "epoch": 2.4444901315789473, + "grad_norm": 0.9011826414102575, + "learning_rate": 1.2702418457254368e-05, + "loss": 3.5124, + "step": 23780 + }, + { + "epoch": 2.4455180921052633, + "grad_norm": 0.8348286484886233, + "learning_rate": 1.2678553094597841e-05, + "loss": 3.6519, + "step": 23790 + }, + { + "epoch": 2.4465460526315788, + "grad_norm": 1.3129916859586528, + "learning_rate": 1.2654720544096981e-05, + "loss": 3.5203, + "step": 23800 + }, + { + "epoch": 2.4475740131578947, + "grad_norm": 1.3068495328256169, + "learning_rate": 1.2630920831974436e-05, + "loss": 3.5796, + "step": 23810 + }, + { + "epoch": 2.4486019736842106, + "grad_norm": 0.9380726654683275, + "learning_rate": 1.2607153984416731e-05, + "loss": 3.5764, + "step": 23820 + }, + { + "epoch": 2.4496299342105265, + "grad_norm": 0.7814119552590146, + "learning_rate": 1.258342002757421e-05, + "loss": 3.5145, + "step": 23830 + }, + { + "epoch": 2.450657894736842, + "grad_norm": 0.8299133137165572, + "learning_rate": 1.2559718987561058e-05, + "loss": 3.5858, + "step": 23840 + }, + { + "epoch": 2.451685855263158, + "grad_norm": 0.9845602678612546, + "learning_rate": 1.2536050890455215e-05, + "loss": 3.529, + "step": 23850 + }, + { + "epoch": 2.4527138157894735, + "grad_norm": 0.7092767538188764, + "learning_rate": 1.2512415762298385e-05, + "loss": 3.5043, + "step": 23860 + }, + { + "epoch": 2.4537417763157894, + "grad_norm": 1.311539389256933, + "learning_rate": 1.2488813629096002e-05, + "loss": 3.5483, + "step": 23870 + }, + { + "epoch": 2.4547697368421053, + "grad_norm": 0.9324263292517063, + "learning_rate": 1.2465244516817182e-05, + "loss": 3.5964, + "step": 23880 + }, + { + "epoch": 2.4557976973684212, + "grad_norm": 0.7807932472185185, + 
"learning_rate": 1.2441708451394727e-05, + "loss": 3.6318, + "step": 23890 + }, + { + "epoch": 2.4568256578947367, + "grad_norm": 0.6883237628602413, + "learning_rate": 1.2418205458725061e-05, + "loss": 3.5343, + "step": 23900 + }, + { + "epoch": 2.4578536184210527, + "grad_norm": 1.023816511720697, + "learning_rate": 1.239473556466823e-05, + "loss": 3.5682, + "step": 23910 + }, + { + "epoch": 2.4588815789473686, + "grad_norm": 0.7088171391249115, + "learning_rate": 1.2371298795047854e-05, + "loss": 3.6335, + "step": 23920 + }, + { + "epoch": 2.459909539473684, + "grad_norm": 0.6680501252515658, + "learning_rate": 1.234789517565111e-05, + "loss": 3.5933, + "step": 23930 + }, + { + "epoch": 2.4609375, + "grad_norm": 0.8308642445983371, + "learning_rate": 1.2324524732228701e-05, + "loss": 3.5956, + "step": 23940 + }, + { + "epoch": 2.461965460526316, + "grad_norm": 0.9293553376057927, + "learning_rate": 1.2301187490494828e-05, + "loss": 3.5353, + "step": 23950 + }, + { + "epoch": 2.4629934210526314, + "grad_norm": 0.867212759114025, + "learning_rate": 1.2277883476127159e-05, + "loss": 3.5556, + "step": 23960 + }, + { + "epoch": 2.4640213815789473, + "grad_norm": 0.9599583296871369, + "learning_rate": 1.22546127147668e-05, + "loss": 3.5866, + "step": 23970 + }, + { + "epoch": 2.4650493421052633, + "grad_norm": 0.8995608397429875, + "learning_rate": 1.2231375232018277e-05, + "loss": 3.591, + "step": 23980 + }, + { + "epoch": 2.4660773026315788, + "grad_norm": 0.805787861878092, + "learning_rate": 1.2208171053449482e-05, + "loss": 3.5258, + "step": 23990 + }, + { + "epoch": 2.4671052631578947, + "grad_norm": 0.9090528881185111, + "learning_rate": 1.2185000204591685e-05, + "loss": 3.5834, + "step": 24000 + }, + { + "epoch": 2.4681332236842106, + "grad_norm": 0.777319711284738, + "learning_rate": 1.2161862710939476e-05, + "loss": 3.6755, + "step": 24010 + }, + { + "epoch": 2.4691611842105265, + "grad_norm": 0.9567546211133069, + "learning_rate": 1.2138758597950732e-05, + "loss": 3.6698, + "step": 24020 + }, + { + "epoch": 2.470189144736842, + "grad_norm": 0.7417192286893056, + "learning_rate": 1.2115687891046623e-05, + "loss": 3.5878, + "step": 24030 + }, + { + "epoch": 2.471217105263158, + "grad_norm": 1.2407336574218348, + "learning_rate": 1.2092650615611547e-05, + "loss": 3.6167, + "step": 24040 + }, + { + "epoch": 2.4722450657894735, + "grad_norm": 0.7978789045861954, + "learning_rate": 1.2069646796993122e-05, + "loss": 3.5988, + "step": 24050 + }, + { + "epoch": 2.4732730263157894, + "grad_norm": 0.8966062708076115, + "learning_rate": 1.2046676460502158e-05, + "loss": 3.5665, + "step": 24060 + }, + { + "epoch": 2.4743009868421053, + "grad_norm": 0.9864555956363789, + "learning_rate": 1.2023739631412616e-05, + "loss": 3.6299, + "step": 24070 + }, + { + "epoch": 2.4753289473684212, + "grad_norm": 0.9817766683524829, + "learning_rate": 1.2000836334961605e-05, + "loss": 3.5701, + "step": 24080 + }, + { + "epoch": 2.4763569078947367, + "grad_norm": 0.9607077223056729, + "learning_rate": 1.197796659634932e-05, + "loss": 3.6009, + "step": 24090 + }, + { + "epoch": 2.4773848684210527, + "grad_norm": 0.950033991607086, + "learning_rate": 1.1955130440739041e-05, + "loss": 3.5817, + "step": 24100 + }, + { + "epoch": 2.4784128289473686, + "grad_norm": 1.1620846016431152, + "learning_rate": 1.19323278932571e-05, + "loss": 3.5334, + "step": 24110 + }, + { + "epoch": 2.479440789473684, + "grad_norm": 1.035001174539093, + "learning_rate": 1.1909558978992846e-05, + "loss": 3.6374, + "step": 24120 + }, + { + 
"epoch": 2.48046875, + "grad_norm": 1.01821492893317, + "learning_rate": 1.1886823722998625e-05, + "loss": 3.532, + "step": 24130 + }, + { + "epoch": 2.481496710526316, + "grad_norm": 1.247266498568952, + "learning_rate": 1.186412215028975e-05, + "loss": 3.598, + "step": 24140 + }, + { + "epoch": 2.4825246710526314, + "grad_norm": 0.7225423362234167, + "learning_rate": 1.1841454285844467e-05, + "loss": 3.5512, + "step": 24150 + }, + { + "epoch": 2.4835526315789473, + "grad_norm": 0.8021003465375638, + "learning_rate": 1.1818820154603939e-05, + "loss": 3.5466, + "step": 24160 + }, + { + "epoch": 2.4845805921052633, + "grad_norm": 0.9013343879460292, + "learning_rate": 1.1796219781472205e-05, + "loss": 3.6219, + "step": 24170 + }, + { + "epoch": 2.4856085526315788, + "grad_norm": 0.8351179365713838, + "learning_rate": 1.1773653191316173e-05, + "loss": 3.5776, + "step": 24180 + }, + { + "epoch": 2.4866365131578947, + "grad_norm": 0.7937865902133975, + "learning_rate": 1.1751120408965565e-05, + "loss": 3.5723, + "step": 24190 + }, + { + "epoch": 2.4876644736842106, + "grad_norm": 0.7197405996208949, + "learning_rate": 1.172862145921292e-05, + "loss": 3.5419, + "step": 24200 + }, + { + "epoch": 2.4886924342105265, + "grad_norm": 1.0792562106379946, + "learning_rate": 1.1706156366813536e-05, + "loss": 3.5602, + "step": 24210 + }, + { + "epoch": 2.489720394736842, + "grad_norm": 0.8146696100293674, + "learning_rate": 1.1683725156485478e-05, + "loss": 3.5058, + "step": 24220 + }, + { + "epoch": 2.490748355263158, + "grad_norm": 1.0882193652520087, + "learning_rate": 1.1661327852909508e-05, + "loss": 3.5925, + "step": 24230 + }, + { + "epoch": 2.4917763157894735, + "grad_norm": 0.9304975939170727, + "learning_rate": 1.1638964480729095e-05, + "loss": 3.5484, + "step": 24240 + }, + { + "epoch": 2.4928042763157894, + "grad_norm": 0.8772025996023989, + "learning_rate": 1.161663506455037e-05, + "loss": 3.4786, + "step": 24250 + }, + { + "epoch": 2.4938322368421053, + "grad_norm": 1.0227205517095264, + "learning_rate": 1.1594339628942108e-05, + "loss": 3.6397, + "step": 24260 + }, + { + "epoch": 2.4948601973684212, + "grad_norm": 0.9478525133047596, + "learning_rate": 1.1572078198435688e-05, + "loss": 3.5918, + "step": 24270 + }, + { + "epoch": 2.4958881578947367, + "grad_norm": 1.0579674476274106, + "learning_rate": 1.1549850797525067e-05, + "loss": 3.5551, + "step": 24280 + }, + { + "epoch": 2.4969161184210527, + "grad_norm": 0.7431930957391365, + "learning_rate": 1.1527657450666799e-05, + "loss": 3.5221, + "step": 24290 + }, + { + "epoch": 2.4979440789473686, + "grad_norm": 0.7074852428986577, + "learning_rate": 1.1505498182279904e-05, + "loss": 3.5879, + "step": 24300 + }, + { + "epoch": 2.498972039473684, + "grad_norm": 1.0677423177133272, + "learning_rate": 1.148337301674596e-05, + "loss": 3.6413, + "step": 24310 + }, + { + "epoch": 2.5, + "grad_norm": 0.7718134311105417, + "learning_rate": 1.1461281978408999e-05, + "loss": 3.5198, + "step": 24320 + }, + { + "epoch": 2.501027960526316, + "grad_norm": 0.9656272517798381, + "learning_rate": 1.1439225091575492e-05, + "loss": 3.534, + "step": 24330 + }, + { + "epoch": 2.5020559210526314, + "grad_norm": 1.0750671365754472, + "learning_rate": 1.1417202380514386e-05, + "loss": 3.5684, + "step": 24340 + }, + { + "epoch": 2.5030838815789473, + "grad_norm": 0.9088062867648148, + "learning_rate": 1.1395213869456958e-05, + "loss": 3.5481, + "step": 24350 + }, + { + "epoch": 2.5041118421052633, + "grad_norm": 0.7711072555333226, + "learning_rate": 
1.1373259582596887e-05, + "loss": 3.5885, + "step": 24360 + }, + { + "epoch": 2.5051398026315788, + "grad_norm": 1.1887290015707597, + "learning_rate": 1.1351339544090197e-05, + "loss": 3.632, + "step": 24370 + }, + { + "epoch": 2.5061677631578947, + "grad_norm": 0.9291086719073337, + "learning_rate": 1.1329453778055224e-05, + "loss": 3.5429, + "step": 24380 + }, + { + "epoch": 2.5071957236842106, + "grad_norm": 0.6747212779365376, + "learning_rate": 1.1307602308572612e-05, + "loss": 3.605, + "step": 24390 + }, + { + "epoch": 2.5082236842105265, + "grad_norm": 0.7089934545518823, + "learning_rate": 1.1285785159685232e-05, + "loss": 3.6712, + "step": 24400 + }, + { + "epoch": 2.509251644736842, + "grad_norm": 0.7813474212939826, + "learning_rate": 1.1264002355398218e-05, + "loss": 3.5409, + "step": 24410 + }, + { + "epoch": 2.510279605263158, + "grad_norm": 0.769079007712117, + "learning_rate": 1.1242253919678913e-05, + "loss": 3.5304, + "step": 24420 + }, + { + "epoch": 2.5113075657894735, + "grad_norm": 1.067831909967836, + "learning_rate": 1.1220539876456833e-05, + "loss": 3.4874, + "step": 24430 + }, + { + "epoch": 2.5123355263157894, + "grad_norm": 1.0223020122204902, + "learning_rate": 1.1198860249623677e-05, + "loss": 3.5695, + "step": 24440 + }, + { + "epoch": 2.5133634868421053, + "grad_norm": 0.8760103455220026, + "learning_rate": 1.1177215063033264e-05, + "loss": 3.6048, + "step": 24450 + }, + { + "epoch": 2.5143914473684212, + "grad_norm": 0.8147719968838693, + "learning_rate": 1.1155604340501498e-05, + "loss": 3.5709, + "step": 24460 + }, + { + "epoch": 2.5154194078947367, + "grad_norm": 1.035544096066298, + "learning_rate": 1.113402810580638e-05, + "loss": 3.5436, + "step": 24470 + }, + { + "epoch": 2.5164473684210527, + "grad_norm": 1.1216903322017981, + "learning_rate": 1.1112486382687975e-05, + "loss": 3.5788, + "step": 24480 + }, + { + "epoch": 2.5174753289473686, + "grad_norm": 0.8135818468266736, + "learning_rate": 1.1090979194848353e-05, + "loss": 3.6138, + "step": 24490 + }, + { + "epoch": 2.518503289473684, + "grad_norm": 0.8951638329654669, + "learning_rate": 1.1069506565951619e-05, + "loss": 3.5474, + "step": 24500 + }, + { + "epoch": 2.51953125, + "grad_norm": 0.8175576591572933, + "learning_rate": 1.1048068519623808e-05, + "loss": 3.6354, + "step": 24510 + }, + { + "epoch": 2.520559210526316, + "grad_norm": 0.9745805388257388, + "learning_rate": 1.1026665079452935e-05, + "loss": 3.5404, + "step": 24520 + }, + { + "epoch": 2.5215871710526314, + "grad_norm": 0.9763463200664536, + "learning_rate": 1.1005296268988936e-05, + "loss": 3.5504, + "step": 24530 + }, + { + "epoch": 2.5226151315789473, + "grad_norm": 1.097421912975885, + "learning_rate": 1.0983962111743622e-05, + "loss": 3.6389, + "step": 24540 + }, + { + "epoch": 2.5236430921052633, + "grad_norm": 0.6813020493808452, + "learning_rate": 1.0962662631190728e-05, + "loss": 3.5771, + "step": 24550 + }, + { + "epoch": 2.5246710526315788, + "grad_norm": 0.8965505272221101, + "learning_rate": 1.0941397850765769e-05, + "loss": 3.5994, + "step": 24560 + }, + { + "epoch": 2.5256990131578947, + "grad_norm": 0.7284465276354298, + "learning_rate": 1.0920167793866113e-05, + "loss": 3.599, + "step": 24570 + }, + { + "epoch": 2.5267269736842106, + "grad_norm": 1.0975641503843794, + "learning_rate": 1.0898972483850932e-05, + "loss": 3.5113, + "step": 24580 + }, + { + "epoch": 2.5277549342105265, + "grad_norm": 1.062621334104157, + "learning_rate": 1.087781194404113e-05, + "loss": 3.5966, + "step": 24590 + }, + { + "epoch": 
2.528782894736842, + "grad_norm": 1.1276765877928783, + "learning_rate": 1.0856686197719408e-05, + "loss": 3.5496, + "step": 24600 + }, + { + "epoch": 2.529810855263158, + "grad_norm": 0.9707613031464609, + "learning_rate": 1.083559526813015e-05, + "loss": 3.5231, + "step": 24610 + }, + { + "epoch": 2.5308388157894735, + "grad_norm": 0.8607805375535765, + "learning_rate": 1.0814539178479413e-05, + "loss": 3.5308, + "step": 24620 + }, + { + "epoch": 2.5318667763157894, + "grad_norm": 0.9451002718189573, + "learning_rate": 1.0793517951934958e-05, + "loss": 3.6129, + "step": 24630 + }, + { + "epoch": 2.5328947368421053, + "grad_norm": 0.7793782127210145, + "learning_rate": 1.0772531611626156e-05, + "loss": 3.6267, + "step": 24640 + }, + { + "epoch": 2.5339226973684212, + "grad_norm": 0.9949629704375744, + "learning_rate": 1.075158018064403e-05, + "loss": 3.537, + "step": 24650 + }, + { + "epoch": 2.5349506578947367, + "grad_norm": 1.5633310015538333, + "learning_rate": 1.0730663682041174e-05, + "loss": 3.5513, + "step": 24660 + }, + { + "epoch": 2.5359786184210527, + "grad_norm": 1.1476399253686544, + "learning_rate": 1.0709782138831718e-05, + "loss": 3.623, + "step": 24670 + }, + { + "epoch": 2.5370065789473686, + "grad_norm": 0.753935778736299, + "learning_rate": 1.0688935573991368e-05, + "loss": 3.5761, + "step": 24680 + }, + { + "epoch": 2.538034539473684, + "grad_norm": 0.7527322275223162, + "learning_rate": 1.0668124010457328e-05, + "loss": 3.5766, + "step": 24690 + }, + { + "epoch": 2.5390625, + "grad_norm": 0.8043630742253819, + "learning_rate": 1.0647347471128305e-05, + "loss": 3.6007, + "step": 24700 + }, + { + "epoch": 2.540090460526316, + "grad_norm": 0.7335712823484272, + "learning_rate": 1.0626605978864464e-05, + "loss": 3.4847, + "step": 24710 + }, + { + "epoch": 2.5411184210526314, + "grad_norm": 0.7433487891898285, + "learning_rate": 1.0605899556487379e-05, + "loss": 3.5846, + "step": 24720 + }, + { + "epoch": 2.5421463815789473, + "grad_norm": 0.9505560916301118, + "learning_rate": 1.0585228226780076e-05, + "loss": 3.6301, + "step": 24730 + }, + { + "epoch": 2.5431743421052633, + "grad_norm": 1.3914814572622018, + "learning_rate": 1.0564592012486943e-05, + "loss": 3.6356, + "step": 24740 + }, + { + "epoch": 2.5442023026315788, + "grad_norm": 0.7863365904366099, + "learning_rate": 1.054399093631376e-05, + "loss": 3.5052, + "step": 24750 + }, + { + "epoch": 2.5452302631578947, + "grad_norm": 0.7748555756405165, + "learning_rate": 1.052342502092762e-05, + "loss": 3.6494, + "step": 24760 + }, + { + "epoch": 2.5462582236842106, + "grad_norm": 0.8645395283153599, + "learning_rate": 1.0502894288956937e-05, + "loss": 3.4711, + "step": 24770 + }, + { + "epoch": 2.5472861842105265, + "grad_norm": 0.9982657593787909, + "learning_rate": 1.0482398762991408e-05, + "loss": 3.5408, + "step": 24780 + }, + { + "epoch": 2.548314144736842, + "grad_norm": 0.8818536463044002, + "learning_rate": 1.0461938465581984e-05, + "loss": 3.5327, + "step": 24790 + }, + { + "epoch": 2.549342105263158, + "grad_norm": 0.6283407571423182, + "learning_rate": 1.0441513419240896e-05, + "loss": 3.5923, + "step": 24800 + }, + { + "epoch": 2.5503700657894735, + "grad_norm": 0.8381280095944571, + "learning_rate": 1.042112364644154e-05, + "loss": 3.6211, + "step": 24810 + }, + { + "epoch": 2.5513980263157894, + "grad_norm": 0.6676428373432668, + "learning_rate": 1.0400769169618545e-05, + "loss": 3.615, + "step": 24820 + }, + { + "epoch": 2.5524259868421053, + "grad_norm": 0.9537419995513574, + "learning_rate": 
1.0380450011167653e-05, + "loss": 3.5345, + "step": 24830 + }, + { + "epoch": 2.5534539473684212, + "grad_norm": 0.7897672621822944, + "learning_rate": 1.0360166193445777e-05, + "loss": 3.5595, + "step": 24840 + }, + { + "epoch": 2.5544819078947367, + "grad_norm": 0.8115321777460663, + "learning_rate": 1.033991773877096e-05, + "loss": 3.6114, + "step": 24850 + }, + { + "epoch": 2.5555098684210527, + "grad_norm": 0.8707484771294629, + "learning_rate": 1.0319704669422301e-05, + "loss": 3.5655, + "step": 24860 + }, + { + "epoch": 2.5565378289473686, + "grad_norm": 1.0975361822850607, + "learning_rate": 1.029952700764e-05, + "loss": 3.6293, + "step": 24870 + }, + { + "epoch": 2.557565789473684, + "grad_norm": 0.9137771030482083, + "learning_rate": 1.0279384775625254e-05, + "loss": 3.5943, + "step": 24880 + }, + { + "epoch": 2.55859375, + "grad_norm": 0.9866561193213885, + "learning_rate": 1.025927799554031e-05, + "loss": 3.5912, + "step": 24890 + }, + { + "epoch": 2.559621710526316, + "grad_norm": 1.412716556928865, + "learning_rate": 1.0239206689508414e-05, + "loss": 3.5519, + "step": 24900 + }, + { + "epoch": 2.5606496710526314, + "grad_norm": 0.9086909142953565, + "learning_rate": 1.0219170879613753e-05, + "loss": 3.5244, + "step": 24910 + }, + { + "epoch": 2.5616776315789473, + "grad_norm": 0.7278261578311502, + "learning_rate": 1.0199170587901488e-05, + "loss": 3.5382, + "step": 24920 + }, + { + "epoch": 2.5627055921052633, + "grad_norm": 0.9383494978834219, + "learning_rate": 1.0179205836377665e-05, + "loss": 3.5622, + "step": 24930 + }, + { + "epoch": 2.5637335526315788, + "grad_norm": 0.7732370290177059, + "learning_rate": 1.0159276647009241e-05, + "loss": 3.5143, + "step": 24940 + }, + { + "epoch": 2.5647615131578947, + "grad_norm": 0.8428231906460366, + "learning_rate": 1.013938304172407e-05, + "loss": 3.5947, + "step": 24950 + }, + { + "epoch": 2.5657894736842106, + "grad_norm": 0.8761906016810744, + "learning_rate": 1.0119525042410816e-05, + "loss": 3.5776, + "step": 24960 + }, + { + "epoch": 2.5668174342105265, + "grad_norm": 1.2156673473052566, + "learning_rate": 1.0099702670918983e-05, + "loss": 3.5707, + "step": 24970 + }, + { + "epoch": 2.567845394736842, + "grad_norm": 1.0014713906493098, + "learning_rate": 1.0079915949058883e-05, + "loss": 3.5119, + "step": 24980 + }, + { + "epoch": 2.568873355263158, + "grad_norm": 0.8978461268096349, + "learning_rate": 1.0060164898601564e-05, + "loss": 3.4809, + "step": 24990 + }, + { + "epoch": 2.5699013157894735, + "grad_norm": 0.6389533539627662, + "learning_rate": 1.004044954127888e-05, + "loss": 3.59, + "step": 25000 + }, + { + "epoch": 2.5709292763157894, + "grad_norm": 0.9454436723481523, + "learning_rate": 1.0020769898783378e-05, + "loss": 3.5921, + "step": 25010 + }, + { + "epoch": 2.5719572368421053, + "grad_norm": 0.9897332964588764, + "learning_rate": 1.0001125992768317e-05, + "loss": 3.5743, + "step": 25020 + }, + { + "epoch": 2.5729851973684212, + "grad_norm": 1.0083922263882716, + "learning_rate": 9.98151784484764e-06, + "loss": 3.5958, + "step": 25030 + }, + { + "epoch": 2.5740131578947367, + "grad_norm": 1.648641853863595, + "learning_rate": 9.96194547659592e-06, + "loss": 3.5661, + "step": 25040 + }, + { + "epoch": 2.5750411184210527, + "grad_norm": 1.2294856912867003, + "learning_rate": 9.942408909548407e-06, + "loss": 3.6405, + "step": 25050 + }, + { + "epoch": 2.5760690789473686, + "grad_norm": 0.8065093443956516, + "learning_rate": 9.922908165200925e-06, + "loss": 3.4932, + "step": 25060 + }, + { + "epoch": 
2.577097039473684, + "grad_norm": 0.7067880791902889, + "learning_rate": 9.903443265009894e-06, + "loss": 3.599, + "step": 25070 + }, + { + "epoch": 2.578125, + "grad_norm": 0.7678423322472244, + "learning_rate": 9.8840142303923e-06, + "loss": 3.5188, + "step": 25080 + }, + { + "epoch": 2.579152960526316, + "grad_norm": 1.3248796642288667, + "learning_rate": 9.864621082725645e-06, + "loss": 3.5944, + "step": 25090 + }, + { + "epoch": 2.5801809210526314, + "grad_norm": 0.8491200759118217, + "learning_rate": 9.845263843347965e-06, + "loss": 3.6013, + "step": 25100 + }, + { + "epoch": 2.5812088815789473, + "grad_norm": 0.7730007672546645, + "learning_rate": 9.825942533557795e-06, + "loss": 3.5414, + "step": 25110 + }, + { + "epoch": 2.5822368421052633, + "grad_norm": 1.0587709848605449, + "learning_rate": 9.806657174614117e-06, + "loss": 3.59, + "step": 25120 + }, + { + "epoch": 2.5832648026315788, + "grad_norm": 0.9456486695347165, + "learning_rate": 9.787407787736365e-06, + "loss": 3.5081, + "step": 25130 + }, + { + "epoch": 2.5842927631578947, + "grad_norm": 0.843308651252592, + "learning_rate": 9.7681943941044e-06, + "loss": 3.6212, + "step": 25140 + }, + { + "epoch": 2.5853207236842106, + "grad_norm": 0.6578515582596857, + "learning_rate": 9.749017014858448e-06, + "loss": 3.6045, + "step": 25150 + }, + { + "epoch": 2.5863486842105265, + "grad_norm": 0.8972887075242694, + "learning_rate": 9.729875671099156e-06, + "loss": 3.5473, + "step": 25160 + }, + { + "epoch": 2.587376644736842, + "grad_norm": 1.054929691340264, + "learning_rate": 9.710770383887493e-06, + "loss": 3.6017, + "step": 25170 + }, + { + "epoch": 2.588404605263158, + "grad_norm": 0.7352968882059745, + "learning_rate": 9.69170117424476e-06, + "loss": 3.5406, + "step": 25180 + }, + { + "epoch": 2.5894325657894735, + "grad_norm": 0.9292079425989584, + "learning_rate": 9.672668063152576e-06, + "loss": 3.6178, + "step": 25190 + }, + { + "epoch": 2.5904605263157894, + "grad_norm": 1.0260887457927572, + "learning_rate": 9.653671071552806e-06, + "loss": 3.6426, + "step": 25200 + }, + { + "epoch": 2.5914884868421053, + "grad_norm": 0.9868404153340912, + "learning_rate": 9.634710220347612e-06, + "loss": 3.6329, + "step": 25210 + }, + { + "epoch": 2.5925164473684212, + "grad_norm": 0.8682487455073948, + "learning_rate": 9.615785530399374e-06, + "loss": 3.611, + "step": 25220 + }, + { + "epoch": 2.5935444078947367, + "grad_norm": 0.6981241619955445, + "learning_rate": 9.59689702253068e-06, + "loss": 3.573, + "step": 25230 + }, + { + "epoch": 2.5945723684210527, + "grad_norm": 1.0808610889240624, + "learning_rate": 9.578044717524328e-06, + "loss": 3.5587, + "step": 25240 + }, + { + "epoch": 2.5956003289473686, + "grad_norm": 0.6980856766720802, + "learning_rate": 9.559228636123242e-06, + "loss": 3.6195, + "step": 25250 + }, + { + "epoch": 2.596628289473684, + "grad_norm": 1.0928235626315272, + "learning_rate": 9.540448799030535e-06, + "loss": 3.6205, + "step": 25260 + }, + { + "epoch": 2.59765625, + "grad_norm": 0.7795348604737621, + "learning_rate": 9.521705226909416e-06, + "loss": 3.5712, + "step": 25270 + }, + { + "epoch": 2.598684210526316, + "grad_norm": 1.143215581976862, + "learning_rate": 9.502997940383197e-06, + "loss": 3.568, + "step": 25280 + }, + { + "epoch": 2.5997121710526314, + "grad_norm": 1.2315853320630663, + "learning_rate": 9.484326960035267e-06, + "loss": 3.6105, + "step": 25290 + }, + { + "epoch": 2.6007401315789473, + "grad_norm": 1.4193531350820559, + "learning_rate": 9.465692306409064e-06, + "loss": 3.5556, + 
"step": 25300 + }, + { + "epoch": 2.6017680921052633, + "grad_norm": 0.8999862259234633, + "learning_rate": 9.447094000008055e-06, + "loss": 3.5717, + "step": 25310 + }, + { + "epoch": 2.6027960526315788, + "grad_norm": 1.1581160462759532, + "learning_rate": 9.428532061295726e-06, + "loss": 3.5382, + "step": 25320 + }, + { + "epoch": 2.6038240131578947, + "grad_norm": 1.0649161075628684, + "learning_rate": 9.410006510695535e-06, + "loss": 3.5523, + "step": 25330 + }, + { + "epoch": 2.6048519736842106, + "grad_norm": 0.8142340569017241, + "learning_rate": 9.391517368590908e-06, + "loss": 3.5825, + "step": 25340 + }, + { + "epoch": 2.6058799342105265, + "grad_norm": 1.0338215410415952, + "learning_rate": 9.373064655325212e-06, + "loss": 3.5115, + "step": 25350 + }, + { + "epoch": 2.606907894736842, + "grad_norm": 0.7284062280410645, + "learning_rate": 9.354648391201731e-06, + "loss": 3.6139, + "step": 25360 + }, + { + "epoch": 2.607935855263158, + "grad_norm": 0.7566161790324453, + "learning_rate": 9.336268596483642e-06, + "loss": 3.5818, + "step": 25370 + }, + { + "epoch": 2.6089638157894735, + "grad_norm": 0.7179508742472123, + "learning_rate": 9.317925291394e-06, + "loss": 3.5394, + "step": 25380 + }, + { + "epoch": 2.6099917763157894, + "grad_norm": 0.7584354946642795, + "learning_rate": 9.299618496115714e-06, + "loss": 3.6212, + "step": 25390 + }, + { + "epoch": 2.6110197368421053, + "grad_norm": 0.7425111968226971, + "learning_rate": 9.281348230791504e-06, + "loss": 3.6243, + "step": 25400 + }, + { + "epoch": 2.6120476973684212, + "grad_norm": 0.8414361096594286, + "learning_rate": 9.263114515523914e-06, + "loss": 3.6409, + "step": 25410 + }, + { + "epoch": 2.6130756578947367, + "grad_norm": 0.742520749415483, + "learning_rate": 9.24491737037527e-06, + "loss": 3.5446, + "step": 25420 + }, + { + "epoch": 2.6141036184210527, + "grad_norm": 1.187841215802708, + "learning_rate": 9.226756815367655e-06, + "loss": 3.5231, + "step": 25430 + }, + { + "epoch": 2.6151315789473686, + "grad_norm": 1.4345723899374423, + "learning_rate": 9.208632870482893e-06, + "loss": 3.5793, + "step": 25440 + }, + { + "epoch": 2.616159539473684, + "grad_norm": 1.0963775136942955, + "learning_rate": 9.190545555662535e-06, + "loss": 3.6102, + "step": 25450 + }, + { + "epoch": 2.6171875, + "grad_norm": 0.767090232261399, + "learning_rate": 9.172494890807813e-06, + "loss": 3.5056, + "step": 25460 + }, + { + "epoch": 2.618215460526316, + "grad_norm": 0.800978733925805, + "learning_rate": 9.154480895779646e-06, + "loss": 3.6341, + "step": 25470 + }, + { + "epoch": 2.6192434210526314, + "grad_norm": 1.4663281866562703, + "learning_rate": 9.136503590398605e-06, + "loss": 3.5158, + "step": 25480 + }, + { + "epoch": 2.6202713815789473, + "grad_norm": 0.8523251329551407, + "learning_rate": 9.11856299444488e-06, + "loss": 3.5545, + "step": 25490 + }, + { + "epoch": 2.6212993421052633, + "grad_norm": 1.0974716085528544, + "learning_rate": 9.100659127658286e-06, + "loss": 3.6544, + "step": 25500 + }, + { + "epoch": 2.6223273026315788, + "grad_norm": 0.8889656335927125, + "learning_rate": 9.08279200973822e-06, + "loss": 3.5615, + "step": 25510 + }, + { + "epoch": 2.6233552631578947, + "grad_norm": 0.9498680625448336, + "learning_rate": 9.064961660343632e-06, + "loss": 3.6, + "step": 25520 + }, + { + "epoch": 2.6243832236842106, + "grad_norm": 0.835307730772801, + "learning_rate": 9.047168099093037e-06, + "loss": 3.5309, + "step": 25530 + }, + { + "epoch": 2.6254111842105265, + "grad_norm": 0.6494731904840623, + "learning_rate": 
9.02941134556446e-06, + "loss": 3.5759, + "step": 25540 + }, + { + "epoch": 2.626439144736842, + "grad_norm": 1.1829317967512225, + "learning_rate": 9.011691419295428e-06, + "loss": 3.5476, + "step": 25550 + }, + { + "epoch": 2.627467105263158, + "grad_norm": 1.0798865039670582, + "learning_rate": 8.994008339782948e-06, + "loss": 3.5414, + "step": 25560 + }, + { + "epoch": 2.6284950657894735, + "grad_norm": 0.9321830657843291, + "learning_rate": 8.976362126483485e-06, + "loss": 3.5945, + "step": 25570 + }, + { + "epoch": 2.6295230263157894, + "grad_norm": 0.7562329061798899, + "learning_rate": 8.958752798812946e-06, + "loss": 3.5739, + "step": 25580 + }, + { + "epoch": 2.6305509868421053, + "grad_norm": 0.7867910342453762, + "learning_rate": 8.941180376146643e-06, + "loss": 3.5678, + "step": 25590 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.7328867746546213, + "learning_rate": 8.92364487781929e-06, + "loss": 3.514, + "step": 25600 + }, + { + "epoch": 2.6326069078947367, + "grad_norm": 0.8502238189313975, + "learning_rate": 8.906146323124972e-06, + "loss": 3.6269, + "step": 25610 + }, + { + "epoch": 2.6336348684210527, + "grad_norm": 0.9364209959389443, + "learning_rate": 8.888684731317121e-06, + "loss": 3.5696, + "step": 25620 + }, + { + "epoch": 2.6346628289473686, + "grad_norm": 0.7553962254550873, + "learning_rate": 8.871260121608505e-06, + "loss": 3.5154, + "step": 25630 + }, + { + "epoch": 2.635690789473684, + "grad_norm": 0.8570469968700098, + "learning_rate": 8.853872513171196e-06, + "loss": 3.6634, + "step": 25640 + }, + { + "epoch": 2.63671875, + "grad_norm": 0.5985956973943098, + "learning_rate": 8.836521925136561e-06, + "loss": 3.5514, + "step": 25650 + }, + { + "epoch": 2.637746710526316, + "grad_norm": 0.7022438886382645, + "learning_rate": 8.819208376595228e-06, + "loss": 3.5832, + "step": 25660 + }, + { + "epoch": 2.6387746710526314, + "grad_norm": 1.6275167782529534, + "learning_rate": 8.801931886597073e-06, + "loss": 3.6101, + "step": 25670 + }, + { + "epoch": 2.6398026315789473, + "grad_norm": 1.1476500010466362, + "learning_rate": 8.784692474151192e-06, + "loss": 3.6216, + "step": 25680 + }, + { + "epoch": 2.6408305921052633, + "grad_norm": 1.0353123385230347, + "learning_rate": 8.767490158225895e-06, + "loss": 3.5812, + "step": 25690 + }, + { + "epoch": 2.6418585526315788, + "grad_norm": 1.1817281391114292, + "learning_rate": 8.750324957748663e-06, + "loss": 3.5389, + "step": 25700 + }, + { + "epoch": 2.6428865131578947, + "grad_norm": 0.7684430013353427, + "learning_rate": 8.733196891606152e-06, + "loss": 3.6841, + "step": 25710 + }, + { + "epoch": 2.6439144736842106, + "grad_norm": 0.8341973276311798, + "learning_rate": 8.716105978644154e-06, + "loss": 3.6371, + "step": 25720 + }, + { + "epoch": 2.6449424342105265, + "grad_norm": 0.7032032605168848, + "learning_rate": 8.699052237667579e-06, + "loss": 3.5707, + "step": 25730 + }, + { + "epoch": 2.645970394736842, + "grad_norm": 0.8090357654997633, + "learning_rate": 8.68203568744044e-06, + "loss": 3.5811, + "step": 25740 + }, + { + "epoch": 2.646998355263158, + "grad_norm": 0.7402439167142493, + "learning_rate": 8.665056346685826e-06, + "loss": 3.588, + "step": 25750 + }, + { + "epoch": 2.6480263157894735, + "grad_norm": 1.151863696435616, + "learning_rate": 8.648114234085898e-06, + "loss": 3.5972, + "step": 25760 + }, + { + "epoch": 2.6490542763157894, + "grad_norm": 1.2218739502260816, + "learning_rate": 8.631209368281824e-06, + "loss": 3.5146, + "step": 25770 + }, + { + "epoch": 2.6500822368421053, + 
"grad_norm": 1.1219027587516686, + "learning_rate": 8.61434176787384e-06, + "loss": 3.6261, + "step": 25780 + }, + { + "epoch": 2.6511101973684212, + "grad_norm": 1.1295253019905813, + "learning_rate": 8.597511451421127e-06, + "loss": 3.5137, + "step": 25790 + }, + { + "epoch": 2.6521381578947367, + "grad_norm": 0.9546297421129682, + "learning_rate": 8.580718437441882e-06, + "loss": 3.4841, + "step": 25800 + }, + { + "epoch": 2.6531661184210527, + "grad_norm": 0.706822591098535, + "learning_rate": 8.563962744413231e-06, + "loss": 3.5919, + "step": 25810 + }, + { + "epoch": 2.6541940789473686, + "grad_norm": 0.7484676283345011, + "learning_rate": 8.547244390771248e-06, + "loss": 3.64, + "step": 25820 + }, + { + "epoch": 2.655222039473684, + "grad_norm": 0.8500065017900317, + "learning_rate": 8.53056339491094e-06, + "loss": 3.5106, + "step": 25830 + }, + { + "epoch": 2.65625, + "grad_norm": 1.0809457339800044, + "learning_rate": 8.513919775186173e-06, + "loss": 3.6026, + "step": 25840 + }, + { + "epoch": 2.657277960526316, + "grad_norm": 1.378576136509715, + "learning_rate": 8.49731354990972e-06, + "loss": 3.5107, + "step": 25850 + }, + { + "epoch": 2.6583059210526314, + "grad_norm": 0.9082645846665112, + "learning_rate": 8.48074473735319e-06, + "loss": 3.5446, + "step": 25860 + }, + { + "epoch": 2.6593338815789473, + "grad_norm": 0.7501991722764422, + "learning_rate": 8.46421335574704e-06, + "loss": 3.6202, + "step": 25870 + }, + { + "epoch": 2.6603618421052633, + "grad_norm": 0.8072089532297039, + "learning_rate": 8.447719423280545e-06, + "loss": 3.6097, + "step": 25880 + }, + { + "epoch": 2.6613898026315788, + "grad_norm": 1.1035953459173873, + "learning_rate": 8.431262958101751e-06, + "loss": 3.529, + "step": 25890 + }, + { + "epoch": 2.6624177631578947, + "grad_norm": 0.7565391303088536, + "learning_rate": 8.414843978317505e-06, + "loss": 3.5273, + "step": 25900 + }, + { + "epoch": 2.6634457236842106, + "grad_norm": 1.0530048086703236, + "learning_rate": 8.398462501993399e-06, + "loss": 3.6216, + "step": 25910 + }, + { + "epoch": 2.6644736842105265, + "grad_norm": 0.9111491014796965, + "learning_rate": 8.382118547153756e-06, + "loss": 3.5813, + "step": 25920 + }, + { + "epoch": 2.665501644736842, + "grad_norm": 0.682307138825694, + "learning_rate": 8.36581213178163e-06, + "loss": 3.5766, + "step": 25930 + }, + { + "epoch": 2.666529605263158, + "grad_norm": 0.6197779651270708, + "learning_rate": 8.349543273818762e-06, + "loss": 3.5245, + "step": 25940 + }, + { + "epoch": 2.6675575657894735, + "grad_norm": 0.739792785178173, + "learning_rate": 8.333311991165556e-06, + "loss": 3.5307, + "step": 25950 + }, + { + "epoch": 2.6685855263157894, + "grad_norm": 1.0761902075450496, + "learning_rate": 8.317118301681087e-06, + "loss": 3.5556, + "step": 25960 + }, + { + "epoch": 2.6696134868421053, + "grad_norm": 0.774889305884824, + "learning_rate": 8.30096222318306e-06, + "loss": 3.6049, + "step": 25970 + }, + { + "epoch": 2.6706414473684212, + "grad_norm": 0.833428333748415, + "learning_rate": 8.284843773447815e-06, + "loss": 3.6245, + "step": 25980 + }, + { + "epoch": 2.6716694078947367, + "grad_norm": 0.8596826176173665, + "learning_rate": 8.268762970210276e-06, + "loss": 3.5268, + "step": 25990 + }, + { + "epoch": 2.6726973684210527, + "grad_norm": 0.8092354333796287, + "learning_rate": 8.25271983116393e-06, + "loss": 3.5742, + "step": 26000 + }, + { + "epoch": 2.6737253289473686, + "grad_norm": 1.063554913285444, + "learning_rate": 8.236714373960842e-06, + "loss": 3.5601, + "step": 26010 + }, 
+ { + "epoch": 2.674753289473684, + "grad_norm": 0.9540964397812195, + "learning_rate": 8.220746616211614e-06, + "loss": 3.545, + "step": 26020 + }, + { + "epoch": 2.67578125, + "grad_norm": 0.9522320363440239, + "learning_rate": 8.204816575485359e-06, + "loss": 3.5742, + "step": 26030 + }, + { + "epoch": 2.676809210526316, + "grad_norm": 0.6176722349283695, + "learning_rate": 8.188924269309715e-06, + "loss": 3.5704, + "step": 26040 + }, + { + "epoch": 2.6778371710526314, + "grad_norm": 0.6482116121929811, + "learning_rate": 8.173069715170761e-06, + "loss": 3.5173, + "step": 26050 + }, + { + "epoch": 2.6788651315789473, + "grad_norm": 0.9741263253207733, + "learning_rate": 8.157252930513072e-06, + "loss": 3.549, + "step": 26060 + }, + { + "epoch": 2.6798930921052633, + "grad_norm": 0.9345583860140584, + "learning_rate": 8.141473932739647e-06, + "loss": 3.6454, + "step": 26070 + }, + { + "epoch": 2.6809210526315788, + "grad_norm": 0.7627622841164983, + "learning_rate": 8.125732739211913e-06, + "loss": 3.6077, + "step": 26080 + }, + { + "epoch": 2.6819490131578947, + "grad_norm": 0.8164123096880003, + "learning_rate": 8.110029367249711e-06, + "loss": 3.5847, + "step": 26090 + }, + { + "epoch": 2.6829769736842106, + "grad_norm": 0.9827783442203846, + "learning_rate": 8.094363834131264e-06, + "loss": 3.5419, + "step": 26100 + }, + { + "epoch": 2.6840049342105265, + "grad_norm": 0.8536270383139342, + "learning_rate": 8.078736157093144e-06, + "loss": 3.6154, + "step": 26110 + }, + { + "epoch": 2.685032894736842, + "grad_norm": 0.8449790621313458, + "learning_rate": 8.063146353330292e-06, + "loss": 3.5127, + "step": 26120 + }, + { + "epoch": 2.686060855263158, + "grad_norm": 1.0386219094414433, + "learning_rate": 8.04759443999596e-06, + "loss": 3.5384, + "step": 26130 + }, + { + "epoch": 2.6870888157894735, + "grad_norm": 1.1798615485126855, + "learning_rate": 8.032080434201729e-06, + "loss": 3.6023, + "step": 26140 + }, + { + "epoch": 2.6881167763157894, + "grad_norm": 0.8674406421042943, + "learning_rate": 8.016604353017467e-06, + "loss": 3.5791, + "step": 26150 + }, + { + "epoch": 2.6891447368421053, + "grad_norm": 0.7625899590128093, + "learning_rate": 8.001166213471292e-06, + "loss": 3.5225, + "step": 26160 + }, + { + "epoch": 2.6901726973684212, + "grad_norm": 1.1631740789702465, + "learning_rate": 7.985766032549598e-06, + "loss": 3.6342, + "step": 26170 + }, + { + "epoch": 2.6912006578947367, + "grad_norm": 0.7720648404696359, + "learning_rate": 7.970403827196998e-06, + "loss": 3.5866, + "step": 26180 + }, + { + "epoch": 2.6922286184210527, + "grad_norm": 0.8683243937102365, + "learning_rate": 7.955079614316339e-06, + "loss": 3.6306, + "step": 26190 + }, + { + "epoch": 2.6932565789473686, + "grad_norm": 0.9883323324518996, + "learning_rate": 7.939793410768667e-06, + "loss": 3.597, + "step": 26200 + }, + { + "epoch": 2.694284539473684, + "grad_norm": 1.0086817578155187, + "learning_rate": 7.924545233373172e-06, + "loss": 3.5879, + "step": 26210 + }, + { + "epoch": 2.6953125, + "grad_norm": 0.7127325013259533, + "learning_rate": 7.90933509890724e-06, + "loss": 3.5382, + "step": 26220 + }, + { + "epoch": 2.696340460526316, + "grad_norm": 0.8197555831342119, + "learning_rate": 7.894163024106385e-06, + "loss": 3.5733, + "step": 26230 + }, + { + "epoch": 2.6973684210526314, + "grad_norm": 0.9803813576220395, + "learning_rate": 7.87902902566425e-06, + "loss": 3.6924, + "step": 26240 + }, + { + "epoch": 2.6983963815789473, + "grad_norm": 0.7416759936857065, + "learning_rate": 
7.863933120232583e-06, + "loss": 3.6042, + "step": 26250 + }, + { + "epoch": 2.6994243421052633, + "grad_norm": 0.9678466645992237, + "learning_rate": 7.848875324421202e-06, + "loss": 3.5939, + "step": 26260 + }, + { + "epoch": 2.7004523026315788, + "grad_norm": 0.9077790285331683, + "learning_rate": 7.833855654798012e-06, + "loss": 3.5322, + "step": 26270 + }, + { + "epoch": 2.7014802631578947, + "grad_norm": 0.985992344269425, + "learning_rate": 7.81887412788896e-06, + "loss": 3.4839, + "step": 26280 + }, + { + "epoch": 2.7025082236842106, + "grad_norm": 0.9718429351619721, + "learning_rate": 7.803930760178042e-06, + "loss": 3.6082, + "step": 26290 + }, + { + "epoch": 2.7035361842105265, + "grad_norm": 0.6889976531502295, + "learning_rate": 7.789025568107242e-06, + "loss": 3.5609, + "step": 26300 + }, + { + "epoch": 2.704564144736842, + "grad_norm": 0.7833817960749604, + "learning_rate": 7.774158568076562e-06, + "loss": 3.5666, + "step": 26310 + }, + { + "epoch": 2.705592105263158, + "grad_norm": 0.979425551638039, + "learning_rate": 7.759329776443955e-06, + "loss": 3.6312, + "step": 26320 + }, + { + "epoch": 2.7066200657894735, + "grad_norm": 1.2913344273842382, + "learning_rate": 7.744539209525355e-06, + "loss": 3.5717, + "step": 26330 + }, + { + "epoch": 2.7076480263157894, + "grad_norm": 1.0002140173776695, + "learning_rate": 7.729786883594638e-06, + "loss": 3.5278, + "step": 26340 + }, + { + "epoch": 2.7086759868421053, + "grad_norm": 1.8267416651613801, + "learning_rate": 7.715072814883603e-06, + "loss": 3.5638, + "step": 26350 + }, + { + "epoch": 2.7097039473684212, + "grad_norm": 0.9701420104397471, + "learning_rate": 7.700397019581955e-06, + "loss": 3.5839, + "step": 26360 + }, + { + "epoch": 2.7107319078947367, + "grad_norm": 0.6964669234506513, + "learning_rate": 7.685759513837265e-06, + "loss": 3.6066, + "step": 26370 + }, + { + "epoch": 2.7117598684210527, + "grad_norm": 0.6232741847083698, + "learning_rate": 7.671160313754996e-06, + "loss": 3.6167, + "step": 26380 + }, + { + "epoch": 2.7127878289473686, + "grad_norm": 1.1184505445471602, + "learning_rate": 7.656599435398475e-06, + "loss": 3.5787, + "step": 26390 + }, + { + "epoch": 2.713815789473684, + "grad_norm": 1.167033098213999, + "learning_rate": 7.642076894788843e-06, + "loss": 3.5449, + "step": 26400 + }, + { + "epoch": 2.71484375, + "grad_norm": 0.8628170439680981, + "learning_rate": 7.627592707905069e-06, + "loss": 3.5222, + "step": 26410 + }, + { + "epoch": 2.715871710526316, + "grad_norm": 0.860368812899032, + "learning_rate": 7.613146890683903e-06, + "loss": 3.5471, + "step": 26420 + }, + { + "epoch": 2.7168996710526314, + "grad_norm": 0.7903846013374192, + "learning_rate": 7.598739459019905e-06, + "loss": 3.5252, + "step": 26430 + }, + { + "epoch": 2.7179276315789473, + "grad_norm": 0.7987195526433037, + "learning_rate": 7.584370428765387e-06, + "loss": 3.5506, + "step": 26440 + }, + { + "epoch": 2.7189555921052633, + "grad_norm": 1.2460717943702757, + "learning_rate": 7.5700398157304055e-06, + "loss": 3.5821, + "step": 26450 + }, + { + "epoch": 2.7199835526315788, + "grad_norm": 0.9082183508468332, + "learning_rate": 7.555747635682753e-06, + "loss": 3.6475, + "step": 26460 + }, + { + "epoch": 2.7210115131578947, + "grad_norm": 0.8909701966675386, + "learning_rate": 7.541493904347939e-06, + "loss": 3.5558, + "step": 26470 + }, + { + "epoch": 2.7220394736842106, + "grad_norm": 0.6663906574102124, + "learning_rate": 7.527278637409139e-06, + "loss": 3.6233, + "step": 26480 + }, + { + "epoch": 2.7230674342105265, 
+ "grad_norm": 1.6619035927391963, + "learning_rate": 7.513101850507251e-06, + "loss": 3.5508, + "step": 26490 + }, + { + "epoch": 2.724095394736842, + "grad_norm": 0.963669220325786, + "learning_rate": 7.4989635592408e-06, + "loss": 3.5619, + "step": 26500 + }, + { + "epoch": 2.725123355263158, + "grad_norm": 0.932519749849026, + "learning_rate": 7.484863779165972e-06, + "loss": 3.5325, + "step": 26510 + }, + { + "epoch": 2.7261513157894735, + "grad_norm": 0.7105566408934302, + "learning_rate": 7.470802525796579e-06, + "loss": 3.6368, + "step": 26520 + }, + { + "epoch": 2.7271792763157894, + "grad_norm": 0.779966902886519, + "learning_rate": 7.456779814604023e-06, + "loss": 3.4654, + "step": 26530 + }, + { + "epoch": 2.7282072368421053, + "grad_norm": 1.0018547857557192, + "learning_rate": 7.442795661017326e-06, + "loss": 3.5017, + "step": 26540 + }, + { + "epoch": 2.7292351973684212, + "grad_norm": 0.8699010372877926, + "learning_rate": 7.4288500804230645e-06, + "loss": 3.554, + "step": 26550 + }, + { + "epoch": 2.7302631578947367, + "grad_norm": 0.9128626586129976, + "learning_rate": 7.4149430881653895e-06, + "loss": 3.4551, + "step": 26560 + }, + { + "epoch": 2.7312911184210527, + "grad_norm": 1.1372940338123174, + "learning_rate": 7.401074699545987e-06, + "loss": 3.5081, + "step": 26570 + }, + { + "epoch": 2.7323190789473686, + "grad_norm": 0.8475625474180275, + "learning_rate": 7.387244929824061e-06, + "loss": 3.534, + "step": 26580 + }, + { + "epoch": 2.733347039473684, + "grad_norm": 0.7760192859332494, + "learning_rate": 7.373453794216321e-06, + "loss": 3.6123, + "step": 26590 + }, + { + "epoch": 2.734375, + "grad_norm": 0.846606857528958, + "learning_rate": 7.359701307896998e-06, + "loss": 3.6141, + "step": 26600 + }, + { + "epoch": 2.735402960526316, + "grad_norm": 0.7595228577138219, + "learning_rate": 7.345987485997764e-06, + "loss": 3.4701, + "step": 26610 + }, + { + "epoch": 2.7364309210526314, + "grad_norm": 0.889131397869464, + "learning_rate": 7.3323123436077636e-06, + "loss": 3.5924, + "step": 26620 + }, + { + "epoch": 2.7374588815789473, + "grad_norm": 0.8488600309285399, + "learning_rate": 7.318675895773586e-06, + "loss": 3.5701, + "step": 26630 + }, + { + "epoch": 2.7384868421052633, + "grad_norm": 0.8412888979031563, + "learning_rate": 7.3050781574992234e-06, + "loss": 3.6266, + "step": 26640 + }, + { + "epoch": 2.7395148026315788, + "grad_norm": 1.2283064477653316, + "learning_rate": 7.291519143746108e-06, + "loss": 3.5591, + "step": 26650 + }, + { + "epoch": 2.7405427631578947, + "grad_norm": 0.6387371298208161, + "learning_rate": 7.27799886943305e-06, + "loss": 3.6245, + "step": 26660 + }, + { + "epoch": 2.7415707236842106, + "grad_norm": 0.7559610145699457, + "learning_rate": 7.2645173494362235e-06, + "loss": 3.5637, + "step": 26670 + }, + { + "epoch": 2.7425986842105265, + "grad_norm": 0.6466800819886576, + "learning_rate": 7.251074598589186e-06, + "loss": 3.6115, + "step": 26680 + }, + { + "epoch": 2.743626644736842, + "grad_norm": 0.8772451003908888, + "learning_rate": 7.237670631682799e-06, + "loss": 3.5152, + "step": 26690 + }, + { + "epoch": 2.744654605263158, + "grad_norm": 1.1905879978627818, + "learning_rate": 7.2243054634653e-06, + "loss": 3.6311, + "step": 26700 + }, + { + "epoch": 2.7456825657894735, + "grad_norm": 0.957321963628136, + "learning_rate": 7.2109791086422035e-06, + "loss": 3.5854, + "step": 26710 + }, + { + "epoch": 2.7467105263157894, + "grad_norm": 0.781893693866662, + "learning_rate": 7.197691581876326e-06, + "loss": 3.6004, + "step": 
26720 + }, + { + "epoch": 2.7477384868421053, + "grad_norm": 0.9388296870151733, + "learning_rate": 7.184442897787769e-06, + "loss": 3.6165, + "step": 26730 + }, + { + "epoch": 2.7487664473684212, + "grad_norm": 0.724951527060304, + "learning_rate": 7.171233070953876e-06, + "loss": 3.5444, + "step": 26740 + }, + { + "epoch": 2.7497944078947367, + "grad_norm": 0.9941774672480862, + "learning_rate": 7.158062115909261e-06, + "loss": 3.5859, + "step": 26750 + }, + { + "epoch": 2.7508223684210527, + "grad_norm": 0.7048209611662716, + "learning_rate": 7.144930047145759e-06, + "loss": 3.4885, + "step": 26760 + }, + { + "epoch": 2.7518503289473686, + "grad_norm": 0.8418958211928081, + "learning_rate": 7.13183687911241e-06, + "loss": 3.5647, + "step": 26770 + }, + { + "epoch": 2.752878289473684, + "grad_norm": 0.7721516873905987, + "learning_rate": 7.118782626215459e-06, + "loss": 3.5495, + "step": 26780 + }, + { + "epoch": 2.75390625, + "grad_norm": 0.7752096507850558, + "learning_rate": 7.10576730281834e-06, + "loss": 3.5414, + "step": 26790 + }, + { + "epoch": 2.754934210526316, + "grad_norm": 0.8752946698062553, + "learning_rate": 7.0927909232416385e-06, + "loss": 3.5934, + "step": 26800 + }, + { + "epoch": 2.7559621710526314, + "grad_norm": 0.7103047751708584, + "learning_rate": 7.079853501763105e-06, + "loss": 3.4853, + "step": 26810 + }, + { + "epoch": 2.7569901315789473, + "grad_norm": 1.082254324823534, + "learning_rate": 7.066955052617613e-06, + "loss": 3.4996, + "step": 26820 + }, + { + "epoch": 2.7580180921052633, + "grad_norm": 1.12018492542906, + "learning_rate": 7.054095589997161e-06, + "loss": 3.5561, + "step": 26830 + }, + { + "epoch": 2.7590460526315788, + "grad_norm": 0.6603822264932109, + "learning_rate": 7.041275128050857e-06, + "loss": 3.6348, + "step": 26840 + }, + { + "epoch": 2.7600740131578947, + "grad_norm": 0.8751289749458444, + "learning_rate": 7.028493680884886e-06, + "loss": 3.575, + "step": 26850 + }, + { + "epoch": 2.7611019736842106, + "grad_norm": 1.2440992794622574, + "learning_rate": 7.015751262562513e-06, + "loss": 3.468, + "step": 26860 + }, + { + "epoch": 2.7621299342105265, + "grad_norm": 0.7963903486682488, + "learning_rate": 7.003047887104053e-06, + "loss": 3.5991, + "step": 26870 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.6892817340957568, + "learning_rate": 6.99038356848687e-06, + "loss": 3.5709, + "step": 26880 + }, + { + "epoch": 2.764185855263158, + "grad_norm": 1.157586073260179, + "learning_rate": 6.977758320645353e-06, + "loss": 3.6318, + "step": 26890 + }, + { + "epoch": 2.7652138157894735, + "grad_norm": 1.10536258039407, + "learning_rate": 6.9651721574708965e-06, + "loss": 3.5682, + "step": 26900 + }, + { + "epoch": 2.7662417763157894, + "grad_norm": 1.4506333020460425, + "learning_rate": 6.952625092811899e-06, + "loss": 3.5091, + "step": 26910 + }, + { + "epoch": 2.7672697368421053, + "grad_norm": 0.7818426554551108, + "learning_rate": 6.940117140473733e-06, + "loss": 3.5938, + "step": 26920 + }, + { + "epoch": 2.7682976973684212, + "grad_norm": 0.6572283763208626, + "learning_rate": 6.927648314218735e-06, + "loss": 3.5306, + "step": 26930 + }, + { + "epoch": 2.7693256578947367, + "grad_norm": 0.8665966392310182, + "learning_rate": 6.915218627766197e-06, + "loss": 3.5009, + "step": 26940 + }, + { + "epoch": 2.7703536184210527, + "grad_norm": 0.6981889611077722, + "learning_rate": 6.902828094792347e-06, + "loss": 3.5566, + "step": 26950 + }, + { + "epoch": 2.7713815789473686, + "grad_norm": 0.6283817354914667, + "learning_rate": 
6.890476728930324e-06, + "loss": 3.5769, + "step": 26960 + }, + { + "epoch": 2.772409539473684, + "grad_norm": 0.9608012777397814, + "learning_rate": 6.87816454377018e-06, + "loss": 3.5035, + "step": 26970 + }, + { + "epoch": 2.7734375, + "grad_norm": 1.3589553185163958, + "learning_rate": 6.865891552858859e-06, + "loss": 3.5265, + "step": 26980 + }, + { + "epoch": 2.774465460526316, + "grad_norm": 1.2497594125007485, + "learning_rate": 6.8536577697001665e-06, + "loss": 3.5977, + "step": 26990 + }, + { + "epoch": 2.7754934210526314, + "grad_norm": 0.9416212097662904, + "learning_rate": 6.841463207754781e-06, + "loss": 3.5743, + "step": 27000 + }, + { + "epoch": 2.7765213815789473, + "grad_norm": 0.9282262522122613, + "learning_rate": 6.829307880440226e-06, + "loss": 3.5121, + "step": 27010 + }, + { + "epoch": 2.7775493421052633, + "grad_norm": 0.756054559784989, + "learning_rate": 6.817191801130849e-06, + "loss": 3.5507, + "step": 27020 + }, + { + "epoch": 2.7785773026315788, + "grad_norm": 0.8100829485818528, + "learning_rate": 6.805114983157818e-06, + "loss": 3.5511, + "step": 27030 + }, + { + "epoch": 2.7796052631578947, + "grad_norm": 0.8998451483812179, + "learning_rate": 6.7930774398091005e-06, + "loss": 3.5223, + "step": 27040 + }, + { + "epoch": 2.7806332236842106, + "grad_norm": 0.7722889627992977, + "learning_rate": 6.78107918432945e-06, + "loss": 3.5915, + "step": 27050 + }, + { + "epoch": 2.7816611842105265, + "grad_norm": 1.0330957598509016, + "learning_rate": 6.769120229920394e-06, + "loss": 3.6161, + "step": 27060 + }, + { + "epoch": 2.782689144736842, + "grad_norm": 0.8704782201484291, + "learning_rate": 6.757200589740217e-06, + "loss": 3.4856, + "step": 27070 + }, + { + "epoch": 2.783717105263158, + "grad_norm": 0.8108485384381552, + "learning_rate": 6.745320276903948e-06, + "loss": 3.5834, + "step": 27080 + }, + { + "epoch": 2.7847450657894735, + "grad_norm": 1.2197736695892372, + "learning_rate": 6.733479304483337e-06, + "loss": 3.6289, + "step": 27090 + }, + { + "epoch": 2.7857730263157894, + "grad_norm": 1.105277540297375, + "learning_rate": 6.721677685506862e-06, + "loss": 3.663, + "step": 27100 + }, + { + "epoch": 2.7868009868421053, + "grad_norm": 1.4006824056427105, + "learning_rate": 6.7099154329596875e-06, + "loss": 3.5258, + "step": 27110 + }, + { + "epoch": 2.7878289473684212, + "grad_norm": 0.7711178838630616, + "learning_rate": 6.698192559783668e-06, + "loss": 3.5061, + "step": 27120 + }, + { + "epoch": 2.7888569078947367, + "grad_norm": 0.5891243039601968, + "learning_rate": 6.686509078877337e-06, + "loss": 3.4884, + "step": 27130 + }, + { + "epoch": 2.7898848684210527, + "grad_norm": 1.022803190091633, + "learning_rate": 6.674865003095873e-06, + "loss": 3.5298, + "step": 27140 + }, + { + "epoch": 2.7909128289473686, + "grad_norm": 0.6705729487644423, + "learning_rate": 6.663260345251107e-06, + "loss": 3.5587, + "step": 27150 + }, + { + "epoch": 2.791940789473684, + "grad_norm": 0.6309839131636724, + "learning_rate": 6.651695118111496e-06, + "loss": 3.4609, + "step": 27160 + }, + { + "epoch": 2.79296875, + "grad_norm": 0.9476351616911763, + "learning_rate": 6.640169334402104e-06, + "loss": 3.5907, + "step": 27170 + }, + { + "epoch": 2.793996710526316, + "grad_norm": 0.9360144925809123, + "learning_rate": 6.6286830068046094e-06, + "loss": 3.4997, + "step": 27180 + }, + { + "epoch": 2.7950246710526314, + "grad_norm": 1.7843415051738782, + "learning_rate": 6.61723614795727e-06, + "loss": 3.6169, + "step": 27190 + }, + { + "epoch": 2.7960526315789473, + 
"grad_norm": 0.7774934480765162, + "learning_rate": 6.6058287704549205e-06, + "loss": 3.5872, + "step": 27200 + }, + { + "epoch": 2.7970805921052633, + "grad_norm": 0.9799888317775785, + "learning_rate": 6.5944608868489455e-06, + "loss": 3.5638, + "step": 27210 + }, + { + "epoch": 2.7981085526315788, + "grad_norm": 1.1114169924959765, + "learning_rate": 6.5831325096472846e-06, + "loss": 3.5264, + "step": 27220 + }, + { + "epoch": 2.7991365131578947, + "grad_norm": 0.978795420833862, + "learning_rate": 6.571843651314409e-06, + "loss": 3.5913, + "step": 27230 + }, + { + "epoch": 2.8001644736842106, + "grad_norm": 0.8516598567080729, + "learning_rate": 6.560594324271301e-06, + "loss": 3.5483, + "step": 27240 + }, + { + "epoch": 2.8011924342105265, + "grad_norm": 1.1284969340169935, + "learning_rate": 6.549384540895449e-06, + "loss": 3.5506, + "step": 27250 + }, + { + "epoch": 2.802220394736842, + "grad_norm": 0.8303548553540764, + "learning_rate": 6.5382143135208356e-06, + "loss": 3.6631, + "step": 27260 + }, + { + "epoch": 2.803248355263158, + "grad_norm": 1.7495038146538588, + "learning_rate": 6.527083654437931e-06, + "loss": 3.5331, + "step": 27270 + }, + { + "epoch": 2.8042763157894735, + "grad_norm": 0.8664471670796214, + "learning_rate": 6.515992575893639e-06, + "loss": 3.6031, + "step": 27280 + }, + { + "epoch": 2.8053042763157894, + "grad_norm": 0.9215349119654949, + "learning_rate": 6.504941090091338e-06, + "loss": 3.5804, + "step": 27290 + }, + { + "epoch": 2.8063322368421053, + "grad_norm": 0.7037173848699843, + "learning_rate": 6.493929209190836e-06, + "loss": 3.551, + "step": 27300 + }, + { + "epoch": 2.8073601973684212, + "grad_norm": 0.6896219019942113, + "learning_rate": 6.482956945308362e-06, + "loss": 3.5493, + "step": 27310 + }, + { + "epoch": 2.8083881578947367, + "grad_norm": 0.7554768227557146, + "learning_rate": 6.472024310516563e-06, + "loss": 3.5528, + "step": 27320 + }, + { + "epoch": 2.8094161184210527, + "grad_norm": 0.8842561446481398, + "learning_rate": 6.461131316844478e-06, + "loss": 3.6096, + "step": 27330 + }, + { + "epoch": 2.8104440789473686, + "grad_norm": 0.8512070990351558, + "learning_rate": 6.4502779762775156e-06, + "loss": 3.5379, + "step": 27340 + }, + { + "epoch": 2.811472039473684, + "grad_norm": 1.3271794262903873, + "learning_rate": 6.439464300757481e-06, + "loss": 3.4919, + "step": 27350 + }, + { + "epoch": 2.8125, + "grad_norm": 0.6720656412810931, + "learning_rate": 6.428690302182512e-06, + "loss": 3.4611, + "step": 27360 + }, + { + "epoch": 2.813527960526316, + "grad_norm": 1.0798528343175144, + "learning_rate": 6.417955992407115e-06, + "loss": 3.5064, + "step": 27370 + }, + { + "epoch": 2.8145559210526314, + "grad_norm": 1.0076420366299554, + "learning_rate": 6.4072613832421005e-06, + "loss": 3.536, + "step": 27380 + }, + { + "epoch": 2.8155838815789473, + "grad_norm": 0.9134228090037331, + "learning_rate": 6.396606486454615e-06, + "loss": 3.5501, + "step": 27390 + }, + { + "epoch": 2.8166118421052633, + "grad_norm": 0.8061545988313591, + "learning_rate": 6.385991313768099e-06, + "loss": 3.5631, + "step": 27400 + }, + { + "epoch": 2.8176398026315788, + "grad_norm": 0.696275601250214, + "learning_rate": 6.375415876862297e-06, + "loss": 3.5806, + "step": 27410 + }, + { + "epoch": 2.8186677631578947, + "grad_norm": 0.6886396498992271, + "learning_rate": 6.364880187373235e-06, + "loss": 3.6213, + "step": 27420 + }, + { + "epoch": 2.8196957236842106, + "grad_norm": 0.6463905796570253, + "learning_rate": 6.354384256893178e-06, + "loss": 3.4512, + 
"step": 27430 + }, + { + "epoch": 2.8207236842105265, + "grad_norm": 0.7607414039930978, + "learning_rate": 6.3439280969706775e-06, + "loss": 3.663, + "step": 27440 + }, + { + "epoch": 2.821751644736842, + "grad_norm": 2.267857891636179, + "learning_rate": 6.333511719110509e-06, + "loss": 3.5001, + "step": 27450 + }, + { + "epoch": 2.822779605263158, + "grad_norm": 0.8243782286909733, + "learning_rate": 6.32313513477367e-06, + "loss": 3.6154, + "step": 27460 + }, + { + "epoch": 2.8238075657894735, + "grad_norm": 1.1024848377283742, + "learning_rate": 6.312798355377397e-06, + "loss": 3.675, + "step": 27470 + }, + { + "epoch": 2.8248355263157894, + "grad_norm": 0.8220475512485101, + "learning_rate": 6.302501392295115e-06, + "loss": 3.5563, + "step": 27480 + }, + { + "epoch": 2.8258634868421053, + "grad_norm": 0.8692822287429515, + "learning_rate": 6.2922442568564334e-06, + "loss": 3.5267, + "step": 27490 + }, + { + "epoch": 2.8268914473684212, + "grad_norm": 0.8480052668952224, + "learning_rate": 6.2820269603471485e-06, + "loss": 3.5508, + "step": 27500 + }, + { + "epoch": 2.8279194078947367, + "grad_norm": 0.7760996231523505, + "learning_rate": 6.271849514009221e-06, + "loss": 3.5862, + "step": 27510 + }, + { + "epoch": 2.8289473684210527, + "grad_norm": 0.7044604229514665, + "learning_rate": 6.2617119290407635e-06, + "loss": 3.5169, + "step": 27520 + }, + { + "epoch": 2.8299753289473686, + "grad_norm": 0.726797414560291, + "learning_rate": 6.251614216596037e-06, + "loss": 3.5696, + "step": 27530 + }, + { + "epoch": 2.831003289473684, + "grad_norm": 0.7261515039435882, + "learning_rate": 6.241556387785419e-06, + "loss": 3.5908, + "step": 27540 + }, + { + "epoch": 2.83203125, + "grad_norm": 0.6871038943056406, + "learning_rate": 6.23153845367541e-06, + "loss": 3.5693, + "step": 27550 + }, + { + "epoch": 2.833059210526316, + "grad_norm": 0.8936637103166903, + "learning_rate": 6.221560425288615e-06, + "loss": 3.5798, + "step": 27560 + }, + { + "epoch": 2.8340871710526314, + "grad_norm": 0.7043576561824253, + "learning_rate": 6.211622313603731e-06, + "loss": 3.5008, + "step": 27570 + }, + { + "epoch": 2.8351151315789473, + "grad_norm": 0.8297560841290469, + "learning_rate": 6.2017241295555384e-06, + "loss": 3.4775, + "step": 27580 + }, + { + "epoch": 2.8361430921052633, + "grad_norm": 0.6870351216682342, + "learning_rate": 6.191865884034878e-06, + "loss": 3.5479, + "step": 27590 + }, + { + "epoch": 2.8371710526315788, + "grad_norm": 0.7028443306125934, + "learning_rate": 6.182047587888651e-06, + "loss": 3.551, + "step": 27600 + }, + { + "epoch": 2.8381990131578947, + "grad_norm": 0.8103932798160649, + "learning_rate": 6.172269251919799e-06, + "loss": 3.6345, + "step": 27610 + }, + { + "epoch": 2.8392269736842106, + "grad_norm": 0.7151689467200041, + "learning_rate": 6.162530886887308e-06, + "loss": 3.5441, + "step": 27620 + }, + { + "epoch": 2.8402549342105265, + "grad_norm": 1.1507654793688764, + "learning_rate": 6.152832503506169e-06, + "loss": 3.5384, + "step": 27630 + }, + { + "epoch": 2.841282894736842, + "grad_norm": 0.9226479280437271, + "learning_rate": 6.1431741124473994e-06, + "loss": 3.4519, + "step": 27640 + }, + { + "epoch": 2.842310855263158, + "grad_norm": 0.7377514830664552, + "learning_rate": 6.133555724337993e-06, + "loss": 3.5383, + "step": 27650 + }, + { + "epoch": 2.8433388157894735, + "grad_norm": 0.8527027510450884, + "learning_rate": 6.123977349760943e-06, + "loss": 3.625, + "step": 27660 + }, + { + "epoch": 2.8443667763157894, + "grad_norm": 1.1477670074678492, + 
"learning_rate": 6.114438999255207e-06, + "loss": 3.5965, + "step": 27670 + }, + { + "epoch": 2.8453947368421053, + "grad_norm": 1.1426257522959455, + "learning_rate": 6.104940683315715e-06, + "loss": 3.5567, + "step": 27680 + }, + { + "epoch": 2.8464226973684212, + "grad_norm": 0.8013285557627741, + "learning_rate": 6.095482412393346e-06, + "loss": 3.5766, + "step": 27690 + }, + { + "epoch": 2.8474506578947367, + "grad_norm": 0.9927444696937596, + "learning_rate": 6.086064196894904e-06, + "loss": 3.5711, + "step": 27700 + }, + { + "epoch": 2.8484786184210527, + "grad_norm": 0.7581734338495962, + "learning_rate": 6.076686047183139e-06, + "loss": 3.5123, + "step": 27710 + }, + { + "epoch": 2.8495065789473686, + "grad_norm": 0.6477496853970847, + "learning_rate": 6.0673479735767e-06, + "loss": 3.5284, + "step": 27720 + }, + { + "epoch": 2.850534539473684, + "grad_norm": 0.6899665671348209, + "learning_rate": 6.0580499863501585e-06, + "loss": 3.5073, + "step": 27730 + }, + { + "epoch": 2.8515625, + "grad_norm": 0.8425858201483294, + "learning_rate": 6.0487920957339755e-06, + "loss": 3.6298, + "step": 27740 + }, + { + "epoch": 2.852590460526316, + "grad_norm": 0.7958497951129837, + "learning_rate": 6.039574311914476e-06, + "loss": 3.5672, + "step": 27750 + }, + { + "epoch": 2.8536184210526314, + "grad_norm": 1.0538462356661593, + "learning_rate": 6.0303966450338715e-06, + "loss": 3.5824, + "step": 27760 + }, + { + "epoch": 2.8546463815789473, + "grad_norm": 1.2903469737596687, + "learning_rate": 6.021259105190231e-06, + "loss": 3.471, + "step": 27770 + }, + { + "epoch": 2.8556743421052633, + "grad_norm": 1.048011675118067, + "learning_rate": 6.012161702437485e-06, + "loss": 3.519, + "step": 27780 + }, + { + "epoch": 2.8567023026315788, + "grad_norm": 1.598686650790508, + "learning_rate": 6.003104446785376e-06, + "loss": 3.516, + "step": 27790 + }, + { + "epoch": 2.8577302631578947, + "grad_norm": 1.2433763009710075, + "learning_rate": 5.994087348199495e-06, + "loss": 3.5284, + "step": 27800 + }, + { + "epoch": 2.8587582236842106, + "grad_norm": 0.9546800029354133, + "learning_rate": 5.985110416601233e-06, + "loss": 3.5694, + "step": 27810 + }, + { + "epoch": 2.8597861842105265, + "grad_norm": 0.9697507795707871, + "learning_rate": 5.97617366186779e-06, + "loss": 3.5084, + "step": 27820 + }, + { + "epoch": 2.860814144736842, + "grad_norm": 1.0746410302810996, + "learning_rate": 5.96727709383217e-06, + "loss": 3.6208, + "step": 27830 + }, + { + "epoch": 2.861842105263158, + "grad_norm": 1.2386604474568215, + "learning_rate": 5.958420722283151e-06, + "loss": 3.5386, + "step": 27840 + }, + { + "epoch": 2.8628700657894735, + "grad_norm": 0.9952211933983395, + "learning_rate": 5.94960455696529e-06, + "loss": 3.4973, + "step": 27850 + }, + { + "epoch": 2.8638980263157894, + "grad_norm": 0.6689415256365213, + "learning_rate": 5.940828607578894e-06, + "loss": 3.5332, + "step": 27860 + }, + { + "epoch": 2.8649259868421053, + "grad_norm": 0.7516151362205997, + "learning_rate": 5.932092883780021e-06, + "loss": 3.6123, + "step": 27870 + }, + { + "epoch": 2.8659539473684212, + "grad_norm": 0.7511174698526948, + "learning_rate": 5.92339739518049e-06, + "loss": 3.5767, + "step": 27880 + }, + { + "epoch": 2.8669819078947367, + "grad_norm": 1.0551992799471581, + "learning_rate": 5.914742151347824e-06, + "loss": 3.5299, + "step": 27890 + }, + { + "epoch": 2.8680098684210527, + "grad_norm": 1.0622783317459623, + "learning_rate": 5.9061271618052884e-06, + "loss": 3.5173, + "step": 27900 + }, + { + "epoch": 
2.8690378289473686, + "grad_norm": 1.1552205529124198, + "learning_rate": 5.897552436031839e-06, + "loss": 3.5484, + "step": 27910 + }, + { + "epoch": 2.870065789473684, + "grad_norm": 0.7394154228195114, + "learning_rate": 5.889017983462132e-06, + "loss": 3.5555, + "step": 27920 + }, + { + "epoch": 2.87109375, + "grad_norm": 0.6934107586800334, + "learning_rate": 5.880523813486523e-06, + "loss": 3.6446, + "step": 27930 + }, + { + "epoch": 2.872121710526316, + "grad_norm": 0.6077331285210128, + "learning_rate": 5.872069935451036e-06, + "loss": 3.5874, + "step": 27940 + }, + { + "epoch": 2.8731496710526314, + "grad_norm": 0.7801130414014719, + "learning_rate": 5.863656358657367e-06, + "loss": 3.5845, + "step": 27950 + }, + { + "epoch": 2.8741776315789473, + "grad_norm": 0.7259882226335482, + "learning_rate": 5.855283092362867e-06, + "loss": 3.4756, + "step": 27960 + }, + { + "epoch": 2.8752055921052633, + "grad_norm": 0.9789636617936162, + "learning_rate": 5.846950145780526e-06, + "loss": 3.593, + "step": 27970 + }, + { + "epoch": 2.8762335526315788, + "grad_norm": 1.0985197967044225, + "learning_rate": 5.838657528078984e-06, + "loss": 3.6279, + "step": 27980 + }, + { + "epoch": 2.8772615131578947, + "grad_norm": 0.784921525628783, + "learning_rate": 5.830405248382505e-06, + "loss": 3.5727, + "step": 27990 + }, + { + "epoch": 2.8782894736842106, + "grad_norm": 0.7870754111560894, + "learning_rate": 5.8221933157709596e-06, + "loss": 3.5758, + "step": 28000 + }, + { + "epoch": 2.8793174342105265, + "grad_norm": 1.130053138918197, + "learning_rate": 5.814021739279843e-06, + "loss": 3.6379, + "step": 28010 + }, + { + "epoch": 2.880345394736842, + "grad_norm": 1.0075982036011217, + "learning_rate": 5.805890527900216e-06, + "loss": 3.5997, + "step": 28020 + }, + { + "epoch": 2.881373355263158, + "grad_norm": 0.6722092351587206, + "learning_rate": 5.797799690578766e-06, + "loss": 3.5944, + "step": 28030 + }, + { + "epoch": 2.8824013157894735, + "grad_norm": 1.3396660155808637, + "learning_rate": 5.7897492362177325e-06, + "loss": 3.566, + "step": 28040 + }, + { + "epoch": 2.8834292763157894, + "grad_norm": 1.077907982697741, + "learning_rate": 5.781739173674923e-06, + "loss": 3.5464, + "step": 28050 + }, + { + "epoch": 2.8844572368421053, + "grad_norm": 0.9596890969792615, + "learning_rate": 5.773769511763712e-06, + "loss": 3.5136, + "step": 28060 + }, + { + "epoch": 2.8854851973684212, + "grad_norm": 0.9286011832573751, + "learning_rate": 5.765840259253005e-06, + "loss": 3.48, + "step": 28070 + }, + { + "epoch": 2.8865131578947367, + "grad_norm": 0.8239436119748191, + "learning_rate": 5.757951424867267e-06, + "loss": 3.4952, + "step": 28080 + }, + { + "epoch": 2.8875411184210527, + "grad_norm": 0.9019321642423037, + "learning_rate": 5.750103017286481e-06, + "loss": 3.5183, + "step": 28090 + }, + { + "epoch": 2.8885690789473686, + "grad_norm": 1.0345775210068389, + "learning_rate": 5.742295045146147e-06, + "loss": 3.5875, + "step": 28100 + }, + { + "epoch": 2.889597039473684, + "grad_norm": 0.7410184788787213, + "learning_rate": 5.734527517037278e-06, + "loss": 3.4909, + "step": 28110 + }, + { + "epoch": 2.890625, + "grad_norm": 0.6878296321969283, + "learning_rate": 5.726800441506383e-06, + "loss": 3.6259, + "step": 28120 + }, + { + "epoch": 2.891652960526316, + "grad_norm": 0.975579675114019, + "learning_rate": 5.719113827055462e-06, + "loss": 3.4987, + "step": 28130 + }, + { + "epoch": 2.8926809210526314, + "grad_norm": 0.8280627749115301, + "learning_rate": 5.711467682142009e-06, + "loss": 
3.5615, + "step": 28140 + }, + { + "epoch": 2.8937088815789473, + "grad_norm": 0.7153625273469575, + "learning_rate": 5.703862015178968e-06, + "loss": 3.484, + "step": 28150 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 1.443874654301399, + "learning_rate": 5.696296834534762e-06, + "loss": 3.6036, + "step": 28160 + }, + { + "epoch": 2.8957648026315788, + "grad_norm": 0.8277116499657621, + "learning_rate": 5.6887721485332655e-06, + "loss": 3.5289, + "step": 28170 + }, + { + "epoch": 2.8967927631578947, + "grad_norm": 0.9111035373968681, + "learning_rate": 5.681287965453781e-06, + "loss": 3.5651, + "step": 28180 + }, + { + "epoch": 2.8978207236842106, + "grad_norm": 1.083450928443364, + "learning_rate": 5.673844293531076e-06, + "loss": 3.5588, + "step": 28190 + }, + { + "epoch": 2.8988486842105265, + "grad_norm": 1.0211799707763778, + "learning_rate": 5.666441140955315e-06, + "loss": 3.5085, + "step": 28200 + }, + { + "epoch": 2.899876644736842, + "grad_norm": 0.8557893683504838, + "learning_rate": 5.6590785158720965e-06, + "loss": 3.532, + "step": 28210 + }, + { + "epoch": 2.900904605263158, + "grad_norm": 0.8756093560073152, + "learning_rate": 5.6517564263824285e-06, + "loss": 3.5016, + "step": 28220 + }, + { + "epoch": 2.9019325657894735, + "grad_norm": 0.7286228786194829, + "learning_rate": 5.644474880542695e-06, + "loss": 3.6416, + "step": 28230 + }, + { + "epoch": 2.9029605263157894, + "grad_norm": 0.8488198305822819, + "learning_rate": 5.637233886364707e-06, + "loss": 3.5696, + "step": 28240 + }, + { + "epoch": 2.9039884868421053, + "grad_norm": 0.9638777000835654, + "learning_rate": 5.630033451815624e-06, + "loss": 3.5615, + "step": 28250 + }, + { + "epoch": 2.9050164473684212, + "grad_norm": 0.9773044666717108, + "learning_rate": 5.622873584817996e-06, + "loss": 3.4897, + "step": 28260 + }, + { + "epoch": 2.9060444078947367, + "grad_norm": 1.161791641011164, + "learning_rate": 5.615754293249739e-06, + "loss": 3.5241, + "step": 28270 + }, + { + "epoch": 2.9070723684210527, + "grad_norm": 0.8076456749906958, + "learning_rate": 5.608675584944108e-06, + "loss": 3.5393, + "step": 28280 + }, + { + "epoch": 2.9081003289473686, + "grad_norm": 0.7757767645484354, + "learning_rate": 5.6016374676897215e-06, + "loss": 3.5264, + "step": 28290 + }, + { + "epoch": 2.909128289473684, + "grad_norm": 0.9940944878265251, + "learning_rate": 5.594639949230527e-06, + "loss": 3.5695, + "step": 28300 + }, + { + "epoch": 2.91015625, + "grad_norm": 1.1034457294305244, + "learning_rate": 5.587683037265808e-06, + "loss": 3.5144, + "step": 28310 + }, + { + "epoch": 2.911184210526316, + "grad_norm": 0.6018007227856221, + "learning_rate": 5.580766739450159e-06, + "loss": 3.5489, + "step": 28320 + }, + { + "epoch": 2.9122121710526314, + "grad_norm": 0.9749112626800466, + "learning_rate": 5.573891063393504e-06, + "loss": 3.5774, + "step": 28330 + }, + { + "epoch": 2.9132401315789473, + "grad_norm": 1.1974064555892372, + "learning_rate": 5.567056016661048e-06, + "loss": 3.5772, + "step": 28340 + }, + { + "epoch": 2.9142680921052633, + "grad_norm": 0.7211893813841539, + "learning_rate": 5.560261606773316e-06, + "loss": 3.5787, + "step": 28350 + }, + { + "epoch": 2.9152960526315788, + "grad_norm": 0.9663679308067142, + "learning_rate": 5.553507841206105e-06, + "loss": 3.4303, + "step": 28360 + }, + { + "epoch": 2.9163240131578947, + "grad_norm": 0.6406191383651899, + "learning_rate": 5.5467947273904975e-06, + "loss": 3.5583, + "step": 28370 + }, + { + "epoch": 2.9173519736842106, + "grad_norm": 
0.8050287504570877, + "learning_rate": 5.540122272712846e-06, + "loss": 3.4934, + "step": 28380 + }, + { + "epoch": 2.9183799342105265, + "grad_norm": 0.5887592779833699, + "learning_rate": 5.533490484514769e-06, + "loss": 3.5887, + "step": 28390 + }, + { + "epoch": 2.919407894736842, + "grad_norm": 0.6228450579825456, + "learning_rate": 5.526899370093136e-06, + "loss": 3.5226, + "step": 28400 + }, + { + "epoch": 2.920435855263158, + "grad_norm": 0.5597149194404551, + "learning_rate": 5.5203489367000685e-06, + "loss": 3.5415, + "step": 28410 + }, + { + "epoch": 2.9214638157894735, + "grad_norm": 0.8669867755800902, + "learning_rate": 5.51383919154292e-06, + "loss": 3.4791, + "step": 28420 + }, + { + "epoch": 2.9224917763157894, + "grad_norm": 0.7771987396396896, + "learning_rate": 5.507370141784285e-06, + "loss": 3.5112, + "step": 28430 + }, + { + "epoch": 2.9235197368421053, + "grad_norm": 1.0590023927130248, + "learning_rate": 5.50094179454197e-06, + "loss": 3.5189, + "step": 28440 + }, + { + "epoch": 2.9245476973684212, + "grad_norm": 0.8310881449938377, + "learning_rate": 5.494554156889007e-06, + "loss": 3.5672, + "step": 28450 + }, + { + "epoch": 2.9255756578947367, + "grad_norm": 0.6336962333609395, + "learning_rate": 5.488207235853632e-06, + "loss": 3.4933, + "step": 28460 + }, + { + "epoch": 2.9266036184210527, + "grad_norm": 0.8319888395147789, + "learning_rate": 5.481901038419276e-06, + "loss": 3.5277, + "step": 28470 + }, + { + "epoch": 2.9276315789473686, + "grad_norm": 0.7093969988667771, + "learning_rate": 5.475635571524574e-06, + "loss": 3.5746, + "step": 28480 + }, + { + "epoch": 2.928659539473684, + "grad_norm": 0.7165039045862367, + "learning_rate": 5.469410842063336e-06, + "loss": 3.5624, + "step": 28490 + }, + { + "epoch": 2.9296875, + "grad_norm": 0.7940774692654218, + "learning_rate": 5.463226856884551e-06, + "loss": 3.5976, + "step": 28500 + }, + { + "epoch": 2.930715460526316, + "grad_norm": 0.9612232496708843, + "learning_rate": 5.457083622792378e-06, + "loss": 3.4293, + "step": 28510 + }, + { + "epoch": 2.9317434210526314, + "grad_norm": 0.7188542331912398, + "learning_rate": 5.450981146546142e-06, + "loss": 3.6221, + "step": 28520 + }, + { + "epoch": 2.9327713815789473, + "grad_norm": 0.8514164090613696, + "learning_rate": 5.444919434860316e-06, + "loss": 3.6159, + "step": 28530 + }, + { + "epoch": 2.9337993421052633, + "grad_norm": 0.9711864350480802, + "learning_rate": 5.438898494404529e-06, + "loss": 3.6048, + "step": 28540 + }, + { + "epoch": 2.9348273026315788, + "grad_norm": 0.8678402205515612, + "learning_rate": 5.4329183318035384e-06, + "loss": 3.4982, + "step": 28550 + }, + { + "epoch": 2.9358552631578947, + "grad_norm": 0.6757311058183224, + "learning_rate": 5.426978953637238e-06, + "loss": 3.5498, + "step": 28560 + }, + { + "epoch": 2.9368832236842106, + "grad_norm": 0.7499289081576311, + "learning_rate": 5.4210803664406585e-06, + "loss": 3.5796, + "step": 28570 + }, + { + "epoch": 2.9379111842105265, + "grad_norm": 0.8349685043973643, + "learning_rate": 5.415222576703934e-06, + "loss": 3.5371, + "step": 28580 + }, + { + "epoch": 2.938939144736842, + "grad_norm": 0.6768744342058376, + "learning_rate": 5.40940559087232e-06, + "loss": 3.5087, + "step": 28590 + }, + { + "epoch": 2.939967105263158, + "grad_norm": 0.8940707358147963, + "learning_rate": 5.4036294153461645e-06, + "loss": 3.5017, + "step": 28600 + }, + { + "epoch": 2.9409950657894735, + "grad_norm": 0.7935510386017627, + "learning_rate": 5.397894056480928e-06, + "loss": 3.5422, + "step": 28610 
+ }, + { + "epoch": 2.9420230263157894, + "grad_norm": 0.8892592616313452, + "learning_rate": 5.3921995205871435e-06, + "loss": 3.5578, + "step": 28620 + }, + { + "epoch": 2.9430509868421053, + "grad_norm": 1.1429080845344537, + "learning_rate": 5.386545813930439e-06, + "loss": 3.6121, + "step": 28630 + }, + { + "epoch": 2.9440789473684212, + "grad_norm": 1.0526517648079, + "learning_rate": 5.380932942731518e-06, + "loss": 3.5424, + "step": 28640 + }, + { + "epoch": 2.9451069078947367, + "grad_norm": 0.9799770626964974, + "learning_rate": 5.37536091316615e-06, + "loss": 3.5062, + "step": 28650 + }, + { + "epoch": 2.9461348684210527, + "grad_norm": 0.9369343640718572, + "learning_rate": 5.369829731365164e-06, + "loss": 3.4416, + "step": 28660 + }, + { + "epoch": 2.9471628289473686, + "grad_norm": 0.8316165445764576, + "learning_rate": 5.364339403414451e-06, + "loss": 3.5423, + "step": 28670 + }, + { + "epoch": 2.948190789473684, + "grad_norm": 0.7425335127720291, + "learning_rate": 5.358889935354952e-06, + "loss": 3.515, + "step": 28680 + }, + { + "epoch": 2.94921875, + "grad_norm": 0.6326473161778391, + "learning_rate": 5.3534813331826385e-06, + "loss": 3.4839, + "step": 28690 + }, + { + "epoch": 2.950246710526316, + "grad_norm": 0.741271367078701, + "learning_rate": 5.348113602848534e-06, + "loss": 3.5703, + "step": 28700 + }, + { + "epoch": 2.9512746710526314, + "grad_norm": 0.7810903386047064, + "learning_rate": 5.342786750258679e-06, + "loss": 3.6117, + "step": 28710 + }, + { + "epoch": 2.9523026315789473, + "grad_norm": 0.680877695805574, + "learning_rate": 5.337500781274147e-06, + "loss": 3.5998, + "step": 28720 + }, + { + "epoch": 2.9533305921052633, + "grad_norm": 1.3218785766120302, + "learning_rate": 5.332255701711011e-06, + "loss": 3.5356, + "step": 28730 + }, + { + "epoch": 2.9543585526315788, + "grad_norm": 0.9721427389230713, + "learning_rate": 5.3270515173403755e-06, + "loss": 3.4603, + "step": 28740 + }, + { + "epoch": 2.9553865131578947, + "grad_norm": 0.7416994196556879, + "learning_rate": 5.321888233888329e-06, + "loss": 3.5773, + "step": 28750 + }, + { + "epoch": 2.9564144736842106, + "grad_norm": 0.8058258449071062, + "learning_rate": 5.316765857035974e-06, + "loss": 3.5503, + "step": 28760 + }, + { + "epoch": 2.9574424342105265, + "grad_norm": 0.8316693660702469, + "learning_rate": 5.311684392419393e-06, + "loss": 3.5639, + "step": 28770 + }, + { + "epoch": 2.958470394736842, + "grad_norm": 0.759803908298017, + "learning_rate": 5.306643845629655e-06, + "loss": 3.5995, + "step": 28780 + }, + { + "epoch": 2.959498355263158, + "grad_norm": 1.4330931576877515, + "learning_rate": 5.301644222212812e-06, + "loss": 3.6133, + "step": 28790 + }, + { + "epoch": 2.9605263157894735, + "grad_norm": 0.7296676731070729, + "learning_rate": 5.296685527669876e-06, + "loss": 3.4559, + "step": 28800 + }, + { + "epoch": 2.9615542763157894, + "grad_norm": 0.7262906622213783, + "learning_rate": 5.291767767456853e-06, + "loss": 3.5547, + "step": 28810 + }, + { + "epoch": 2.9625822368421053, + "grad_norm": 0.703857167826437, + "learning_rate": 5.286890946984673e-06, + "loss": 3.5543, + "step": 28820 + }, + { + "epoch": 2.9636101973684212, + "grad_norm": 0.7287805397740467, + "learning_rate": 5.282055071619252e-06, + "loss": 3.531, + "step": 28830 + }, + { + "epoch": 2.9646381578947367, + "grad_norm": 1.1599300690651397, + "learning_rate": 5.277260146681434e-06, + "loss": 3.5134, + "step": 28840 + }, + { + "epoch": 2.9656661184210527, + "grad_norm": 0.9116124304434866, + "learning_rate": 
5.272506177447016e-06, + "loss": 3.5598, + "step": 28850 + }, + { + "epoch": 2.9666940789473686, + "grad_norm": 0.7674881977080923, + "learning_rate": 5.26779316914673e-06, + "loss": 3.5222, + "step": 28860 + }, + { + "epoch": 2.967722039473684, + "grad_norm": 0.6838250341163599, + "learning_rate": 5.2631211269662395e-06, + "loss": 3.6129, + "step": 28870 + }, + { + "epoch": 2.96875, + "grad_norm": 0.8765650104887944, + "learning_rate": 5.25849005604613e-06, + "loss": 3.5991, + "step": 28880 + }, + { + "epoch": 2.969777960526316, + "grad_norm": 0.8607436984892676, + "learning_rate": 5.2538999614819134e-06, + "loss": 3.5675, + "step": 28890 + }, + { + "epoch": 2.9708059210526314, + "grad_norm": 0.7123844387635999, + "learning_rate": 5.249350848324005e-06, + "loss": 3.5453, + "step": 28900 + }, + { + "epoch": 2.9718338815789473, + "grad_norm": 0.9307961465265648, + "learning_rate": 5.244842721577742e-06, + "loss": 3.5437, + "step": 28910 + }, + { + "epoch": 2.9728618421052633, + "grad_norm": 0.8894480239795606, + "learning_rate": 5.240375586203358e-06, + "loss": 3.5356, + "step": 28920 + }, + { + "epoch": 2.9738898026315788, + "grad_norm": 0.8839088316635778, + "learning_rate": 5.235949447115981e-06, + "loss": 3.6433, + "step": 28930 + }, + { + "epoch": 2.9749177631578947, + "grad_norm": 0.6562093313422201, + "learning_rate": 5.2315643091856375e-06, + "loss": 3.5454, + "step": 28940 + }, + { + "epoch": 2.9759457236842106, + "grad_norm": 1.0431257836925747, + "learning_rate": 5.22722017723724e-06, + "loss": 3.5261, + "step": 28950 + }, + { + "epoch": 2.9769736842105265, + "grad_norm": 1.0468391867817486, + "learning_rate": 5.222917056050575e-06, + "loss": 3.6013, + "step": 28960 + }, + { + "epoch": 2.978001644736842, + "grad_norm": 0.9228854380046642, + "learning_rate": 5.218654950360321e-06, + "loss": 3.6, + "step": 28970 + }, + { + "epoch": 2.979029605263158, + "grad_norm": 0.7809950603324011, + "learning_rate": 5.214433864856012e-06, + "loss": 3.5724, + "step": 28980 + }, + { + "epoch": 2.9800575657894735, + "grad_norm": 0.6372487255501228, + "learning_rate": 5.2102538041820575e-06, + "loss": 3.4459, + "step": 28990 + }, + { + "epoch": 2.9810855263157894, + "grad_norm": 1.2416313413847668, + "learning_rate": 5.206114772937717e-06, + "loss": 3.5677, + "step": 29000 + }, + { + "epoch": 2.9821134868421053, + "grad_norm": 0.8009653763639919, + "learning_rate": 5.202016775677122e-06, + "loss": 3.5366, + "step": 29010 + }, + { + "epoch": 2.9831414473684212, + "grad_norm": 1.2581903115312596, + "learning_rate": 5.197959816909244e-06, + "loss": 3.5104, + "step": 29020 + }, + { + "epoch": 2.9841694078947367, + "grad_norm": 0.8512883576252932, + "learning_rate": 5.193943901097903e-06, + "loss": 3.5701, + "step": 29030 + }, + { + "epoch": 2.9851973684210527, + "grad_norm": 0.8993024228670273, + "learning_rate": 5.189969032661757e-06, + "loss": 3.6236, + "step": 29040 + }, + { + "epoch": 2.9862253289473686, + "grad_norm": 1.1403886210447567, + "learning_rate": 5.186035215974307e-06, + "loss": 3.515, + "step": 29050 + }, + { + "epoch": 2.987253289473684, + "grad_norm": 0.9000535114592592, + "learning_rate": 5.182142455363875e-06, + "loss": 3.576, + "step": 29060 + }, + { + "epoch": 2.98828125, + "grad_norm": 0.9337563472874219, + "learning_rate": 5.178290755113624e-06, + "loss": 3.5018, + "step": 29070 + }, + { + "epoch": 2.989309210526316, + "grad_norm": 0.6244443807916771, + "learning_rate": 5.17448011946152e-06, + "loss": 3.5721, + "step": 29080 + }, + { + "epoch": 2.9903371710526314, + "grad_norm": 
0.8484085541735004, + "learning_rate": 5.170710552600369e-06, + "loss": 3.5, + "step": 29090 + }, + { + "epoch": 2.9913651315789473, + "grad_norm": 0.62180462827233, + "learning_rate": 5.1669820586777676e-06, + "loss": 3.4815, + "step": 29100 + }, + { + "epoch": 2.9923930921052633, + "grad_norm": 0.7007609999164525, + "learning_rate": 5.16329464179613e-06, + "loss": 3.4527, + "step": 29110 + }, + { + "epoch": 2.9934210526315788, + "grad_norm": 1.157623609200332, + "learning_rate": 5.159648306012679e-06, + "loss": 3.6113, + "step": 29120 + }, + { + "epoch": 2.9944490131578947, + "grad_norm": 1.7257546308406417, + "learning_rate": 5.156043055339429e-06, + "loss": 3.5165, + "step": 29130 + }, + { + "epoch": 2.9954769736842106, + "grad_norm": 0.8612286084180214, + "learning_rate": 5.1524788937431916e-06, + "loss": 3.6459, + "step": 29140 + }, + { + "epoch": 2.9965049342105265, + "grad_norm": 0.85834125419468, + "learning_rate": 5.1489558251455655e-06, + "loss": 3.5442, + "step": 29150 + }, + { + "epoch": 2.997532894736842, + "grad_norm": 0.7756512489764696, + "learning_rate": 5.145473853422935e-06, + "loss": 3.5549, + "step": 29160 + }, + { + "epoch": 2.998560855263158, + "grad_norm": 0.8245626880001032, + "learning_rate": 5.14203298240648e-06, + "loss": 3.4989, + "step": 29170 + }, + { + "epoch": 2.9995888157894735, + "grad_norm": 1.3150622471338267, + "learning_rate": 5.138633215882138e-06, + "loss": 3.5779, + "step": 29180 + }, + { + "epoch": 3.0006167763157894, + "grad_norm": 1.1431582811953376, + "learning_rate": 5.1352745575906315e-06, + "loss": 3.5118, + "step": 29190 + }, + { + "epoch": 3.0016447368421053, + "grad_norm": 0.7755323996278086, + "learning_rate": 5.1319570112274455e-06, + "loss": 3.6045, + "step": 29200 + }, + { + "epoch": 3.0026726973684212, + "grad_norm": 0.7777045864529655, + "learning_rate": 5.128680580442837e-06, + "loss": 3.5433, + "step": 29210 + }, + { + "epoch": 3.0037006578947367, + "grad_norm": 0.683424618097694, + "learning_rate": 5.125445268841822e-06, + "loss": 3.6041, + "step": 29220 + }, + { + "epoch": 3.0047286184210527, + "grad_norm": 0.6709164057606231, + "learning_rate": 5.122251079984175e-06, + "loss": 3.454, + "step": 29230 + }, + { + "epoch": 3.0057565789473686, + "grad_norm": 0.9518522300676858, + "learning_rate": 5.11909801738441e-06, + "loss": 3.5532, + "step": 29240 + }, + { + "epoch": 3.006784539473684, + "grad_norm": 0.8429062474565856, + "learning_rate": 5.115986084511812e-06, + "loss": 3.5457, + "step": 29250 + }, + { + "epoch": 3.0078125, + "grad_norm": 0.8776890782482851, + "learning_rate": 5.112915284790394e-06, + "loss": 3.5373, + "step": 29260 + }, + { + "epoch": 3.008840460526316, + "grad_norm": 0.757884149998554, + "learning_rate": 5.109885621598921e-06, + "loss": 3.4925, + "step": 29270 + }, + { + "epoch": 3.0098684210526314, + "grad_norm": 0.8034099696697674, + "learning_rate": 5.1068970982708934e-06, + "loss": 3.5271, + "step": 29280 + }, + { + "epoch": 3.0108963815789473, + "grad_norm": 0.9051047026938887, + "learning_rate": 5.103949718094543e-06, + "loss": 3.4923, + "step": 29290 + }, + { + "epoch": 3.0119243421052633, + "grad_norm": 0.7046482328990556, + "learning_rate": 5.101043484312838e-06, + "loss": 3.5555, + "step": 29300 + }, + { + "epoch": 3.0129523026315788, + "grad_norm": 0.9449193783333238, + "learning_rate": 5.098178400123463e-06, + "loss": 3.5885, + "step": 29310 + }, + { + "epoch": 3.0139802631578947, + "grad_norm": 0.7308202703505403, + "learning_rate": 5.095354468678842e-06, + "loss": 3.5148, + "step": 29320 + }, + 
{ + "epoch": 3.0150082236842106, + "grad_norm": 1.0872048865471637, + "learning_rate": 5.092571693086103e-06, + "loss": 3.524, + "step": 29330 + }, + { + "epoch": 3.016036184210526, + "grad_norm": 0.9962592492338327, + "learning_rate": 5.089830076407103e-06, + "loss": 3.4798, + "step": 29340 + }, + { + "epoch": 3.017064144736842, + "grad_norm": 1.2240612987739332, + "learning_rate": 5.087129621658407e-06, + "loss": 3.5295, + "step": 29350 + }, + { + "epoch": 3.018092105263158, + "grad_norm": 0.7963737332368858, + "learning_rate": 5.084470331811287e-06, + "loss": 3.5103, + "step": 29360 + }, + { + "epoch": 3.019120065789474, + "grad_norm": 0.6023422735965511, + "learning_rate": 5.081852209791729e-06, + "loss": 3.4741, + "step": 29370 + }, + { + "epoch": 3.0201480263157894, + "grad_norm": 1.1566377968900117, + "learning_rate": 5.079275258480423e-06, + "loss": 3.5122, + "step": 29380 + }, + { + "epoch": 3.0211759868421053, + "grad_norm": 0.9326649918212832, + "learning_rate": 5.07673948071275e-06, + "loss": 3.5412, + "step": 29390 + }, + { + "epoch": 3.0222039473684212, + "grad_norm": 0.6892479221048398, + "learning_rate": 5.0742448792787944e-06, + "loss": 3.5207, + "step": 29400 + }, + { + "epoch": 3.0232319078947367, + "grad_norm": 0.7382195260216983, + "learning_rate": 5.071791456923332e-06, + "loss": 3.5317, + "step": 29410 + }, + { + "epoch": 3.0242598684210527, + "grad_norm": 0.917470009233055, + "learning_rate": 5.069379216345835e-06, + "loss": 3.5481, + "step": 29420 + }, + { + "epoch": 3.0252878289473686, + "grad_norm": 0.6844765984009012, + "learning_rate": 5.067008160200464e-06, + "loss": 3.5285, + "step": 29430 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 0.9811130245846984, + "learning_rate": 5.064678291096055e-06, + "loss": 3.5655, + "step": 29440 + }, + { + "epoch": 3.02734375, + "grad_norm": 0.8245956137849303, + "learning_rate": 5.062389611596138e-06, + "loss": 3.4884, + "step": 29450 + }, + { + "epoch": 3.028371710526316, + "grad_norm": 0.8989728908015796, + "learning_rate": 5.060142124218912e-06, + "loss": 3.5361, + "step": 29460 + }, + { + "epoch": 3.0293996710526314, + "grad_norm": 0.774819459190173, + "learning_rate": 5.057935831437268e-06, + "loss": 3.614, + "step": 29470 + }, + { + "epoch": 3.0304276315789473, + "grad_norm": 0.7520524789146815, + "learning_rate": 5.055770735678752e-06, + "loss": 3.5027, + "step": 29480 + }, + { + "epoch": 3.0314555921052633, + "grad_norm": 0.901567794281804, + "learning_rate": 5.0536468393256e-06, + "loss": 3.4912, + "step": 29490 + }, + { + "epoch": 3.0324835526315788, + "grad_norm": 1.3383241716707086, + "learning_rate": 5.051564144714699e-06, + "loss": 3.5823, + "step": 29500 + }, + { + "epoch": 3.0335115131578947, + "grad_norm": 1.0063685637573738, + "learning_rate": 5.049522654137615e-06, + "loss": 3.5395, + "step": 29510 + }, + { + "epoch": 3.0345394736842106, + "grad_norm": 1.3060943419510702, + "learning_rate": 5.047522369840575e-06, + "loss": 3.4773, + "step": 29520 + }, + { + "epoch": 3.035567434210526, + "grad_norm": 0.986023446038374, + "learning_rate": 5.045563294024466e-06, + "loss": 3.5036, + "step": 29530 + }, + { + "epoch": 3.036595394736842, + "grad_norm": 0.9762374336908989, + "learning_rate": 5.043645428844835e-06, + "loss": 3.5926, + "step": 29540 + }, + { + "epoch": 3.037623355263158, + "grad_norm": 1.1245377968643553, + "learning_rate": 5.0417687764118825e-06, + "loss": 3.5129, + "step": 29550 + }, + { + "epoch": 3.038651315789474, + "grad_norm": 1.1216091340393706, + "learning_rate": 
5.039933338790465e-06, + "loss": 3.5324, + "step": 29560 + }, + { + "epoch": 3.0396792763157894, + "grad_norm": 0.9162230104371881, + "learning_rate": 5.0381391180000905e-06, + "loss": 3.6156, + "step": 29570 + }, + { + "epoch": 3.0407072368421053, + "grad_norm": 1.4440076556808543, + "learning_rate": 5.036386116014919e-06, + "loss": 3.508, + "step": 29580 + }, + { + "epoch": 3.0417351973684212, + "grad_norm": 0.6951485987160956, + "learning_rate": 5.0346743347637515e-06, + "loss": 3.5805, + "step": 29590 + }, + { + "epoch": 3.0427631578947367, + "grad_norm": 0.8146880471974339, + "learning_rate": 5.033003776130047e-06, + "loss": 3.4788, + "step": 29600 + }, + { + "epoch": 3.0437911184210527, + "grad_norm": 0.7218409294906638, + "learning_rate": 5.031374441951894e-06, + "loss": 3.5153, + "step": 29610 + }, + { + "epoch": 3.0448190789473686, + "grad_norm": 0.8844059766864216, + "learning_rate": 5.029786334022029e-06, + "loss": 3.5121, + "step": 29620 + }, + { + "epoch": 3.045847039473684, + "grad_norm": 1.2827672581007048, + "learning_rate": 5.02823945408783e-06, + "loss": 3.5078, + "step": 29630 + }, + { + "epoch": 3.046875, + "grad_norm": 0.9300031366603008, + "learning_rate": 5.026733803851303e-06, + "loss": 3.4709, + "step": 29640 + }, + { + "epoch": 3.047902960526316, + "grad_norm": 0.7682482097442255, + "learning_rate": 5.0252693849691004e-06, + "loss": 3.4749, + "step": 29650 + }, + { + "epoch": 3.0489309210526314, + "grad_norm": 0.9848572594189506, + "learning_rate": 5.023846199052501e-06, + "loss": 3.5238, + "step": 29660 + }, + { + "epoch": 3.0499588815789473, + "grad_norm": 0.6569157357111494, + "learning_rate": 5.02246424766742e-06, + "loss": 3.562, + "step": 29670 + }, + { + "epoch": 3.0509868421052633, + "grad_norm": 0.8811400446371122, + "learning_rate": 5.0211235323344e-06, + "loss": 3.5105, + "step": 29680 + }, + { + "epoch": 3.0520148026315788, + "grad_norm": 0.7871096779039429, + "learning_rate": 5.01982405452861e-06, + "loss": 3.5437, + "step": 29690 + }, + { + "epoch": 3.0530427631578947, + "grad_norm": 0.974210699640095, + "learning_rate": 5.0185658156798565e-06, + "loss": 3.5878, + "step": 29700 + }, + { + "epoch": 3.0540707236842106, + "grad_norm": 2.077674759584909, + "learning_rate": 5.017348817172558e-06, + "loss": 3.5454, + "step": 29710 + }, + { + "epoch": 3.055098684210526, + "grad_norm": 0.8944767098270727, + "learning_rate": 5.016173060345759e-06, + "loss": 3.5147, + "step": 29720 + }, + { + "epoch": 3.056126644736842, + "grad_norm": 0.6922880680917876, + "learning_rate": 5.0150385464931385e-06, + "loss": 3.5867, + "step": 29730 + }, + { + "epoch": 3.057154605263158, + "grad_norm": 0.5796182781337302, + "learning_rate": 5.013945276862979e-06, + "loss": 3.4815, + "step": 29740 + }, + { + "epoch": 3.058182565789474, + "grad_norm": 0.5671810507079289, + "learning_rate": 5.012893252658195e-06, + "loss": 3.533, + "step": 29750 + }, + { + "epoch": 3.0592105263157894, + "grad_norm": 1.3046595861681063, + "learning_rate": 5.0118824750363185e-06, + "loss": 3.5449, + "step": 29760 + }, + { + "epoch": 3.0602384868421053, + "grad_norm": 1.8840459685012416, + "learning_rate": 5.010912945109489e-06, + "loss": 3.5584, + "step": 29770 + }, + { + "epoch": 3.0612664473684212, + "grad_norm": 1.5837980976945465, + "learning_rate": 5.009984663944469e-06, + "loss": 3.5019, + "step": 29780 + }, + { + "epoch": 3.0622944078947367, + "grad_norm": 1.1580527402565473, + "learning_rate": 5.00909763256264e-06, + "loss": 3.4753, + "step": 29790 + }, + { + "epoch": 3.0633223684210527, + 
"grad_norm": 0.8498429251440821, + "learning_rate": 5.0082518519399846e-06, + "loss": 3.5335, + "step": 29800 + }, + { + "epoch": 3.0643503289473686, + "grad_norm": 0.8949125718191421, + "learning_rate": 5.007447323007106e-06, + "loss": 3.5012, + "step": 29810 + }, + { + "epoch": 3.065378289473684, + "grad_norm": 0.6825594898119427, + "learning_rate": 5.006684046649222e-06, + "loss": 3.521, + "step": 29820 + }, + { + "epoch": 3.06640625, + "grad_norm": 1.197106554625162, + "learning_rate": 5.005962023706147e-06, + "loss": 3.5521, + "step": 29830 + }, + { + "epoch": 3.067434210526316, + "grad_norm": 0.9099864883258141, + "learning_rate": 5.00528125497232e-06, + "loss": 3.5412, + "step": 29840 + }, + { + "epoch": 3.0684621710526314, + "grad_norm": 0.6888110491355995, + "learning_rate": 5.004641741196781e-06, + "loss": 3.4367, + "step": 29850 + }, + { + "epoch": 3.0694901315789473, + "grad_norm": 1.02715407096332, + "learning_rate": 5.004043483083182e-06, + "loss": 3.5448, + "step": 29860 + }, + { + "epoch": 3.0705180921052633, + "grad_norm": 1.25608322972413, + "learning_rate": 5.0034864812897674e-06, + "loss": 3.5252, + "step": 29870 + }, + { + "epoch": 3.0715460526315788, + "grad_norm": 0.6583026391933895, + "learning_rate": 5.002970736429411e-06, + "loss": 3.5689, + "step": 29880 + }, + { + "epoch": 3.0725740131578947, + "grad_norm": 0.7190979239878251, + "learning_rate": 5.002496249069575e-06, + "loss": 3.5042, + "step": 29890 + }, + { + "epoch": 3.0736019736842106, + "grad_norm": 0.9504523718578415, + "learning_rate": 5.002063019732334e-06, + "loss": 3.5089, + "step": 29900 + }, + { + "epoch": 3.074629934210526, + "grad_norm": 1.1372787302295528, + "learning_rate": 5.001671048894363e-06, + "loss": 3.5151, + "step": 29910 + }, + { + "epoch": 3.075657894736842, + "grad_norm": 0.6177731162114715, + "learning_rate": 5.001320336986943e-06, + "loss": 3.5266, + "step": 29920 + }, + { + "epoch": 3.076685855263158, + "grad_norm": 0.6527641449399775, + "learning_rate": 5.001010884395958e-06, + "loss": 3.5666, + "step": 29930 + }, + { + "epoch": 3.077713815789474, + "grad_norm": 0.8745212948443628, + "learning_rate": 5.000742691461893e-06, + "loss": 3.5175, + "step": 29940 + }, + { + "epoch": 3.0787417763157894, + "grad_norm": 0.7758875870984246, + "learning_rate": 5.000515758479841e-06, + "loss": 3.5478, + "step": 29950 + }, + { + "epoch": 3.0797697368421053, + "grad_norm": 1.2712873314971556, + "learning_rate": 5.000330085699491e-06, + "loss": 3.4866, + "step": 29960 + }, + { + "epoch": 3.0807976973684212, + "grad_norm": 0.7736274023716765, + "learning_rate": 5.000185673325136e-06, + "loss": 3.5136, + "step": 29970 + }, + { + "epoch": 3.0818256578947367, + "grad_norm": 0.7485533113013592, + "learning_rate": 5.000082521515671e-06, + "loss": 3.5676, + "step": 29980 + }, + { + "epoch": 3.0828536184210527, + "grad_norm": 0.9016867223143852, + "learning_rate": 5.000020630384592e-06, + "loss": 3.5265, + "step": 29990 + }, + { + "epoch": 3.0838815789473686, + "grad_norm": 0.8196157493703411, + "learning_rate": 5e-06, + "loss": 3.6594, + "step": 30000 + }, + { + "epoch": 3.084909539473684, + "grad_norm": 0.8034458395288847, + "learning_rate": 3.09342961738568e-05, + "loss": 3.5721, + "step": 30010 + }, + { + "epoch": 3.0859375, + "grad_norm": 0.9826617393637546, + "learning_rate": 3.091186271093947e-05, + "loss": 3.5685, + "step": 30020 + }, + { + "epoch": 3.086965460526316, + "grad_norm": 0.8451960851922566, + "learning_rate": 3.088943383199861e-05, + "loss": 3.5255, + "step": 30030 + }, + { + "epoch": 
3.0879934210526314, + "grad_norm": 0.7794467332586881, + "learning_rate": 3.086700954590647e-05, + "loss": 3.5669, + "step": 30040 + }, + { + "epoch": 3.0890213815789473, + "grad_norm": 1.291975763725448, + "learning_rate": 3.084458986153358e-05, + "loss": 3.5376, + "step": 30050 + }, + { + "epoch": 3.0900493421052633, + "grad_norm": 0.9478720071074523, + "learning_rate": 3.08221747877486e-05, + "loss": 3.5486, + "step": 30060 + }, + { + "epoch": 3.0910773026315788, + "grad_norm": 0.9890846635646319, + "learning_rate": 3.079976433341836e-05, + "loss": 3.5416, + "step": 30070 + }, + { + "epoch": 3.0921052631578947, + "grad_norm": 1.1840154318415999, + "learning_rate": 3.077735850740787e-05, + "loss": 3.5308, + "step": 30080 + }, + { + "epoch": 3.0931332236842106, + "grad_norm": 0.8957109293741169, + "learning_rate": 3.0754957318580315e-05, + "loss": 3.5301, + "step": 30090 + }, + { + "epoch": 3.094161184210526, + "grad_norm": 0.7483930459090388, + "learning_rate": 3.0732560775797065e-05, + "loss": 3.4756, + "step": 30100 + }, + { + "epoch": 3.095189144736842, + "grad_norm": 0.7653851814695549, + "learning_rate": 3.071016888791759e-05, + "loss": 3.5051, + "step": 30110 + }, + { + "epoch": 3.096217105263158, + "grad_norm": 0.7553421735004944, + "learning_rate": 3.068778166379959e-05, + "loss": 3.5365, + "step": 30120 + }, + { + "epoch": 3.097245065789474, + "grad_norm": 0.6240592313479351, + "learning_rate": 3.0665399112298886e-05, + "loss": 3.5169, + "step": 30130 + }, + { + "epoch": 3.0982730263157894, + "grad_norm": 0.9901831197922878, + "learning_rate": 3.064302124226944e-05, + "loss": 3.4746, + "step": 30140 + }, + { + "epoch": 3.0993009868421053, + "grad_norm": 0.7886663896767632, + "learning_rate": 3.062064806256341e-05, + "loss": 3.4851, + "step": 30150 + }, + { + "epoch": 3.1003289473684212, + "grad_norm": 0.7062891231942129, + "learning_rate": 3.059827958203101e-05, + "loss": 3.5898, + "step": 30160 + }, + { + "epoch": 3.1013569078947367, + "grad_norm": 0.9598520242370732, + "learning_rate": 3.057591580952069e-05, + "loss": 3.528, + "step": 30170 + }, + { + "epoch": 3.1023848684210527, + "grad_norm": 0.790056980528713, + "learning_rate": 3.0553556753879e-05, + "loss": 3.5413, + "step": 30180 + }, + { + "epoch": 3.1034128289473686, + "grad_norm": 0.6863028333299467, + "learning_rate": 3.0531202423950604e-05, + "loss": 3.5428, + "step": 30190 + }, + { + "epoch": 3.104440789473684, + "grad_norm": 0.8032079260498678, + "learning_rate": 3.0508852828578302e-05, + "loss": 3.5493, + "step": 30200 + }, + { + "epoch": 3.10546875, + "grad_norm": 0.8170665644186634, + "learning_rate": 3.0486507976603065e-05, + "loss": 3.5926, + "step": 30210 + }, + { + "epoch": 3.106496710526316, + "grad_norm": 0.931932159809671, + "learning_rate": 3.0464167876863945e-05, + "loss": 3.5292, + "step": 30220 + }, + { + "epoch": 3.1075246710526314, + "grad_norm": 0.9087302698252443, + "learning_rate": 3.0441832538198116e-05, + "loss": 3.5449, + "step": 30230 + }, + { + "epoch": 3.1085526315789473, + "grad_norm": 1.0360586587471432, + "learning_rate": 3.0419501969440892e-05, + "loss": 3.5236, + "step": 30240 + }, + { + "epoch": 3.1095805921052633, + "grad_norm": 0.8732036134121721, + "learning_rate": 3.039717617942566e-05, + "loss": 3.5968, + "step": 30250 + }, + { + "epoch": 3.1106085526315788, + "grad_norm": 0.7058444521349879, + "learning_rate": 3.0374855176983963e-05, + "loss": 3.4577, + "step": 30260 + }, + { + "epoch": 3.1116365131578947, + "grad_norm": 0.7390905717675993, + "learning_rate": 
3.0352538970945436e-05, + "loss": 3.5584, + "step": 30270 + }, + { + "epoch": 3.1126644736842106, + "grad_norm": 2.1543096021482055, + "learning_rate": 3.0330227570137803e-05, + "loss": 3.554, + "step": 30280 + }, + { + "epoch": 3.113692434210526, + "grad_norm": 0.747728683486687, + "learning_rate": 3.030792098338687e-05, + "loss": 3.5047, + "step": 30290 + }, + { + "epoch": 3.114720394736842, + "grad_norm": 0.9763525164868452, + "learning_rate": 3.028561921951661e-05, + "loss": 3.5682, + "step": 30300 + }, + { + "epoch": 3.115748355263158, + "grad_norm": 0.7776905992068022, + "learning_rate": 3.0263322287349014e-05, + "loss": 3.5032, + "step": 30310 + }, + { + "epoch": 3.116776315789474, + "grad_norm": 1.2294791950390764, + "learning_rate": 3.0241030195704187e-05, + "loss": 3.5359, + "step": 30320 + }, + { + "epoch": 3.1178042763157894, + "grad_norm": 0.7185056519840212, + "learning_rate": 3.0218742953400313e-05, + "loss": 3.4924, + "step": 30330 + }, + { + "epoch": 3.1188322368421053, + "grad_norm": 0.8657436551184586, + "learning_rate": 3.01964605692537e-05, + "loss": 3.5343, + "step": 30340 + }, + { + "epoch": 3.1198601973684212, + "grad_norm": 0.9982970758348856, + "learning_rate": 3.0174183052078672e-05, + "loss": 3.5532, + "step": 30350 + }, + { + "epoch": 3.1208881578947367, + "grad_norm": 0.8067568124956921, + "learning_rate": 3.015191041068767e-05, + "loss": 3.5611, + "step": 30360 + }, + { + "epoch": 3.1219161184210527, + "grad_norm": 1.325307303006723, + "learning_rate": 3.012964265389117e-05, + "loss": 3.5264, + "step": 30370 + }, + { + "epoch": 3.1229440789473686, + "grad_norm": 0.8626408683070217, + "learning_rate": 3.0107379790497765e-05, + "loss": 3.6187, + "step": 30380 + }, + { + "epoch": 3.123972039473684, + "grad_norm": 0.7824586673947594, + "learning_rate": 3.0085121829314077e-05, + "loss": 3.5023, + "step": 30390 + }, + { + "epoch": 3.125, + "grad_norm": 0.5800300860823089, + "learning_rate": 3.0062868779144797e-05, + "loss": 3.5477, + "step": 30400 + }, + { + "epoch": 3.126027960526316, + "grad_norm": 1.0022212630824396, + "learning_rate": 3.0040620648792655e-05, + "loss": 3.4688, + "step": 30410 + }, + { + "epoch": 3.1270559210526314, + "grad_norm": 0.9175004129642732, + "learning_rate": 3.001837744705848e-05, + "loss": 3.5737, + "step": 30420 + }, + { + "epoch": 3.1280838815789473, + "grad_norm": 0.9121067937450928, + "learning_rate": 2.9996139182741123e-05, + "loss": 3.567, + "step": 30430 + }, + { + "epoch": 3.1291118421052633, + "grad_norm": 0.8233164327318018, + "learning_rate": 2.997390586463746e-05, + "loss": 3.5241, + "step": 30440 + }, + { + "epoch": 3.1301398026315788, + "grad_norm": 0.7336705258039894, + "learning_rate": 2.9951677501542442e-05, + "loss": 3.5509, + "step": 30450 + }, + { + "epoch": 3.1311677631578947, + "grad_norm": 0.6737215261865298, + "learning_rate": 2.9929454102249077e-05, + "loss": 3.5216, + "step": 30460 + }, + { + "epoch": 3.1321957236842106, + "grad_norm": 0.8128084550908694, + "learning_rate": 2.9907235675548355e-05, + "loss": 3.5411, + "step": 30470 + }, + { + "epoch": 3.1332236842105265, + "grad_norm": 0.7165113498882888, + "learning_rate": 2.988502223022935e-05, + "loss": 3.5208, + "step": 30480 + }, + { + "epoch": 3.134251644736842, + "grad_norm": 0.8327687536491092, + "learning_rate": 2.9862813775079118e-05, + "loss": 3.549, + "step": 30490 + }, + { + "epoch": 3.135279605263158, + "grad_norm": 0.781151879824684, + "learning_rate": 2.984061031888279e-05, + "loss": 3.5258, + "step": 30500 + }, + { + "epoch": 
3.1363075657894735, + "grad_norm": 0.9896796965946376, + "learning_rate": 2.981841187042349e-05, + "loss": 3.557, + "step": 30510 + }, + { + "epoch": 3.1373355263157894, + "grad_norm": 0.8909003518079345, + "learning_rate": 2.979621843848237e-05, + "loss": 3.5201, + "step": 30520 + }, + { + "epoch": 3.1383634868421053, + "grad_norm": 0.6954681779408721, + "learning_rate": 2.977403003183858e-05, + "loss": 3.4855, + "step": 30530 + }, + { + "epoch": 3.1393914473684212, + "grad_norm": 1.0880860952645282, + "learning_rate": 2.9751846659269302e-05, + "loss": 3.5406, + "step": 30540 + }, + { + "epoch": 3.1404194078947367, + "grad_norm": 0.9099808552583101, + "learning_rate": 2.9729668329549745e-05, + "loss": 3.4879, + "step": 30550 + }, + { + "epoch": 3.1414473684210527, + "grad_norm": 0.8228241389603572, + "learning_rate": 2.9707495051453073e-05, + "loss": 3.5003, + "step": 30560 + }, + { + "epoch": 3.1424753289473686, + "grad_norm": 0.8760037455173876, + "learning_rate": 2.9685326833750495e-05, + "loss": 3.5427, + "step": 30570 + }, + { + "epoch": 3.143503289473684, + "grad_norm": 0.8106127976416039, + "learning_rate": 2.966316368521118e-05, + "loss": 3.5456, + "step": 30580 + }, + { + "epoch": 3.14453125, + "grad_norm": 0.7466389127310342, + "learning_rate": 2.9641005614602333e-05, + "loss": 3.579, + "step": 30590 + }, + { + "epoch": 3.145559210526316, + "grad_norm": 0.8021971675467102, + "learning_rate": 2.9618852630689136e-05, + "loss": 3.5171, + "step": 30600 + }, + { + "epoch": 3.1465871710526314, + "grad_norm": 0.9434296323151882, + "learning_rate": 2.9596704742234733e-05, + "loss": 3.5435, + "step": 30610 + }, + { + "epoch": 3.1476151315789473, + "grad_norm": 0.8809816398266248, + "learning_rate": 2.9574561958000284e-05, + "loss": 3.4796, + "step": 30620 + }, + { + "epoch": 3.1486430921052633, + "grad_norm": 0.653921879589559, + "learning_rate": 2.9552424286744933e-05, + "loss": 3.5099, + "step": 30630 + }, + { + "epoch": 3.1496710526315788, + "grad_norm": 0.8365528658175685, + "learning_rate": 2.9530291737225774e-05, + "loss": 3.4845, + "step": 30640 + }, + { + "epoch": 3.1506990131578947, + "grad_norm": 0.7700648128182828, + "learning_rate": 2.950816431819788e-05, + "loss": 3.5702, + "step": 30650 + }, + { + "epoch": 3.1517269736842106, + "grad_norm": 0.856948478547721, + "learning_rate": 2.9486042038414296e-05, + "loss": 3.5386, + "step": 30660 + }, + { + "epoch": 3.1527549342105265, + "grad_norm": 0.7477626011385151, + "learning_rate": 2.946392490662608e-05, + "loss": 3.4579, + "step": 30670 + }, + { + "epoch": 3.153782894736842, + "grad_norm": 1.0361508180022778, + "learning_rate": 2.9441812931582183e-05, + "loss": 3.5847, + "step": 30680 + }, + { + "epoch": 3.154810855263158, + "grad_norm": 0.6425922705302112, + "learning_rate": 2.9419706122029562e-05, + "loss": 3.4581, + "step": 30690 + }, + { + "epoch": 3.1558388157894735, + "grad_norm": 1.1725030360558737, + "learning_rate": 2.9397604486713092e-05, + "loss": 3.5136, + "step": 30700 + }, + { + "epoch": 3.1568667763157894, + "grad_norm": 0.9962286764277565, + "learning_rate": 2.9375508034375652e-05, + "loss": 3.4703, + "step": 30710 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.614673273865418, + "learning_rate": 2.9353416773758044e-05, + "loss": 3.5207, + "step": 30720 + }, + { + "epoch": 3.1589226973684212, + "grad_norm": 0.9019103888047298, + "learning_rate": 2.9331330713599007e-05, + "loss": 3.5728, + "step": 30730 + }, + { + "epoch": 3.1599506578947367, + "grad_norm": 0.6129538032446108, + "learning_rate": 
2.9309249862635224e-05, + "loss": 3.4867, + "step": 30740 + }, + { + "epoch": 3.1609786184210527, + "grad_norm": 0.654083604358227, + "learning_rate": 2.928717422960134e-05, + "loss": 3.4749, + "step": 30750 + }, + { + "epoch": 3.1620065789473686, + "grad_norm": 0.7435202027188257, + "learning_rate": 2.9265103823229926e-05, + "loss": 3.5596, + "step": 30760 + }, + { + "epoch": 3.163034539473684, + "grad_norm": 0.7162845870183632, + "learning_rate": 2.9243038652251458e-05, + "loss": 3.4766, + "step": 30770 + }, + { + "epoch": 3.1640625, + "grad_norm": 1.046183571296096, + "learning_rate": 2.9220978725394395e-05, + "loss": 3.5205, + "step": 30780 + }, + { + "epoch": 3.165090460526316, + "grad_norm": 1.019402740681096, + "learning_rate": 2.9198924051385055e-05, + "loss": 3.5233, + "step": 30790 + }, + { + "epoch": 3.1661184210526314, + "grad_norm": 0.6833743806078755, + "learning_rate": 2.9176874638947744e-05, + "loss": 3.5114, + "step": 30800 + }, + { + "epoch": 3.1671463815789473, + "grad_norm": 0.7922652181991647, + "learning_rate": 2.9154830496804654e-05, + "loss": 3.5349, + "step": 30810 + }, + { + "epoch": 3.1681743421052633, + "grad_norm": 0.8166531641161797, + "learning_rate": 2.9132791633675867e-05, + "loss": 3.5464, + "step": 30820 + }, + { + "epoch": 3.1692023026315788, + "grad_norm": 1.0491191098072732, + "learning_rate": 2.9110758058279425e-05, + "loss": 3.5655, + "step": 30830 + }, + { + "epoch": 3.1702302631578947, + "grad_norm": 0.7212630580467109, + "learning_rate": 2.908872977933127e-05, + "loss": 3.5811, + "step": 30840 + }, + { + "epoch": 3.1712582236842106, + "grad_norm": 0.8754912427946807, + "learning_rate": 2.9066706805545233e-05, + "loss": 3.5279, + "step": 30850 + }, + { + "epoch": 3.1722861842105265, + "grad_norm": 1.2456252792978237, + "learning_rate": 2.9044689145633024e-05, + "loss": 3.5325, + "step": 30860 + }, + { + "epoch": 3.173314144736842, + "grad_norm": 0.7974201656310899, + "learning_rate": 2.9022676808304284e-05, + "loss": 3.4995, + "step": 30870 + }, + { + "epoch": 3.174342105263158, + "grad_norm": 1.3401619224788546, + "learning_rate": 2.9000669802266572e-05, + "loss": 3.5035, + "step": 30880 + }, + { + "epoch": 3.1753700657894735, + "grad_norm": 0.6792777569336113, + "learning_rate": 2.8978668136225273e-05, + "loss": 3.4696, + "step": 30890 + }, + { + "epoch": 3.1763980263157894, + "grad_norm": 0.7287158844227021, + "learning_rate": 2.8956671818883716e-05, + "loss": 3.6302, + "step": 30900 + }, + { + "epoch": 3.1774259868421053, + "grad_norm": 0.7213889892839843, + "learning_rate": 2.8934680858943075e-05, + "loss": 3.5343, + "step": 30910 + }, + { + "epoch": 3.1784539473684212, + "grad_norm": 0.7145922692706814, + "learning_rate": 2.8912695265102435e-05, + "loss": 3.5461, + "step": 30920 + }, + { + "epoch": 3.1794819078947367, + "grad_norm": 0.6941264033537805, + "learning_rate": 2.8890715046058747e-05, + "loss": 3.5226, + "step": 30930 + }, + { + "epoch": 3.1805098684210527, + "grad_norm": 0.758385589042794, + "learning_rate": 2.886874021050682e-05, + "loss": 3.4942, + "step": 30940 + }, + { + "epoch": 3.1815378289473686, + "grad_norm": 1.077916774821021, + "learning_rate": 2.8846770767139354e-05, + "loss": 3.5003, + "step": 30950 + }, + { + "epoch": 3.182565789473684, + "grad_norm": 1.1422140992455485, + "learning_rate": 2.8824806724646927e-05, + "loss": 3.5276, + "step": 30960 + }, + { + "epoch": 3.18359375, + "grad_norm": 1.3323261460333171, + "learning_rate": 2.8802848091717943e-05, + "loss": 3.5068, + "step": 30970 + }, + { + "epoch": 
3.184621710526316, + "grad_norm": 1.0241828126516668, + "learning_rate": 2.878089487703869e-05, + "loss": 3.5377, + "step": 30980 + }, + { + "epoch": 3.1856496710526314, + "grad_norm": 0.5967165472005463, + "learning_rate": 2.8758947089293307e-05, + "loss": 3.5152, + "step": 30990 + }, + { + "epoch": 3.1866776315789473, + "grad_norm": 0.8137344372926804, + "learning_rate": 2.87370047371638e-05, + "loss": 3.4921, + "step": 31000 + }, + { + "epoch": 3.1877055921052633, + "grad_norm": 0.9235972464786818, + "learning_rate": 2.8715067829330005e-05, + "loss": 3.4744, + "step": 31010 + }, + { + "epoch": 3.1887335526315788, + "grad_norm": 0.6579787100982685, + "learning_rate": 2.869313637446962e-05, + "loss": 3.5553, + "step": 31020 + }, + { + "epoch": 3.1897615131578947, + "grad_norm": 0.8103303551168672, + "learning_rate": 2.8671210381258157e-05, + "loss": 3.6012, + "step": 31030 + }, + { + "epoch": 3.1907894736842106, + "grad_norm": 0.9306634220469333, + "learning_rate": 2.8649289858369003e-05, + "loss": 3.6103, + "step": 31040 + }, + { + "epoch": 3.1918174342105265, + "grad_norm": 1.0139600752018758, + "learning_rate": 2.862737481447339e-05, + "loss": 3.5902, + "step": 31050 + }, + { + "epoch": 3.192845394736842, + "grad_norm": 0.6027790747710111, + "learning_rate": 2.8605465258240325e-05, + "loss": 3.5028, + "step": 31060 + }, + { + "epoch": 3.193873355263158, + "grad_norm": 1.09902157835043, + "learning_rate": 2.8583561198336683e-05, + "loss": 3.4539, + "step": 31070 + }, + { + "epoch": 3.1949013157894735, + "grad_norm": 0.7953522288174131, + "learning_rate": 2.856166264342717e-05, + "loss": 3.528, + "step": 31080 + }, + { + "epoch": 3.1959292763157894, + "grad_norm": 0.6897595510799728, + "learning_rate": 2.853976960217431e-05, + "loss": 3.5526, + "step": 31090 + }, + { + "epoch": 3.1969572368421053, + "grad_norm": 0.8086126574088686, + "learning_rate": 2.8517882083238433e-05, + "loss": 3.5283, + "step": 31100 + }, + { + "epoch": 3.1979851973684212, + "grad_norm": 1.1899863865818987, + "learning_rate": 2.84960000952777e-05, + "loss": 3.4944, + "step": 31110 + }, + { + "epoch": 3.1990131578947367, + "grad_norm": 0.8095991730003506, + "learning_rate": 2.8474123646948056e-05, + "loss": 3.5457, + "step": 31120 + }, + { + "epoch": 3.2000411184210527, + "grad_norm": 1.0678240395475886, + "learning_rate": 2.84522527469033e-05, + "loss": 3.5435, + "step": 31130 + }, + { + "epoch": 3.2010690789473686, + "grad_norm": 0.7177018940880806, + "learning_rate": 2.843038740379501e-05, + "loss": 3.5859, + "step": 31140 + }, + { + "epoch": 3.202097039473684, + "grad_norm": 0.7880776557345399, + "learning_rate": 2.8408527626272547e-05, + "loss": 3.541, + "step": 31150 + }, + { + "epoch": 3.203125, + "grad_norm": 0.7099867531386876, + "learning_rate": 2.83866734229831e-05, + "loss": 3.5382, + "step": 31160 + }, + { + "epoch": 3.204152960526316, + "grad_norm": 0.7984511178305469, + "learning_rate": 2.836482480257166e-05, + "loss": 3.5852, + "step": 31170 + }, + { + "epoch": 3.2051809210526314, + "grad_norm": 0.6427789623541189, + "learning_rate": 2.834298177368098e-05, + "loss": 3.5322, + "step": 31180 + }, + { + "epoch": 3.2062088815789473, + "grad_norm": 1.0219175977315123, + "learning_rate": 2.83211443449516e-05, + "loss": 3.5361, + "step": 31190 + }, + { + "epoch": 3.2072368421052633, + "grad_norm": 1.2192212685286827, + "learning_rate": 2.829931252502187e-05, + "loss": 3.546, + "step": 31200 + }, + { + "epoch": 3.2082648026315788, + "grad_norm": 0.9685935232473282, + "learning_rate": 2.827748632252793e-05, 
+ "loss": 3.4878, + "step": 31210 + }, + { + "epoch": 3.2092927631578947, + "grad_norm": 1.193984496966471, + "learning_rate": 2.825566574610365e-05, + "loss": 3.4726, + "step": 31220 + }, + { + "epoch": 3.2103207236842106, + "grad_norm": 0.9615299290498545, + "learning_rate": 2.8233850804380726e-05, + "loss": 3.6029, + "step": 31230 + }, + { + "epoch": 3.2113486842105265, + "grad_norm": 0.7443165008081201, + "learning_rate": 2.8212041505988578e-05, + "loss": 3.4783, + "step": 31240 + }, + { + "epoch": 3.212376644736842, + "grad_norm": 0.6579344566175537, + "learning_rate": 2.8190237859554433e-05, + "loss": 3.5104, + "step": 31250 + }, + { + "epoch": 3.213404605263158, + "grad_norm": 0.9262006429373878, + "learning_rate": 2.816843987370327e-05, + "loss": 3.6272, + "step": 31260 + }, + { + "epoch": 3.2144325657894735, + "grad_norm": 0.8204486552276554, + "learning_rate": 2.814664755705781e-05, + "loss": 3.4642, + "step": 31270 + }, + { + "epoch": 3.2154605263157894, + "grad_norm": 0.7649590846570429, + "learning_rate": 2.8124860918238548e-05, + "loss": 3.4886, + "step": 31280 + }, + { + "epoch": 3.2164884868421053, + "grad_norm": 0.8117755080674702, + "learning_rate": 2.8103079965863756e-05, + "loss": 3.521, + "step": 31290 + }, + { + "epoch": 3.2175164473684212, + "grad_norm": 0.870960345124325, + "learning_rate": 2.8081304708549416e-05, + "loss": 3.5574, + "step": 31300 + }, + { + "epoch": 3.2185444078947367, + "grad_norm": 0.7577759252157651, + "learning_rate": 2.805953515490927e-05, + "loss": 3.4665, + "step": 31310 + }, + { + "epoch": 3.2195723684210527, + "grad_norm": 0.8252400944764464, + "learning_rate": 2.803777131355481e-05, + "loss": 3.5378, + "step": 31320 + }, + { + "epoch": 3.2206003289473686, + "grad_norm": 1.2802474588861705, + "learning_rate": 2.8016013193095255e-05, + "loss": 3.5453, + "step": 31330 + }, + { + "epoch": 3.221628289473684, + "grad_norm": 1.1134655180903432, + "learning_rate": 2.7994260802137593e-05, + "loss": 3.5119, + "step": 31340 + }, + { + "epoch": 3.22265625, + "grad_norm": 0.9045825284628946, + "learning_rate": 2.7972514149286523e-05, + "loss": 3.5678, + "step": 31350 + }, + { + "epoch": 3.223684210526316, + "grad_norm": 0.6728608195826604, + "learning_rate": 2.7950773243144453e-05, + "loss": 3.5525, + "step": 31360 + }, + { + "epoch": 3.2247121710526314, + "grad_norm": 0.9540907547806179, + "learning_rate": 2.792903809231155e-05, + "loss": 3.4777, + "step": 31370 + }, + { + "epoch": 3.2257401315789473, + "grad_norm": 0.7822243527346674, + "learning_rate": 2.7907308705385713e-05, + "loss": 3.5539, + "step": 31380 + }, + { + "epoch": 3.2267680921052633, + "grad_norm": 0.956290444908765, + "learning_rate": 2.7885585090962518e-05, + "loss": 3.5427, + "step": 31390 + }, + { + "epoch": 3.2277960526315788, + "grad_norm": 0.9290073695860591, + "learning_rate": 2.7863867257635302e-05, + "loss": 3.5539, + "step": 31400 + }, + { + "epoch": 3.2288240131578947, + "grad_norm": 0.7200887485617354, + "learning_rate": 2.784215521399507e-05, + "loss": 3.5161, + "step": 31410 + }, + { + "epoch": 3.2298519736842106, + "grad_norm": 0.7404061472941031, + "learning_rate": 2.7820448968630595e-05, + "loss": 3.5678, + "step": 31420 + }, + { + "epoch": 3.2308799342105265, + "grad_norm": 0.6902730027482861, + "learning_rate": 2.779874853012829e-05, + "loss": 3.494, + "step": 31430 + }, + { + "epoch": 3.231907894736842, + "grad_norm": 0.9530649947992403, + "learning_rate": 2.777705390707233e-05, + "loss": 3.5532, + "step": 31440 + }, + { + "epoch": 3.232935855263158, + 
"grad_norm": 0.7698225835047011, + "learning_rate": 2.7755365108044544e-05, + "loss": 3.5528, + "step": 31450 + }, + { + "epoch": 3.2339638157894735, + "grad_norm": 0.8279473456074298, + "learning_rate": 2.7733682141624484e-05, + "loss": 3.63, + "step": 31460 + }, + { + "epoch": 3.2349917763157894, + "grad_norm": 0.8139117154344154, + "learning_rate": 2.7712005016389402e-05, + "loss": 3.5229, + "step": 31470 + }, + { + "epoch": 3.2360197368421053, + "grad_norm": 0.6373452924488553, + "learning_rate": 2.7690333740914205e-05, + "loss": 3.583, + "step": 31480 + }, + { + "epoch": 3.2370476973684212, + "grad_norm": 0.9314808258626356, + "learning_rate": 2.7668668323771508e-05, + "loss": 3.5351, + "step": 31490 + }, + { + "epoch": 3.2380756578947367, + "grad_norm": 0.8048765090143851, + "learning_rate": 2.7647008773531623e-05, + "loss": 3.5504, + "step": 31500 + }, + { + "epoch": 3.2391036184210527, + "grad_norm": 0.6546744097218085, + "learning_rate": 2.7625355098762515e-05, + "loss": 3.5261, + "step": 31510 + }, + { + "epoch": 3.2401315789473686, + "grad_norm": 0.9004996806731391, + "learning_rate": 2.7603707308029827e-05, + "loss": 3.5469, + "step": 31520 + }, + { + "epoch": 3.241159539473684, + "grad_norm": 0.5662343257562026, + "learning_rate": 2.7582065409896877e-05, + "loss": 3.5161, + "step": 31530 + }, + { + "epoch": 3.2421875, + "grad_norm": 0.7792651330128931, + "learning_rate": 2.756042941292469e-05, + "loss": 3.5204, + "step": 31540 + }, + { + "epoch": 3.243215460526316, + "grad_norm": 0.7215643391806721, + "learning_rate": 2.753879932567189e-05, + "loss": 3.4902, + "step": 31550 + }, + { + "epoch": 3.2442434210526314, + "grad_norm": 0.8902137877831873, + "learning_rate": 2.7517175156694817e-05, + "loss": 3.526, + "step": 31560 + }, + { + "epoch": 3.2452713815789473, + "grad_norm": 0.8419610591364863, + "learning_rate": 2.7495556914547437e-05, + "loss": 3.492, + "step": 31570 + }, + { + "epoch": 3.2462993421052633, + "grad_norm": 1.029673498402913, + "learning_rate": 2.747394460778138e-05, + "loss": 3.5246, + "step": 31580 + }, + { + "epoch": 3.2473273026315788, + "grad_norm": 0.7519744077370653, + "learning_rate": 2.7452338244945962e-05, + "loss": 3.5259, + "step": 31590 + }, + { + "epoch": 3.2483552631578947, + "grad_norm": 1.0769552989739914, + "learning_rate": 2.7430737834588084e-05, + "loss": 3.5487, + "step": 31600 + }, + { + "epoch": 3.2493832236842106, + "grad_norm": 0.7727935499312071, + "learning_rate": 2.7409143385252356e-05, + "loss": 3.4771, + "step": 31610 + }, + { + "epoch": 3.2504111842105265, + "grad_norm": 0.6399763824530813, + "learning_rate": 2.7387554905480964e-05, + "loss": 3.549, + "step": 31620 + }, + { + "epoch": 3.251439144736842, + "grad_norm": 1.214161426730992, + "learning_rate": 2.7365972403813805e-05, + "loss": 3.5485, + "step": 31630 + }, + { + "epoch": 3.252467105263158, + "grad_norm": 0.9930688142817631, + "learning_rate": 2.7344395888788353e-05, + "loss": 3.4865, + "step": 31640 + }, + { + "epoch": 3.2534950657894735, + "grad_norm": 0.9012391897391805, + "learning_rate": 2.732282536893976e-05, + "loss": 3.5439, + "step": 31650 + }, + { + "epoch": 3.2545230263157894, + "grad_norm": 0.6932038898478845, + "learning_rate": 2.7301260852800743e-05, + "loss": 3.5941, + "step": 31660 + }, + { + "epoch": 3.2555509868421053, + "grad_norm": 0.7840849366670505, + "learning_rate": 2.7279702348901712e-05, + "loss": 3.5762, + "step": 31670 + }, + { + "epoch": 3.2565789473684212, + "grad_norm": 1.1907209582970957, + "learning_rate": 2.725814986577068e-05, + 
"loss": 3.5546, + "step": 31680 + }, + { + "epoch": 3.2576069078947367, + "grad_norm": 0.6183028705587049, + "learning_rate": 2.723660341193324e-05, + "loss": 3.5104, + "step": 31690 + }, + { + "epoch": 3.2586348684210527, + "grad_norm": 0.7546395012102164, + "learning_rate": 2.7215062995912638e-05, + "loss": 3.5044, + "step": 31700 + }, + { + "epoch": 3.2596628289473686, + "grad_norm": 0.7003555433739009, + "learning_rate": 2.7193528626229743e-05, + "loss": 3.4913, + "step": 31710 + }, + { + "epoch": 3.260690789473684, + "grad_norm": 0.9398798934823339, + "learning_rate": 2.717200031140299e-05, + "loss": 3.4988, + "step": 31720 + }, + { + "epoch": 3.26171875, + "grad_norm": 0.7014204556901867, + "learning_rate": 2.7150478059948452e-05, + "loss": 3.6047, + "step": 31730 + }, + { + "epoch": 3.262746710526316, + "grad_norm": 1.3858264470039223, + "learning_rate": 2.712896188037977e-05, + "loss": 3.4769, + "step": 31740 + }, + { + "epoch": 3.2637746710526314, + "grad_norm": 0.8803124939874846, + "learning_rate": 2.710745178120824e-05, + "loss": 3.5547, + "step": 31750 + }, + { + "epoch": 3.2648026315789473, + "grad_norm": 0.9644673651952999, + "learning_rate": 2.7085947770942683e-05, + "loss": 3.6153, + "step": 31760 + }, + { + "epoch": 3.2658305921052633, + "grad_norm": 1.4427273286562796, + "learning_rate": 2.7064449858089575e-05, + "loss": 3.5561, + "step": 31770 + }, + { + "epoch": 3.2668585526315788, + "grad_norm": 0.7831319267654658, + "learning_rate": 2.704295805115292e-05, + "loss": 3.6037, + "step": 31780 + }, + { + "epoch": 3.2678865131578947, + "grad_norm": 0.9139727034843758, + "learning_rate": 2.7021472358634362e-05, + "loss": 3.4864, + "step": 31790 + }, + { + "epoch": 3.2689144736842106, + "grad_norm": 0.9190390981881092, + "learning_rate": 2.699999278903311e-05, + "loss": 3.5243, + "step": 31800 + }, + { + "epoch": 3.2699424342105265, + "grad_norm": 0.6945165514597482, + "learning_rate": 2.6978519350845913e-05, + "loss": 3.5083, + "step": 31810 + }, + { + "epoch": 3.270970394736842, + "grad_norm": 0.9604825128972049, + "learning_rate": 2.6957052052567134e-05, + "loss": 3.4415, + "step": 31820 + }, + { + "epoch": 3.271998355263158, + "grad_norm": 1.0510808808213854, + "learning_rate": 2.693559090268872e-05, + "loss": 3.5132, + "step": 31830 + }, + { + "epoch": 3.2730263157894735, + "grad_norm": 0.9151126339382636, + "learning_rate": 2.6914135909700147e-05, + "loss": 3.6539, + "step": 31840 + }, + { + "epoch": 3.2740542763157894, + "grad_norm": 1.0289942605995734, + "learning_rate": 2.6892687082088465e-05, + "loss": 3.4769, + "step": 31850 + }, + { + "epoch": 3.2750822368421053, + "grad_norm": 0.6506114733679621, + "learning_rate": 2.6871244428338305e-05, + "loss": 3.4264, + "step": 31860 + }, + { + "epoch": 3.2761101973684212, + "grad_norm": 0.6987081613451908, + "learning_rate": 2.6849807956931834e-05, + "loss": 3.5745, + "step": 31870 + }, + { + "epoch": 3.2771381578947367, + "grad_norm": 1.0470781271567644, + "learning_rate": 2.6828377676348787e-05, + "loss": 3.5219, + "step": 31880 + }, + { + "epoch": 3.2781661184210527, + "grad_norm": 0.9215246601686027, + "learning_rate": 2.6806953595066443e-05, + "loss": 3.4668, + "step": 31890 + }, + { + "epoch": 3.2791940789473686, + "grad_norm": 1.1527591279958422, + "learning_rate": 2.6785535721559645e-05, + "loss": 3.5718, + "step": 31900 + }, + { + "epoch": 3.280222039473684, + "grad_norm": 0.7546043523946533, + "learning_rate": 2.6764124064300727e-05, + "loss": 3.5139, + "step": 31910 + }, + { + "epoch": 3.28125, + "grad_norm": 
0.9856757093022729, + "learning_rate": 2.6742718631759642e-05, + "loss": 3.5365, + "step": 31920 + }, + { + "epoch": 3.282277960526316, + "grad_norm": 0.7104998069134745, + "learning_rate": 2.6721319432403848e-05, + "loss": 3.5654, + "step": 31930 + }, + { + "epoch": 3.2833059210526314, + "grad_norm": 0.6120212269932048, + "learning_rate": 2.66999264746983e-05, + "loss": 3.5493, + "step": 31940 + }, + { + "epoch": 3.2843338815789473, + "grad_norm": 0.8091958057474947, + "learning_rate": 2.6678539767105518e-05, + "loss": 3.5234, + "step": 31950 + }, + { + "epoch": 3.2853618421052633, + "grad_norm": 0.6266475182213457, + "learning_rate": 2.6657159318085585e-05, + "loss": 3.4435, + "step": 31960 + }, + { + "epoch": 3.2863898026315788, + "grad_norm": 0.742675379966813, + "learning_rate": 2.663578513609603e-05, + "loss": 3.5685, + "step": 31970 + }, + { + "epoch": 3.2874177631578947, + "grad_norm": 0.9319268512691785, + "learning_rate": 2.6614417229591957e-05, + "loss": 3.5106, + "step": 31980 + }, + { + "epoch": 3.2884457236842106, + "grad_norm": 0.9533801128229847, + "learning_rate": 2.6593055607025977e-05, + "loss": 3.4793, + "step": 31990 + }, + { + "epoch": 3.2894736842105265, + "grad_norm": 0.5787645874491991, + "learning_rate": 2.6571700276848208e-05, + "loss": 3.5128, + "step": 32000 + }, + { + "epoch": 3.290501644736842, + "grad_norm": 1.2010961493215941, + "learning_rate": 2.6550351247506285e-05, + "loss": 3.523, + "step": 32010 + }, + { + "epoch": 3.291529605263158, + "grad_norm": 0.9239454779578222, + "learning_rate": 2.6529008527445365e-05, + "loss": 3.5662, + "step": 32020 + }, + { + "epoch": 3.2925575657894735, + "grad_norm": 1.1185809428599123, + "learning_rate": 2.650767212510804e-05, + "loss": 3.5859, + "step": 32030 + }, + { + "epoch": 3.2935855263157894, + "grad_norm": 1.0295288287846562, + "learning_rate": 2.6486342048934512e-05, + "loss": 3.5259, + "step": 32040 + }, + { + "epoch": 3.2946134868421053, + "grad_norm": 0.6840598343073601, + "learning_rate": 2.6465018307362416e-05, + "loss": 3.5094, + "step": 32050 + }, + { + "epoch": 3.2956414473684212, + "grad_norm": 1.0024509197216618, + "learning_rate": 2.6443700908826863e-05, + "loss": 3.4899, + "step": 32060 + }, + { + "epoch": 3.2966694078947367, + "grad_norm": 1.1585361458675936, + "learning_rate": 2.6422389861760495e-05, + "loss": 3.5416, + "step": 32070 + }, + { + "epoch": 3.2976973684210527, + "grad_norm": 0.9132319858861843, + "learning_rate": 2.640108517459343e-05, + "loss": 3.6084, + "step": 32080 + }, + { + "epoch": 3.2987253289473686, + "grad_norm": 0.8240244487241773, + "learning_rate": 2.637978685575326e-05, + "loss": 3.5464, + "step": 32090 + }, + { + "epoch": 3.299753289473684, + "grad_norm": 0.7203757516889143, + "learning_rate": 2.6358494913665077e-05, + "loss": 3.5372, + "step": 32100 + }, + { + "epoch": 3.30078125, + "grad_norm": 0.8787981093953683, + "learning_rate": 2.633720935675144e-05, + "loss": 3.4914, + "step": 32110 + }, + { + "epoch": 3.301809210526316, + "grad_norm": 1.1942515932183901, + "learning_rate": 2.631593019343235e-05, + "loss": 3.4564, + "step": 32120 + }, + { + "epoch": 3.3028371710526314, + "grad_norm": 1.4351740554095447, + "learning_rate": 2.6294657432125354e-05, + "loss": 3.5172, + "step": 32130 + }, + { + "epoch": 3.3038651315789473, + "grad_norm": 0.9568723274158751, + "learning_rate": 2.6273391081245415e-05, + "loss": 3.4392, + "step": 32140 + }, + { + "epoch": 3.3048930921052633, + "grad_norm": 0.902911020292669, + "learning_rate": 2.625213114920494e-05, + "loss": 3.555, + 
"step": 32150 + }, + { + "epoch": 3.3059210526315788, + "grad_norm": 0.5940828394330164, + "learning_rate": 2.6230877644413834e-05, + "loss": 3.4482, + "step": 32160 + }, + { + "epoch": 3.3069490131578947, + "grad_norm": 0.8055434258048361, + "learning_rate": 2.620963057527948e-05, + "loss": 3.5034, + "step": 32170 + }, + { + "epoch": 3.3079769736842106, + "grad_norm": 0.8284799426936715, + "learning_rate": 2.618838995020665e-05, + "loss": 3.5013, + "step": 32180 + }, + { + "epoch": 3.3090049342105265, + "grad_norm": 1.2457954264884037, + "learning_rate": 2.616715577759762e-05, + "loss": 3.4526, + "step": 32190 + }, + { + "epoch": 3.310032894736842, + "grad_norm": 1.0453313007574205, + "learning_rate": 2.6145928065852092e-05, + "loss": 3.5443, + "step": 32200 + }, + { + "epoch": 3.311060855263158, + "grad_norm": 0.8462318723306275, + "learning_rate": 2.6124706823367213e-05, + "loss": 3.4556, + "step": 32210 + }, + { + "epoch": 3.3120888157894735, + "grad_norm": 1.0083330569516478, + "learning_rate": 2.6103492058537576e-05, + "loss": 3.5656, + "step": 32220 + }, + { + "epoch": 3.3131167763157894, + "grad_norm": 0.8734950679580857, + "learning_rate": 2.6082283779755234e-05, + "loss": 3.5302, + "step": 32230 + }, + { + "epoch": 3.3141447368421053, + "grad_norm": 0.6662412691886216, + "learning_rate": 2.6061081995409594e-05, + "loss": 3.497, + "step": 32240 + }, + { + "epoch": 3.3151726973684212, + "grad_norm": 0.8080033805417641, + "learning_rate": 2.60398867138876e-05, + "loss": 3.5361, + "step": 32250 + }, + { + "epoch": 3.3162006578947367, + "grad_norm": 0.6105434268749101, + "learning_rate": 2.6018697943573575e-05, + "loss": 3.6377, + "step": 32260 + }, + { + "epoch": 3.3172286184210527, + "grad_norm": 1.014270054717423, + "learning_rate": 2.5997515692849228e-05, + "loss": 3.3968, + "step": 32270 + }, + { + "epoch": 3.3182565789473686, + "grad_norm": 1.4736857167752535, + "learning_rate": 2.597633997009373e-05, + "loss": 3.5408, + "step": 32280 + }, + { + "epoch": 3.319284539473684, + "grad_norm": 0.7873798218110278, + "learning_rate": 2.5955170783683722e-05, + "loss": 3.4914, + "step": 32290 + }, + { + "epoch": 3.3203125, + "grad_norm": 0.9861363290568025, + "learning_rate": 2.5934008141993135e-05, + "loss": 3.5433, + "step": 32300 + }, + { + "epoch": 3.321340460526316, + "grad_norm": 0.7916101552636657, + "learning_rate": 2.591285205339342e-05, + "loss": 3.5206, + "step": 32310 + }, + { + "epoch": 3.3223684210526314, + "grad_norm": 0.9908547649413153, + "learning_rate": 2.589170252625337e-05, + "loss": 3.4883, + "step": 32320 + }, + { + "epoch": 3.3233963815789473, + "grad_norm": 0.6910358018447035, + "learning_rate": 2.5870559568939224e-05, + "loss": 3.5183, + "step": 32330 + }, + { + "epoch": 3.3244243421052633, + "grad_norm": 0.8825439449748135, + "learning_rate": 2.58494231898146e-05, + "loss": 3.5366, + "step": 32340 + }, + { + "epoch": 3.3254523026315788, + "grad_norm": 0.9368116394767904, + "learning_rate": 2.5828293397240538e-05, + "loss": 3.4453, + "step": 32350 + }, + { + "epoch": 3.3264802631578947, + "grad_norm": 1.1804539798845117, + "learning_rate": 2.5807170199575406e-05, + "loss": 3.5639, + "step": 32360 + }, + { + "epoch": 3.3275082236842106, + "grad_norm": 0.8662453766076036, + "learning_rate": 2.5786053605175058e-05, + "loss": 3.5101, + "step": 32370 + }, + { + "epoch": 3.3285361842105265, + "grad_norm": 1.0590643593284155, + "learning_rate": 2.576494362239268e-05, + "loss": 3.3889, + "step": 32380 + }, + { + "epoch": 3.329564144736842, + "grad_norm": 
0.6137353952849449, + "learning_rate": 2.574384025957884e-05, + "loss": 3.5857, + "step": 32390 + }, + { + "epoch": 3.330592105263158, + "grad_norm": 0.730705059811239, + "learning_rate": 2.57227435250815e-05, + "loss": 3.5556, + "step": 32400 + }, + { + "epoch": 3.3316200657894735, + "grad_norm": 0.9590948472420844, + "learning_rate": 2.5701653427245997e-05, + "loss": 3.5116, + "step": 32410 + }, + { + "epoch": 3.3326480263157894, + "grad_norm": 0.8347947340063931, + "learning_rate": 2.5680569974415055e-05, + "loss": 3.479, + "step": 32420 + }, + { + "epoch": 3.3336759868421053, + "grad_norm": 0.6693013155832075, + "learning_rate": 2.5659493174928754e-05, + "loss": 3.6015, + "step": 32430 + }, + { + "epoch": 3.3347039473684212, + "grad_norm": 1.3075082860359521, + "learning_rate": 2.563842303712456e-05, + "loss": 3.5145, + "step": 32440 + }, + { + "epoch": 3.3357319078947367, + "grad_norm": 1.3297799962435228, + "learning_rate": 2.561735956933725e-05, + "loss": 3.6004, + "step": 32450 + }, + { + "epoch": 3.3367598684210527, + "grad_norm": 0.7536411115651791, + "learning_rate": 2.559630277989905e-05, + "loss": 3.5288, + "step": 32460 + }, + { + "epoch": 3.3377878289473686, + "grad_norm": 0.8841696915378535, + "learning_rate": 2.5575252677139488e-05, + "loss": 3.6083, + "step": 32470 + }, + { + "epoch": 3.338815789473684, + "grad_norm": 0.8315797753960517, + "learning_rate": 2.5554209269385437e-05, + "loss": 3.6205, + "step": 32480 + }, + { + "epoch": 3.33984375, + "grad_norm": 0.5889274604279271, + "learning_rate": 2.5533172564961138e-05, + "loss": 3.5436, + "step": 32490 + }, + { + "epoch": 3.340871710526316, + "grad_norm": 0.89637054186058, + "learning_rate": 2.5512142572188233e-05, + "loss": 3.3796, + "step": 32500 + }, + { + "epoch": 3.3418996710526314, + "grad_norm": 0.7979655937220543, + "learning_rate": 2.5491119299385604e-05, + "loss": 3.5024, + "step": 32510 + }, + { + "epoch": 3.3429276315789473, + "grad_norm": 0.6306967747185617, + "learning_rate": 2.547010275486955e-05, + "loss": 3.5147, + "step": 32520 + }, + { + "epoch": 3.3439555921052633, + "grad_norm": 0.9331822466041654, + "learning_rate": 2.5449092946953692e-05, + "loss": 3.4352, + "step": 32530 + }, + { + "epoch": 3.3449835526315788, + "grad_norm": 1.0289590415069085, + "learning_rate": 2.5428089883948986e-05, + "loss": 3.5486, + "step": 32540 + }, + { + "epoch": 3.3460115131578947, + "grad_norm": 0.6355249411673144, + "learning_rate": 2.5407093574163706e-05, + "loss": 3.5489, + "step": 32550 + }, + { + "epoch": 3.3470394736842106, + "grad_norm": 1.0326158552772697, + "learning_rate": 2.5386104025903486e-05, + "loss": 3.5969, + "step": 32560 + }, + { + "epoch": 3.3480674342105265, + "grad_norm": 0.818530408516935, + "learning_rate": 2.5365121247471222e-05, + "loss": 3.5183, + "step": 32570 + }, + { + "epoch": 3.349095394736842, + "grad_norm": 0.9242397625195986, + "learning_rate": 2.5344145247167223e-05, + "loss": 3.5409, + "step": 32580 + }, + { + "epoch": 3.350123355263158, + "grad_norm": 0.8310087484893076, + "learning_rate": 2.5323176033289062e-05, + "loss": 3.5239, + "step": 32590 + }, + { + "epoch": 3.3511513157894735, + "grad_norm": 0.9043360262937438, + "learning_rate": 2.53022136141316e-05, + "loss": 3.4936, + "step": 32600 + }, + { + "epoch": 3.3521792763157894, + "grad_norm": 0.8291523876270597, + "learning_rate": 2.5281257997987053e-05, + "loss": 3.4963, + "step": 32610 + }, + { + "epoch": 3.3532072368421053, + "grad_norm": 0.9118620968194013, + "learning_rate": 2.526030919314498e-05, + "loss": 3.4833, + 
"step": 32620 + }, + { + "epoch": 3.3542351973684212, + "grad_norm": 0.9188102274247023, + "learning_rate": 2.523936720789216e-05, + "loss": 3.5157, + "step": 32630 + }, + { + "epoch": 3.3552631578947367, + "grad_norm": 0.9344873341681214, + "learning_rate": 2.521843205051273e-05, + "loss": 3.5304, + "step": 32640 + }, + { + "epoch": 3.3562911184210527, + "grad_norm": 0.5830824914384345, + "learning_rate": 2.5197503729288123e-05, + "loss": 3.5471, + "step": 32650 + }, + { + "epoch": 3.3573190789473686, + "grad_norm": 0.8264769583788593, + "learning_rate": 2.5176582252497018e-05, + "loss": 3.5149, + "step": 32660 + }, + { + "epoch": 3.358347039473684, + "grad_norm": 0.8115368416203662, + "learning_rate": 2.5155667628415465e-05, + "loss": 3.5947, + "step": 32670 + }, + { + "epoch": 3.359375, + "grad_norm": 0.7613638417853152, + "learning_rate": 2.5134759865316777e-05, + "loss": 3.4125, + "step": 32680 + }, + { + "epoch": 3.360402960526316, + "grad_norm": 0.8264731360660336, + "learning_rate": 2.5113858971471508e-05, + "loss": 3.4437, + "step": 32690 + }, + { + "epoch": 3.3614309210526314, + "grad_norm": 0.8558039944593188, + "learning_rate": 2.5092964955147518e-05, + "loss": 3.5661, + "step": 32700 + }, + { + "epoch": 3.3624588815789473, + "grad_norm": 0.8479274895904422, + "learning_rate": 2.5072077824610004e-05, + "loss": 3.5259, + "step": 32710 + }, + { + "epoch": 3.3634868421052633, + "grad_norm": 0.6780127137913519, + "learning_rate": 2.5051197588121352e-05, + "loss": 3.4874, + "step": 32720 + }, + { + "epoch": 3.3645148026315788, + "grad_norm": 1.114782505718971, + "learning_rate": 2.5030324253941266e-05, + "loss": 3.4674, + "step": 32730 + }, + { + "epoch": 3.3655427631578947, + "grad_norm": 1.0589677776301023, + "learning_rate": 2.5009457830326724e-05, + "loss": 3.4737, + "step": 32740 + }, + { + "epoch": 3.3665707236842106, + "grad_norm": 0.8366159555814626, + "learning_rate": 2.498859832553195e-05, + "loss": 3.5062, + "step": 32750 + }, + { + "epoch": 3.3675986842105265, + "grad_norm": 0.8083627343156382, + "learning_rate": 2.4967745747808446e-05, + "loss": 3.5236, + "step": 32760 + }, + { + "epoch": 3.368626644736842, + "grad_norm": 1.0392366421125296, + "learning_rate": 2.4946900105404987e-05, + "loss": 3.5434, + "step": 32770 + }, + { + "epoch": 3.369654605263158, + "grad_norm": 1.0569749116509617, + "learning_rate": 2.4926061406567536e-05, + "loss": 3.4758, + "step": 32780 + }, + { + "epoch": 3.3706825657894735, + "grad_norm": 1.3469083175609677, + "learning_rate": 2.4905229659539405e-05, + "loss": 3.5107, + "step": 32790 + }, + { + "epoch": 3.3717105263157894, + "grad_norm": 0.643282657720894, + "learning_rate": 2.4884404872561107e-05, + "loss": 3.4534, + "step": 32800 + }, + { + "epoch": 3.3727384868421053, + "grad_norm": 0.7429411825847906, + "learning_rate": 2.486358705387038e-05, + "loss": 3.5534, + "step": 32810 + }, + { + "epoch": 3.3737664473684212, + "grad_norm": 0.8254889513150895, + "learning_rate": 2.4842776211702234e-05, + "loss": 3.5147, + "step": 32820 + }, + { + "epoch": 3.3747944078947367, + "grad_norm": 0.8967860094640623, + "learning_rate": 2.4821972354288956e-05, + "loss": 3.6376, + "step": 32830 + }, + { + "epoch": 3.3758223684210527, + "grad_norm": 0.7621951838470061, + "learning_rate": 2.4801175489859984e-05, + "loss": 3.465, + "step": 32840 + }, + { + "epoch": 3.3768503289473686, + "grad_norm": 0.8631608283515801, + "learning_rate": 2.478038562664206e-05, + "loss": 3.4576, + "step": 32850 + }, + { + "epoch": 3.377878289473684, + "grad_norm": 
0.9057030836184474, + "learning_rate": 2.475960277285912e-05, + "loss": 3.5629, + "step": 32860 + }, + { + "epoch": 3.37890625, + "grad_norm": 1.1337337562549779, + "learning_rate": 2.4738826936732343e-05, + "loss": 3.5027, + "step": 32870 + }, + { + "epoch": 3.379934210526316, + "grad_norm": 0.8174150795532441, + "learning_rate": 2.4718058126480137e-05, + "loss": 3.5377, + "step": 32880 + }, + { + "epoch": 3.3809621710526314, + "grad_norm": 1.0231015767915768, + "learning_rate": 2.469729635031814e-05, + "loss": 3.4687, + "step": 32890 + }, + { + "epoch": 3.3819901315789473, + "grad_norm": 0.8722830552419434, + "learning_rate": 2.4676541616459126e-05, + "loss": 3.5666, + "step": 32900 + }, + { + "epoch": 3.3830180921052633, + "grad_norm": 0.7639241692073456, + "learning_rate": 2.4655793933113205e-05, + "loss": 3.5562, + "step": 32910 + }, + { + "epoch": 3.3840460526315788, + "grad_norm": 0.8721561880856938, + "learning_rate": 2.4635053308487636e-05, + "loss": 3.4957, + "step": 32920 + }, + { + "epoch": 3.3850740131578947, + "grad_norm": 0.6887347231103581, + "learning_rate": 2.4614319750786865e-05, + "loss": 3.5818, + "step": 32930 + }, + { + "epoch": 3.3861019736842106, + "grad_norm": 0.894981204480202, + "learning_rate": 2.4593593268212575e-05, + "loss": 3.566, + "step": 32940 + }, + { + "epoch": 3.3871299342105265, + "grad_norm": 0.847239202753078, + "learning_rate": 2.4572873868963653e-05, + "loss": 3.5622, + "step": 32950 + }, + { + "epoch": 3.388157894736842, + "grad_norm": 1.2188912334282427, + "learning_rate": 2.4552161561236165e-05, + "loss": 3.4487, + "step": 32960 + }, + { + "epoch": 3.389185855263158, + "grad_norm": 0.9630724133710307, + "learning_rate": 2.453145635322338e-05, + "loss": 3.5302, + "step": 32970 + }, + { + "epoch": 3.3902138157894735, + "grad_norm": 0.9644253707853782, + "learning_rate": 2.4510758253115777e-05, + "loss": 3.49, + "step": 32980 + }, + { + "epoch": 3.3912417763157894, + "grad_norm": 0.9329613175524942, + "learning_rate": 2.4490067269100966e-05, + "loss": 3.5105, + "step": 32990 + }, + { + "epoch": 3.3922697368421053, + "grad_norm": 0.7491678946601328, + "learning_rate": 2.4469383409363817e-05, + "loss": 3.5304, + "step": 33000 + }, + { + "epoch": 3.3932976973684212, + "grad_norm": 0.838225504505047, + "learning_rate": 2.444870668208635e-05, + "loss": 3.5149, + "step": 33010 + }, + { + "epoch": 3.3943256578947367, + "grad_norm": 0.7954187363453937, + "learning_rate": 2.442803709544773e-05, + "loss": 3.5088, + "step": 33020 + }, + { + "epoch": 3.3953536184210527, + "grad_norm": 0.7905827220725266, + "learning_rate": 2.4407374657624327e-05, + "loss": 3.5544, + "step": 33030 + }, + { + "epoch": 3.3963815789473686, + "grad_norm": 0.6689252196030053, + "learning_rate": 2.438671937678973e-05, + "loss": 3.4535, + "step": 33040 + }, + { + "epoch": 3.397409539473684, + "grad_norm": 1.2641213986852742, + "learning_rate": 2.4366071261114594e-05, + "loss": 3.599, + "step": 33050 + }, + { + "epoch": 3.3984375, + "grad_norm": 0.9448562025230877, + "learning_rate": 2.4345430318766824e-05, + "loss": 3.5157, + "step": 33060 + }, + { + "epoch": 3.399465460526316, + "grad_norm": 0.6448312003650162, + "learning_rate": 2.432479655791145e-05, + "loss": 3.4935, + "step": 33070 + }, + { + "epoch": 3.4004934210526314, + "grad_norm": 0.8754772860022222, + "learning_rate": 2.4304169986710675e-05, + "loss": 3.5138, + "step": 33080 + }, + { + "epoch": 3.4015213815789473, + "grad_norm": 0.8665641400018712, + "learning_rate": 2.4283550613323854e-05, + "loss": 3.5389, + "step": 
33090 + }, + { + "epoch": 3.4025493421052633, + "grad_norm": 1.5413416528482755, + "learning_rate": 2.4262938445907507e-05, + "loss": 3.5311, + "step": 33100 + }, + { + "epoch": 3.4035773026315788, + "grad_norm": 0.8875657124436701, + "learning_rate": 2.424233349261525e-05, + "loss": 3.513, + "step": 33110 + }, + { + "epoch": 3.4046052631578947, + "grad_norm": 0.6313168310914337, + "learning_rate": 2.4221735761597926e-05, + "loss": 3.4872, + "step": 33120 + }, + { + "epoch": 3.4056332236842106, + "grad_norm": 0.6557525384165199, + "learning_rate": 2.4201145261003473e-05, + "loss": 3.5705, + "step": 33130 + }, + { + "epoch": 3.4066611842105265, + "grad_norm": 1.091739059309564, + "learning_rate": 2.4180561998976963e-05, + "loss": 3.4749, + "step": 33140 + }, + { + "epoch": 3.407689144736842, + "grad_norm": 0.644984600267727, + "learning_rate": 2.415998598366061e-05, + "loss": 3.523, + "step": 33150 + }, + { + "epoch": 3.408717105263158, + "grad_norm": 0.6723127048283246, + "learning_rate": 2.4139417223193813e-05, + "loss": 3.5164, + "step": 33160 + }, + { + "epoch": 3.4097450657894735, + "grad_norm": 0.9141195230548148, + "learning_rate": 2.4118855725713015e-05, + "loss": 3.4615, + "step": 33170 + }, + { + "epoch": 3.4107730263157894, + "grad_norm": 1.122223394590548, + "learning_rate": 2.409830149935184e-05, + "loss": 3.4365, + "step": 33180 + }, + { + "epoch": 3.4118009868421053, + "grad_norm": 1.3346342455851061, + "learning_rate": 2.4077754552241042e-05, + "loss": 3.4489, + "step": 33190 + }, + { + "epoch": 3.4128289473684212, + "grad_norm": 0.5385033015218135, + "learning_rate": 2.4057214892508434e-05, + "loss": 3.5368, + "step": 33200 + }, + { + "epoch": 3.4138569078947367, + "grad_norm": 1.2009658760889463, + "learning_rate": 2.4036682528279037e-05, + "loss": 3.4607, + "step": 33210 + }, + { + "epoch": 3.4148848684210527, + "grad_norm": 0.9608548136546327, + "learning_rate": 2.401615746767494e-05, + "loss": 3.4387, + "step": 33220 + }, + { + "epoch": 3.4159128289473686, + "grad_norm": 0.9799848175332476, + "learning_rate": 2.3995639718815308e-05, + "loss": 3.4519, + "step": 33230 + }, + { + "epoch": 3.416940789473684, + "grad_norm": 1.0537174991831144, + "learning_rate": 2.3975129289816436e-05, + "loss": 3.5266, + "step": 33240 + }, + { + "epoch": 3.41796875, + "grad_norm": 1.53703887661454, + "learning_rate": 2.3954626188791803e-05, + "loss": 3.4934, + "step": 33250 + }, + { + "epoch": 3.418996710526316, + "grad_norm": 0.736320475328612, + "learning_rate": 2.393413042385187e-05, + "loss": 3.4547, + "step": 33260 + }, + { + "epoch": 3.4200246710526314, + "grad_norm": 0.852810286717882, + "learning_rate": 2.3913642003104256e-05, + "loss": 3.5249, + "step": 33270 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 0.8057557579802964, + "learning_rate": 2.389316093465366e-05, + "loss": 3.5727, + "step": 33280 + }, + { + "epoch": 3.4220805921052633, + "grad_norm": 0.9170851330275138, + "learning_rate": 2.3872687226601903e-05, + "loss": 3.432, + "step": 33290 + }, + { + "epoch": 3.4231085526315788, + "grad_norm": 0.8804645203130853, + "learning_rate": 2.3852220887047847e-05, + "loss": 3.5587, + "step": 33300 + }, + { + "epoch": 3.4241365131578947, + "grad_norm": 0.7408737199571782, + "learning_rate": 2.3831761924087488e-05, + "loss": 3.5702, + "step": 33310 + }, + { + "epoch": 3.4251644736842106, + "grad_norm": 1.31409797575211, + "learning_rate": 2.3811310345813833e-05, + "loss": 3.5413, + "step": 33320 + }, + { + "epoch": 3.4261924342105265, + "grad_norm": 0.6967910305461988, + 
"learning_rate": 2.3790866160317055e-05, + "loss": 3.482, + "step": 33330 + }, + { + "epoch": 3.427220394736842, + "grad_norm": 0.7960675580598621, + "learning_rate": 2.377042937568437e-05, + "loss": 3.5868, + "step": 33340 + }, + { + "epoch": 3.428248355263158, + "grad_norm": 0.7985124034049634, + "learning_rate": 2.375000000000001e-05, + "loss": 3.5512, + "step": 33350 + }, + { + "epoch": 3.4292763157894735, + "grad_norm": 0.7557014083409005, + "learning_rate": 2.372957804134534e-05, + "loss": 3.4713, + "step": 33360 + }, + { + "epoch": 3.4303042763157894, + "grad_norm": 0.5809059786234936, + "learning_rate": 2.3709163507798812e-05, + "loss": 3.5193, + "step": 33370 + }, + { + "epoch": 3.4313322368421053, + "grad_norm": 0.8526264858101478, + "learning_rate": 2.3688756407435864e-05, + "loss": 3.5076, + "step": 33380 + }, + { + "epoch": 3.4323601973684212, + "grad_norm": 0.750276633743562, + "learning_rate": 2.3668356748329036e-05, + "loss": 3.5252, + "step": 33390 + }, + { + "epoch": 3.4333881578947367, + "grad_norm": 0.7168704626576674, + "learning_rate": 2.3647964538547942e-05, + "loss": 3.54, + "step": 33400 + }, + { + "epoch": 3.4344161184210527, + "grad_norm": 0.7073497160366505, + "learning_rate": 2.3627579786159203e-05, + "loss": 3.4653, + "step": 33410 + }, + { + "epoch": 3.4354440789473686, + "grad_norm": 0.7787170930403381, + "learning_rate": 2.360720249922654e-05, + "loss": 3.5034, + "step": 33420 + }, + { + "epoch": 3.436472039473684, + "grad_norm": 1.0428105495069975, + "learning_rate": 2.3586832685810692e-05, + "loss": 3.4862, + "step": 33430 + }, + { + "epoch": 3.4375, + "grad_norm": 1.5790752156264865, + "learning_rate": 2.356647035396943e-05, + "loss": 3.4831, + "step": 33440 + }, + { + "epoch": 3.438527960526316, + "grad_norm": 0.97443481594487, + "learning_rate": 2.354611551175758e-05, + "loss": 3.6203, + "step": 33450 + }, + { + "epoch": 3.4395559210526314, + "grad_norm": 0.83503760270293, + "learning_rate": 2.3525768167227038e-05, + "loss": 3.4394, + "step": 33460 + }, + { + "epoch": 3.4405838815789473, + "grad_norm": 0.7734794990619048, + "learning_rate": 2.3505428328426678e-05, + "loss": 3.5402, + "step": 33470 + }, + { + "epoch": 3.4416118421052633, + "grad_norm": 0.8276691261630849, + "learning_rate": 2.3485096003402435e-05, + "loss": 3.5405, + "step": 33480 + }, + { + "epoch": 3.4426398026315788, + "grad_norm": 0.9522300959074494, + "learning_rate": 2.346477120019727e-05, + "loss": 3.4725, + "step": 33490 + }, + { + "epoch": 3.4436677631578947, + "grad_norm": 0.9535793409738728, + "learning_rate": 2.3444453926851156e-05, + "loss": 3.5346, + "step": 33500 + }, + { + "epoch": 3.4446957236842106, + "grad_norm": 0.7633544964820388, + "learning_rate": 2.342414419140111e-05, + "loss": 3.5951, + "step": 33510 + }, + { + "epoch": 3.4457236842105265, + "grad_norm": 0.9716215029984457, + "learning_rate": 2.3403842001881158e-05, + "loss": 3.5428, + "step": 33520 + }, + { + "epoch": 3.446751644736842, + "grad_norm": 0.791028924524095, + "learning_rate": 2.3383547366322298e-05, + "loss": 3.5301, + "step": 33530 + }, + { + "epoch": 3.447779605263158, + "grad_norm": 0.8860851494471456, + "learning_rate": 2.336326029275262e-05, + "loss": 3.4931, + "step": 33540 + }, + { + "epoch": 3.4488075657894735, + "grad_norm": 0.8044167961400553, + "learning_rate": 2.334298078919719e-05, + "loss": 3.509, + "step": 33550 + }, + { + "epoch": 3.4498355263157894, + "grad_norm": 0.8532958631621093, + "learning_rate": 2.332270886367803e-05, + "loss": 3.4986, + "step": 33560 + }, + { + "epoch": 
3.4508634868421053, + "grad_norm": 0.9303644104180283, + "learning_rate": 2.330244452421421e-05, + "loss": 3.5377, + "step": 33570 + }, + { + "epoch": 3.4518914473684212, + "grad_norm": 0.7013936661619482, + "learning_rate": 2.3282187778821845e-05, + "loss": 3.4516, + "step": 33580 + }, + { + "epoch": 3.4529194078947367, + "grad_norm": 1.058693574935371, + "learning_rate": 2.3261938635513942e-05, + "loss": 3.5669, + "step": 33590 + }, + { + "epoch": 3.4539473684210527, + "grad_norm": 1.0417654689979627, + "learning_rate": 2.3241697102300567e-05, + "loss": 3.4943, + "step": 33600 + }, + { + "epoch": 3.4549753289473686, + "grad_norm": 0.8279278358534157, + "learning_rate": 2.3221463187188772e-05, + "loss": 3.4765, + "step": 33610 + }, + { + "epoch": 3.456003289473684, + "grad_norm": 0.8092179400080346, + "learning_rate": 2.3201236898182583e-05, + "loss": 3.4442, + "step": 33620 + }, + { + "epoch": 3.45703125, + "grad_norm": 0.6241203077702594, + "learning_rate": 2.3181018243283014e-05, + "loss": 3.5109, + "step": 33630 + }, + { + "epoch": 3.458059210526316, + "grad_norm": 1.0846338474343233, + "learning_rate": 2.3160807230488066e-05, + "loss": 3.5472, + "step": 33640 + }, + { + "epoch": 3.4590871710526314, + "grad_norm": 0.9080900716660845, + "learning_rate": 2.314060386779267e-05, + "loss": 3.5194, + "step": 33650 + }, + { + "epoch": 3.4601151315789473, + "grad_norm": 0.6524286500425756, + "learning_rate": 2.3120408163188817e-05, + "loss": 3.5091, + "step": 33660 + }, + { + "epoch": 3.4611430921052633, + "grad_norm": 0.6439015173142221, + "learning_rate": 2.310022012466541e-05, + "loss": 3.5746, + "step": 33670 + }, + { + "epoch": 3.4621710526315788, + "grad_norm": 1.2027601704521302, + "learning_rate": 2.308003976020831e-05, + "loss": 3.5091, + "step": 33680 + }, + { + "epoch": 3.4631990131578947, + "grad_norm": 0.9646533979067075, + "learning_rate": 2.305986707780036e-05, + "loss": 3.4662, + "step": 33690 + }, + { + "epoch": 3.4642269736842106, + "grad_norm": 1.2219262427002242, + "learning_rate": 2.303970208542141e-05, + "loss": 3.5171, + "step": 33700 + }, + { + "epoch": 3.4652549342105265, + "grad_norm": 0.7008825146030909, + "learning_rate": 2.3019544791048195e-05, + "loss": 3.5727, + "step": 33710 + }, + { + "epoch": 3.466282894736842, + "grad_norm": 0.7177555834567314, + "learning_rate": 2.2999395202654427e-05, + "loss": 3.5104, + "step": 33720 + }, + { + "epoch": 3.467310855263158, + "grad_norm": 0.8222008178977823, + "learning_rate": 2.2979253328210803e-05, + "loss": 3.5927, + "step": 33730 + }, + { + "epoch": 3.4683388157894735, + "grad_norm": 1.0418780048878524, + "learning_rate": 2.29591191756849e-05, + "loss": 3.57, + "step": 33740 + }, + { + "epoch": 3.4693667763157894, + "grad_norm": 0.73928268572185, + "learning_rate": 2.2938992753041336e-05, + "loss": 3.4863, + "step": 33750 + }, + { + "epoch": 3.4703947368421053, + "grad_norm": 1.2602311094957228, + "learning_rate": 2.2918874068241603e-05, + "loss": 3.4744, + "step": 33760 + }, + { + "epoch": 3.4714226973684212, + "grad_norm": 0.6778588524213576, + "learning_rate": 2.2898763129244123e-05, + "loss": 3.5472, + "step": 33770 + }, + { + "epoch": 3.4724506578947367, + "grad_norm": 0.8337737628232729, + "learning_rate": 2.2878659944004287e-05, + "loss": 3.5223, + "step": 33780 + }, + { + "epoch": 3.4734786184210527, + "grad_norm": 0.775517912057089, + "learning_rate": 2.2858564520474455e-05, + "loss": 3.4867, + "step": 33790 + }, + { + "epoch": 3.4745065789473686, + "grad_norm": 0.6340603240583524, + "learning_rate": 
2.283847686660382e-05, + "loss": 3.4836, + "step": 33800 + }, + { + "epoch": 3.475534539473684, + "grad_norm": 0.8402897205152124, + "learning_rate": 2.281839699033857e-05, + "loss": 3.4785, + "step": 33810 + }, + { + "epoch": 3.4765625, + "grad_norm": 0.6515803867510408, + "learning_rate": 2.27983248996218e-05, + "loss": 3.506, + "step": 33820 + }, + { + "epoch": 3.477590460526316, + "grad_norm": 0.6392320748383098, + "learning_rate": 2.2778260602393534e-05, + "loss": 3.4273, + "step": 33830 + }, + { + "epoch": 3.4786184210526314, + "grad_norm": 1.039992468421514, + "learning_rate": 2.27582041065907e-05, + "loss": 3.5743, + "step": 33840 + }, + { + "epoch": 3.4796463815789473, + "grad_norm": 0.9757273650733918, + "learning_rate": 2.273815542014716e-05, + "loss": 3.534, + "step": 33850 + }, + { + "epoch": 3.4806743421052633, + "grad_norm": 0.6921625472968159, + "learning_rate": 2.2718114550993614e-05, + "loss": 3.4947, + "step": 33860 + }, + { + "epoch": 3.4817023026315788, + "grad_norm": 0.748780053097617, + "learning_rate": 2.2698081507057795e-05, + "loss": 3.523, + "step": 33870 + }, + { + "epoch": 3.4827302631578947, + "grad_norm": 0.7754190665537084, + "learning_rate": 2.267805629626425e-05, + "loss": 3.5599, + "step": 33880 + }, + { + "epoch": 3.4837582236842106, + "grad_norm": 0.7388730780225269, + "learning_rate": 2.2658038926534443e-05, + "loss": 3.5076, + "step": 33890 + }, + { + "epoch": 3.4847861842105265, + "grad_norm": 1.0863227435338434, + "learning_rate": 2.2638029405786724e-05, + "loss": 3.4809, + "step": 33900 + }, + { + "epoch": 3.485814144736842, + "grad_norm": 0.7945187457965861, + "learning_rate": 2.2618027741936413e-05, + "loss": 3.5687, + "step": 33910 + }, + { + "epoch": 3.486842105263158, + "grad_norm": 0.6734026906516304, + "learning_rate": 2.259803394289562e-05, + "loss": 3.5313, + "step": 33920 + }, + { + "epoch": 3.4878700657894735, + "grad_norm": 0.7405444781476601, + "learning_rate": 2.25780480165734e-05, + "loss": 3.5487, + "step": 33930 + }, + { + "epoch": 3.4888980263157894, + "grad_norm": 0.6683485501793394, + "learning_rate": 2.2558069970875688e-05, + "loss": 3.4603, + "step": 33940 + }, + { + "epoch": 3.4899259868421053, + "grad_norm": 0.6930589481730253, + "learning_rate": 2.2538099813705302e-05, + "loss": 3.5149, + "step": 33950 + }, + { + "epoch": 3.4909539473684212, + "grad_norm": 0.8803086007397155, + "learning_rate": 2.2518137552961927e-05, + "loss": 3.5274, + "step": 33960 + }, + { + "epoch": 3.4919819078947367, + "grad_norm": 1.180523052266786, + "learning_rate": 2.2498183196542155e-05, + "loss": 3.4816, + "step": 33970 + }, + { + "epoch": 3.4930098684210527, + "grad_norm": 0.8559861382226617, + "learning_rate": 2.247823675233939e-05, + "loss": 3.5407, + "step": 33980 + }, + { + "epoch": 3.4940378289473686, + "grad_norm": 0.6950754340146413, + "learning_rate": 2.2458298228243957e-05, + "loss": 3.4685, + "step": 33990 + }, + { + "epoch": 3.495065789473684, + "grad_norm": 0.6699730767349178, + "learning_rate": 2.2438367632143075e-05, + "loss": 3.5859, + "step": 34000 + }, + { + "epoch": 3.49609375, + "grad_norm": 0.7017546970331189, + "learning_rate": 2.2418444971920738e-05, + "loss": 3.5295, + "step": 34010 + }, + { + "epoch": 3.497121710526316, + "grad_norm": 1.1381866311025002, + "learning_rate": 2.239853025545787e-05, + "loss": 3.5215, + "step": 34020 + }, + { + "epoch": 3.4981496710526314, + "grad_norm": 1.1376893399454282, + "learning_rate": 2.2378623490632236e-05, + "loss": 3.4612, + "step": 34030 + }, + { + "epoch": 3.4991776315789473, + 
"grad_norm": 0.7412527924431989, + "learning_rate": 2.2358724685318453e-05, + "loss": 3.4462, + "step": 34040 + }, + { + "epoch": 3.5002055921052633, + "grad_norm": 0.8608420940506074, + "learning_rate": 2.233883384738798e-05, + "loss": 3.5353, + "step": 34050 + }, + { + "epoch": 3.5012335526315788, + "grad_norm": 0.8873293954447554, + "learning_rate": 2.2318950984709158e-05, + "loss": 3.5245, + "step": 34060 + }, + { + "epoch": 3.5022615131578947, + "grad_norm": 0.6736221662140739, + "learning_rate": 2.2299076105147108e-05, + "loss": 3.4698, + "step": 34070 + }, + { + "epoch": 3.5032894736842106, + "grad_norm": 1.124906642007154, + "learning_rate": 2.227920921656386e-05, + "loss": 3.5394, + "step": 34080 + }, + { + "epoch": 3.5043174342105265, + "grad_norm": 0.8402379020357967, + "learning_rate": 2.225935032681828e-05, + "loss": 3.5647, + "step": 34090 + }, + { + "epoch": 3.505345394736842, + "grad_norm": 0.6718142060565073, + "learning_rate": 2.2239499443765996e-05, + "loss": 3.4604, + "step": 34100 + }, + { + "epoch": 3.506373355263158, + "grad_norm": 1.4739356516608235, + "learning_rate": 2.221965657525954e-05, + "loss": 3.4748, + "step": 34110 + }, + { + "epoch": 3.5074013157894735, + "grad_norm": 1.0916452592590156, + "learning_rate": 2.2199821729148273e-05, + "loss": 3.5377, + "step": 34120 + }, + { + "epoch": 3.5084292763157894, + "grad_norm": 0.7444766429877742, + "learning_rate": 2.2179994913278345e-05, + "loss": 3.4794, + "step": 34130 + }, + { + "epoch": 3.5094572368421053, + "grad_norm": 0.70607211987092, + "learning_rate": 2.2160176135492738e-05, + "loss": 3.4482, + "step": 34140 + }, + { + "epoch": 3.5104851973684212, + "grad_norm": 0.6441385864817771, + "learning_rate": 2.2140365403631277e-05, + "loss": 3.4697, + "step": 34150 + }, + { + "epoch": 3.5115131578947367, + "grad_norm": 0.6704527861508884, + "learning_rate": 2.212056272553059e-05, + "loss": 3.6309, + "step": 34160 + }, + { + "epoch": 3.5125411184210527, + "grad_norm": 0.6195759250845301, + "learning_rate": 2.2100768109024117e-05, + "loss": 3.5782, + "step": 34170 + }, + { + "epoch": 3.5135690789473686, + "grad_norm": 1.2796858101671773, + "learning_rate": 2.2080981561942124e-05, + "loss": 3.5889, + "step": 34180 + }, + { + "epoch": 3.514597039473684, + "grad_norm": 0.943163770532237, + "learning_rate": 2.206120309211163e-05, + "loss": 3.4942, + "step": 34190 + }, + { + "epoch": 3.515625, + "grad_norm": 0.7195296412811354, + "learning_rate": 2.2041432707356553e-05, + "loss": 3.5439, + "step": 34200 + }, + { + "epoch": 3.516652960526316, + "grad_norm": 0.7034831936068454, + "learning_rate": 2.2021670415497553e-05, + "loss": 3.5222, + "step": 34210 + }, + { + "epoch": 3.5176809210526314, + "grad_norm": 0.7600369005907226, + "learning_rate": 2.2001916224352066e-05, + "loss": 3.4953, + "step": 34220 + }, + { + "epoch": 3.5187088815789473, + "grad_norm": 0.8336749250935135, + "learning_rate": 2.198217014173436e-05, + "loss": 3.5446, + "step": 34230 + }, + { + "epoch": 3.5197368421052633, + "grad_norm": 0.737213088655113, + "learning_rate": 2.1962432175455532e-05, + "loss": 3.5132, + "step": 34240 + }, + { + "epoch": 3.5207648026315788, + "grad_norm": 1.3352123768785358, + "learning_rate": 2.194270233332339e-05, + "loss": 3.4762, + "step": 34250 + }, + { + "epoch": 3.5217927631578947, + "grad_norm": 1.7427344725161567, + "learning_rate": 2.192298062314256e-05, + "loss": 3.4677, + "step": 34260 + }, + { + "epoch": 3.5228207236842106, + "grad_norm": 0.9718077302080051, + "learning_rate": 2.1903267052714493e-05, + "loss": 
3.5574, + "step": 34270 + }, + { + "epoch": 3.5238486842105265, + "grad_norm": 0.7364613541501117, + "learning_rate": 2.1883561629837326e-05, + "loss": 3.5653, + "step": 34280 + }, + { + "epoch": 3.524876644736842, + "grad_norm": 1.0125379154971008, + "learning_rate": 2.1863864362306075e-05, + "loss": 3.5594, + "step": 34290 + }, + { + "epoch": 3.525904605263158, + "grad_norm": 0.7791724619590398, + "learning_rate": 2.1844175257912488e-05, + "loss": 3.5141, + "step": 34300 + }, + { + "epoch": 3.5269325657894735, + "grad_norm": 0.7647662114709977, + "learning_rate": 2.1824494324445055e-05, + "loss": 3.5172, + "step": 34310 + }, + { + "epoch": 3.5279605263157894, + "grad_norm": 0.6322684834193074, + "learning_rate": 2.1804821569689047e-05, + "loss": 3.4484, + "step": 34320 + }, + { + "epoch": 3.5289884868421053, + "grad_norm": 0.8334109190745306, + "learning_rate": 2.1785157001426565e-05, + "loss": 3.5191, + "step": 34330 + }, + { + "epoch": 3.5300164473684212, + "grad_norm": 1.0036634455852667, + "learning_rate": 2.1765500627436376e-05, + "loss": 3.485, + "step": 34340 + }, + { + "epoch": 3.5310444078947367, + "grad_norm": 0.8049719964955672, + "learning_rate": 2.1745852455494065e-05, + "loss": 3.4973, + "step": 34350 + }, + { + "epoch": 3.5320723684210527, + "grad_norm": 0.7977594441494725, + "learning_rate": 2.1726212493371946e-05, + "loss": 3.4857, + "step": 34360 + }, + { + "epoch": 3.5331003289473686, + "grad_norm": 0.7810223820027631, + "learning_rate": 2.1706580748839103e-05, + "loss": 3.5256, + "step": 34370 + }, + { + "epoch": 3.534128289473684, + "grad_norm": 0.7156914442775028, + "learning_rate": 2.1686957229661366e-05, + "loss": 3.5094, + "step": 34380 + }, + { + "epoch": 3.53515625, + "grad_norm": 0.5883109532288007, + "learning_rate": 2.1667341943601307e-05, + "loss": 3.5186, + "step": 34390 + }, + { + "epoch": 3.536184210526316, + "grad_norm": 1.6346432584224326, + "learning_rate": 2.1647734898418217e-05, + "loss": 3.5254, + "step": 34400 + }, + { + "epoch": 3.5372121710526314, + "grad_norm": 0.9154826812022709, + "learning_rate": 2.1628136101868176e-05, + "loss": 3.5338, + "step": 34410 + }, + { + "epoch": 3.5382401315789473, + "grad_norm": 0.8267406552862016, + "learning_rate": 2.1608545561703988e-05, + "loss": 3.4537, + "step": 34420 + }, + { + "epoch": 3.5392680921052633, + "grad_norm": 0.5715653753860086, + "learning_rate": 2.158896328567514e-05, + "loss": 3.4898, + "step": 34430 + }, + { + "epoch": 3.5402960526315788, + "grad_norm": 0.7611548268122844, + "learning_rate": 2.156938928152791e-05, + "loss": 3.5032, + "step": 34440 + }, + { + "epoch": 3.5413240131578947, + "grad_norm": 1.0080566110608649, + "learning_rate": 2.1549823557005306e-05, + "loss": 3.4735, + "step": 34450 + }, + { + "epoch": 3.5423519736842106, + "grad_norm": 0.66826114612707, + "learning_rate": 2.1530266119847e-05, + "loss": 3.5192, + "step": 34460 + }, + { + "epoch": 3.5433799342105265, + "grad_norm": 1.0811780188971878, + "learning_rate": 2.151071697778944e-05, + "loss": 3.5484, + "step": 34470 + }, + { + "epoch": 3.544407894736842, + "grad_norm": 1.0743769858598802, + "learning_rate": 2.1491176138565774e-05, + "loss": 3.4629, + "step": 34480 + }, + { + "epoch": 3.545435855263158, + "grad_norm": 0.9044806209278523, + "learning_rate": 2.147164360990587e-05, + "loss": 3.5544, + "step": 34490 + }, + { + "epoch": 3.5464638157894735, + "grad_norm": 0.8574609769277479, + "learning_rate": 2.145211939953631e-05, + "loss": 3.5737, + "step": 34500 + }, + { + "epoch": 3.5474917763157894, + "grad_norm": 
0.7231118905108117, + "learning_rate": 2.1432603515180387e-05, + "loss": 3.5274, + "step": 34510 + }, + { + "epoch": 3.5485197368421053, + "grad_norm": 1.0512229813450016, + "learning_rate": 2.1413095964558074e-05, + "loss": 3.4882, + "step": 34520 + }, + { + "epoch": 3.5495476973684212, + "grad_norm": 0.8113723412239355, + "learning_rate": 2.1393596755386063e-05, + "loss": 3.5878, + "step": 34530 + }, + { + "epoch": 3.5505756578947367, + "grad_norm": 0.676868005894307, + "learning_rate": 2.1374105895377806e-05, + "loss": 3.5493, + "step": 34540 + }, + { + "epoch": 3.5516036184210527, + "grad_norm": 0.6330074433887962, + "learning_rate": 2.1354623392243336e-05, + "loss": 3.5601, + "step": 34550 + }, + { + "epoch": 3.5526315789473686, + "grad_norm": 0.716003186439904, + "learning_rate": 2.1335149253689477e-05, + "loss": 3.5733, + "step": 34560 + }, + { + "epoch": 3.553659539473684, + "grad_norm": 0.7688472050564098, + "learning_rate": 2.1315683487419702e-05, + "loss": 3.4644, + "step": 34570 + }, + { + "epoch": 3.5546875, + "grad_norm": 0.8292654210296438, + "learning_rate": 2.1296226101134178e-05, + "loss": 3.6022, + "step": 34580 + }, + { + "epoch": 3.555715460526316, + "grad_norm": 0.7358944372063031, + "learning_rate": 2.1276777102529757e-05, + "loss": 3.5229, + "step": 34590 + }, + { + "epoch": 3.5567434210526314, + "grad_norm": 0.7108914706242343, + "learning_rate": 2.1257336499299992e-05, + "loss": 3.5866, + "step": 34600 + }, + { + "epoch": 3.5577713815789473, + "grad_norm": 0.974592648288093, + "learning_rate": 2.1237904299135065e-05, + "loss": 3.5737, + "step": 34610 + }, + { + "epoch": 3.5587993421052633, + "grad_norm": 0.8222998387700855, + "learning_rate": 2.1218480509721892e-05, + "loss": 3.5357, + "step": 34620 + }, + { + "epoch": 3.5598273026315788, + "grad_norm": 0.8068111507290128, + "learning_rate": 2.1199065138744047e-05, + "loss": 3.4725, + "step": 34630 + }, + { + "epoch": 3.5608552631578947, + "grad_norm": 0.7765012613324338, + "learning_rate": 2.1179658193881733e-05, + "loss": 3.5592, + "step": 34640 + }, + { + "epoch": 3.5618832236842106, + "grad_norm": 0.8497151977581535, + "learning_rate": 2.1160259682811855e-05, + "loss": 3.4479, + "step": 34650 + }, + { + "epoch": 3.5629111842105265, + "grad_norm": 0.749664986810234, + "learning_rate": 2.1140869613208013e-05, + "loss": 3.4938, + "step": 34660 + }, + { + "epoch": 3.563939144736842, + "grad_norm": 0.927877475343116, + "learning_rate": 2.1121487992740394e-05, + "loss": 3.4641, + "step": 34670 + }, + { + "epoch": 3.564967105263158, + "grad_norm": 0.6569023522980504, + "learning_rate": 2.1102114829075894e-05, + "loss": 3.4903, + "step": 34680 + }, + { + "epoch": 3.5659950657894735, + "grad_norm": 0.8892634508275499, + "learning_rate": 2.1082750129878046e-05, + "loss": 3.4703, + "step": 34690 + }, + { + "epoch": 3.5670230263157894, + "grad_norm": 0.873305143861114, + "learning_rate": 2.106339390280705e-05, + "loss": 3.5163, + "step": 34700 + }, + { + "epoch": 3.5680509868421053, + "grad_norm": 0.7354288453200473, + "learning_rate": 2.1044046155519738e-05, + "loss": 3.4855, + "step": 34710 + }, + { + "epoch": 3.5690789473684212, + "grad_norm": 0.849049701727598, + "learning_rate": 2.1024706895669605e-05, + "loss": 3.5279, + "step": 34720 + }, + { + "epoch": 3.5701069078947367, + "grad_norm": 0.666017414884875, + "learning_rate": 2.1005376130906734e-05, + "loss": 3.4947, + "step": 34730 + }, + { + "epoch": 3.5711348684210527, + "grad_norm": 1.0129638433565598, + "learning_rate": 2.0986053868877948e-05, + "loss": 3.5757, 
+ "step": 34740 + }, + { + "epoch": 3.5721628289473686, + "grad_norm": 0.7410847552704911, + "learning_rate": 2.0966740117226632e-05, + "loss": 3.5699, + "step": 34750 + }, + { + "epoch": 3.573190789473684, + "grad_norm": 0.6729440611665317, + "learning_rate": 2.094743488359279e-05, + "loss": 3.5622, + "step": 34760 + }, + { + "epoch": 3.57421875, + "grad_norm": 0.9386131353965783, + "learning_rate": 2.0928138175613102e-05, + "loss": 3.4709, + "step": 34770 + }, + { + "epoch": 3.575246710526316, + "grad_norm": 0.7046121764262173, + "learning_rate": 2.0908850000920903e-05, + "loss": 3.4812, + "step": 34780 + }, + { + "epoch": 3.5762746710526314, + "grad_norm": 1.0352097983275395, + "learning_rate": 2.0889570367146064e-05, + "loss": 3.5976, + "step": 34790 + }, + { + "epoch": 3.5773026315789473, + "grad_norm": 0.941191874427998, + "learning_rate": 2.0870299281915143e-05, + "loss": 3.4999, + "step": 34800 + }, + { + "epoch": 3.5783305921052633, + "grad_norm": 1.1057716736399736, + "learning_rate": 2.0851036752851285e-05, + "loss": 3.4891, + "step": 34810 + }, + { + "epoch": 3.5793585526315788, + "grad_norm": 0.7922611932129323, + "learning_rate": 2.083178278757428e-05, + "loss": 3.4663, + "step": 34820 + }, + { + "epoch": 3.5803865131578947, + "grad_norm": 0.9914503888825023, + "learning_rate": 2.081253739370051e-05, + "loss": 3.47, + "step": 34830 + }, + { + "epoch": 3.5814144736842106, + "grad_norm": 1.2091653962570097, + "learning_rate": 2.0793300578842973e-05, + "loss": 3.4603, + "step": 34840 + }, + { + "epoch": 3.5824424342105265, + "grad_norm": 0.9514221238196438, + "learning_rate": 2.077407235061125e-05, + "loss": 3.5173, + "step": 34850 + }, + { + "epoch": 3.583470394736842, + "grad_norm": 0.9358320976095174, + "learning_rate": 2.075485271661155e-05, + "loss": 3.546, + "step": 34860 + }, + { + "epoch": 3.584498355263158, + "grad_norm": 0.8598598153056922, + "learning_rate": 2.073564168444672e-05, + "loss": 3.578, + "step": 34870 + }, + { + "epoch": 3.5855263157894735, + "grad_norm": 0.7895467801721056, + "learning_rate": 2.0716439261716105e-05, + "loss": 3.5107, + "step": 34880 + }, + { + "epoch": 3.5865542763157894, + "grad_norm": 0.5919857416207912, + "learning_rate": 2.0697245456015737e-05, + "loss": 3.6109, + "step": 34890 + }, + { + "epoch": 3.5875822368421053, + "grad_norm": 0.8404152987961866, + "learning_rate": 2.067806027493819e-05, + "loss": 3.5536, + "step": 34900 + }, + { + "epoch": 3.5886101973684212, + "grad_norm": 0.7394246929685641, + "learning_rate": 2.0658883726072644e-05, + "loss": 3.5727, + "step": 34910 + }, + { + "epoch": 3.5896381578947367, + "grad_norm": 0.732227391171031, + "learning_rate": 2.0639715817004863e-05, + "loss": 3.4687, + "step": 34920 + }, + { + "epoch": 3.5906661184210527, + "grad_norm": 0.762379091088282, + "learning_rate": 2.0620556555317192e-05, + "loss": 3.5169, + "step": 34930 + }, + { + "epoch": 3.5916940789473686, + "grad_norm": 0.6107180296313861, + "learning_rate": 2.0601405948588548e-05, + "loss": 3.5389, + "step": 34940 + }, + { + "epoch": 3.592722039473684, + "grad_norm": 0.6690185729514957, + "learning_rate": 2.058226400439444e-05, + "loss": 3.5434, + "step": 34950 + }, + { + "epoch": 3.59375, + "grad_norm": 0.650126956498881, + "learning_rate": 2.0563130730306944e-05, + "loss": 3.5188, + "step": 34960 + }, + { + "epoch": 3.594777960526316, + "grad_norm": 0.6458895617793126, + "learning_rate": 2.0544006133894684e-05, + "loss": 3.5703, + "step": 34970 + }, + { + "epoch": 3.5958059210526314, + "grad_norm": 1.0293895238188733, + 
"learning_rate": 2.0524890222722868e-05, + "loss": 3.5226, + "step": 34980 + }, + { + "epoch": 3.5968338815789473, + "grad_norm": 0.9022288724096196, + "learning_rate": 2.050578300435331e-05, + "loss": 3.591, + "step": 34990 + }, + { + "epoch": 3.5978618421052633, + "grad_norm": 0.8262339539347642, + "learning_rate": 2.0486684486344303e-05, + "loss": 3.5041, + "step": 35000 + }, + { + "epoch": 3.5988898026315788, + "grad_norm": 1.267419508310039, + "learning_rate": 2.0467594676250767e-05, + "loss": 3.4764, + "step": 35010 + }, + { + "epoch": 3.5999177631578947, + "grad_norm": 0.7762425736518961, + "learning_rate": 2.044851358162414e-05, + "loss": 3.4858, + "step": 35020 + }, + { + "epoch": 3.6009457236842106, + "grad_norm": 0.8314079851585137, + "learning_rate": 2.0429441210012438e-05, + "loss": 3.58, + "step": 35030 + }, + { + "epoch": 3.6019736842105265, + "grad_norm": 0.9399949254243921, + "learning_rate": 2.0410377568960197e-05, + "loss": 3.4625, + "step": 35040 + }, + { + "epoch": 3.603001644736842, + "grad_norm": 0.6816264591002049, + "learning_rate": 2.0391322666008538e-05, + "loss": 3.5276, + "step": 35050 + }, + { + "epoch": 3.604029605263158, + "grad_norm": 0.5602099372960918, + "learning_rate": 2.037227650869507e-05, + "loss": 3.5431, + "step": 35060 + }, + { + "epoch": 3.6050575657894735, + "grad_norm": 0.9701102766294261, + "learning_rate": 2.0353239104553973e-05, + "loss": 3.5002, + "step": 35070 + }, + { + "epoch": 3.6060855263157894, + "grad_norm": 0.7024656170952889, + "learning_rate": 2.0334210461116026e-05, + "loss": 3.5418, + "step": 35080 + }, + { + "epoch": 3.6071134868421053, + "grad_norm": 1.117337303521774, + "learning_rate": 2.0315190585908418e-05, + "loss": 3.4977, + "step": 35090 + }, + { + "epoch": 3.6081414473684212, + "grad_norm": 0.6564780928812699, + "learning_rate": 2.0296179486454965e-05, + "loss": 3.4794, + "step": 35100 + }, + { + "epoch": 3.6091694078947367, + "grad_norm": 0.8899797812854819, + "learning_rate": 2.0277177170275974e-05, + "loss": 3.454, + "step": 35110 + }, + { + "epoch": 3.6101973684210527, + "grad_norm": 0.7316952368807109, + "learning_rate": 2.0258183644888286e-05, + "loss": 3.5855, + "step": 35120 + }, + { + "epoch": 3.6112253289473686, + "grad_norm": 0.9568636377657902, + "learning_rate": 2.023919891780526e-05, + "loss": 3.5655, + "step": 35130 + }, + { + "epoch": 3.612253289473684, + "grad_norm": 1.5602264648285882, + "learning_rate": 2.0220222996536787e-05, + "loss": 3.5349, + "step": 35140 + }, + { + "epoch": 3.61328125, + "grad_norm": 1.3791918148195068, + "learning_rate": 2.0201255888589254e-05, + "loss": 3.5961, + "step": 35150 + }, + { + "epoch": 3.614309210526316, + "grad_norm": 1.0563798113627492, + "learning_rate": 2.018229760146558e-05, + "loss": 3.566, + "step": 35160 + }, + { + "epoch": 3.6153371710526314, + "grad_norm": 0.80391007432686, + "learning_rate": 2.01633481426652e-05, + "loss": 3.4973, + "step": 35170 + }, + { + "epoch": 3.6163651315789473, + "grad_norm": 0.9619721525824996, + "learning_rate": 2.0144407519684016e-05, + "loss": 3.636, + "step": 35180 + }, + { + "epoch": 3.6173930921052633, + "grad_norm": 0.7222198818446003, + "learning_rate": 2.0125475740014465e-05, + "loss": 3.454, + "step": 35190 + }, + { + "epoch": 3.6184210526315788, + "grad_norm": 0.5917050129817182, + "learning_rate": 2.010655281114553e-05, + "loss": 3.5365, + "step": 35200 + }, + { + "epoch": 3.6194490131578947, + "grad_norm": 0.7797234689370133, + "learning_rate": 2.0087638740562598e-05, + "loss": 3.5949, + "step": 35210 + }, + { + 
"epoch": 3.6204769736842106, + "grad_norm": 0.6859686008084962, + "learning_rate": 2.0068733535747614e-05, + "loss": 3.4838, + "step": 35220 + }, + { + "epoch": 3.6215049342105265, + "grad_norm": 0.8030530014123555, + "learning_rate": 2.0049837204179007e-05, + "loss": 3.5403, + "step": 35230 + }, + { + "epoch": 3.622532894736842, + "grad_norm": 0.8064100845857751, + "learning_rate": 2.0030949753331688e-05, + "loss": 3.4519, + "step": 35240 + }, + { + "epoch": 3.623560855263158, + "grad_norm": 0.9036060438089942, + "learning_rate": 2.001207119067707e-05, + "loss": 3.4517, + "step": 35250 + }, + { + "epoch": 3.6245888157894735, + "grad_norm": 1.1800729993287755, + "learning_rate": 1.9993201523683025e-05, + "loss": 3.483, + "step": 35260 + }, + { + "epoch": 3.6256167763157894, + "grad_norm": 0.753133901962917, + "learning_rate": 1.997434075981393e-05, + "loss": 3.4988, + "step": 35270 + }, + { + "epoch": 3.6266447368421053, + "grad_norm": 0.8040830323273458, + "learning_rate": 1.9955488906530628e-05, + "loss": 3.5311, + "step": 35280 + }, + { + "epoch": 3.6276726973684212, + "grad_norm": 1.1640616807358695, + "learning_rate": 1.9936645971290446e-05, + "loss": 3.6052, + "step": 35290 + }, + { + "epoch": 3.6287006578947367, + "grad_norm": 0.8787464589987722, + "learning_rate": 1.9917811961547154e-05, + "loss": 3.5439, + "step": 35300 + }, + { + "epoch": 3.6297286184210527, + "grad_norm": 0.8678589704226314, + "learning_rate": 1.9898986884751025e-05, + "loss": 3.4826, + "step": 35310 + }, + { + "epoch": 3.6307565789473686, + "grad_norm": 0.7580673733011362, + "learning_rate": 1.9880170748348792e-05, + "loss": 3.561, + "step": 35320 + }, + { + "epoch": 3.631784539473684, + "grad_norm": 1.2132332482210204, + "learning_rate": 1.986136355978365e-05, + "loss": 3.5061, + "step": 35330 + }, + { + "epoch": 3.6328125, + "grad_norm": 0.7113157251357148, + "learning_rate": 1.984256532649524e-05, + "loss": 3.5673, + "step": 35340 + }, + { + "epoch": 3.633840460526316, + "grad_norm": 0.691344306886605, + "learning_rate": 1.9823776055919678e-05, + "loss": 3.5097, + "step": 35350 + }, + { + "epoch": 3.6348684210526314, + "grad_norm": 0.6666703795289918, + "learning_rate": 1.9804995755489523e-05, + "loss": 3.487, + "step": 35360 + }, + { + "epoch": 3.6358963815789473, + "grad_norm": 1.142934681612158, + "learning_rate": 1.9786224432633803e-05, + "loss": 3.5417, + "step": 35370 + }, + { + "epoch": 3.6369243421052633, + "grad_norm": 0.8314049382764895, + "learning_rate": 1.976746209477798e-05, + "loss": 3.5115, + "step": 35380 + }, + { + "epoch": 3.6379523026315788, + "grad_norm": 0.6283681949089914, + "learning_rate": 1.9748708749343948e-05, + "loss": 3.4761, + "step": 35390 + }, + { + "epoch": 3.6389802631578947, + "grad_norm": 1.0078045157182818, + "learning_rate": 1.972996440375006e-05, + "loss": 3.4291, + "step": 35400 + }, + { + "epoch": 3.6400082236842106, + "grad_norm": 1.5312283572788905, + "learning_rate": 1.971122906541114e-05, + "loss": 3.4835, + "step": 35410 + }, + { + "epoch": 3.6410361842105265, + "grad_norm": 1.0389502902140644, + "learning_rate": 1.9692502741738394e-05, + "loss": 3.483, + "step": 35420 + }, + { + "epoch": 3.642064144736842, + "grad_norm": 1.0070340010765977, + "learning_rate": 1.9673785440139482e-05, + "loss": 3.5247, + "step": 35430 + }, + { + "epoch": 3.643092105263158, + "grad_norm": 0.662042282441124, + "learning_rate": 1.9655077168018507e-05, + "loss": 3.5469, + "step": 35440 + }, + { + "epoch": 3.6441200657894735, + "grad_norm": 1.0317879054226349, + "learning_rate": 
1.9636377932775993e-05, + "loss": 3.5226, + "step": 35450 + }, + { + "epoch": 3.6451480263157894, + "grad_norm": 0.9136270969568787, + "learning_rate": 1.961768774180889e-05, + "loss": 3.5067, + "step": 35460 + }, + { + "epoch": 3.6461759868421053, + "grad_norm": 0.9912240765081303, + "learning_rate": 1.959900660251056e-05, + "loss": 3.5357, + "step": 35470 + }, + { + "epoch": 3.6472039473684212, + "grad_norm": 0.6009669390215665, + "learning_rate": 1.9580334522270804e-05, + "loss": 3.5918, + "step": 35480 + }, + { + "epoch": 3.6482319078947367, + "grad_norm": 0.6403914700500565, + "learning_rate": 1.9561671508475823e-05, + "loss": 3.4702, + "step": 35490 + }, + { + "epoch": 3.6492598684210527, + "grad_norm": 1.0198931289297772, + "learning_rate": 1.954301756850825e-05, + "loss": 3.4808, + "step": 35500 + }, + { + "epoch": 3.6502878289473686, + "grad_norm": 0.9500448141068611, + "learning_rate": 1.9524372709747093e-05, + "loss": 3.5364, + "step": 35510 + }, + { + "epoch": 3.651315789473684, + "grad_norm": 0.6404868884192589, + "learning_rate": 1.9505736939567784e-05, + "loss": 3.562, + "step": 35520 + }, + { + "epoch": 3.65234375, + "grad_norm": 0.9769013999354133, + "learning_rate": 1.9487110265342208e-05, + "loss": 3.4732, + "step": 35530 + }, + { + "epoch": 3.653371710526316, + "grad_norm": 0.5719182128422153, + "learning_rate": 1.946849269443858e-05, + "loss": 3.506, + "step": 35540 + }, + { + "epoch": 3.6543996710526314, + "grad_norm": 0.6271186918975871, + "learning_rate": 1.944988423422154e-05, + "loss": 3.5221, + "step": 35550 + }, + { + "epoch": 3.6554276315789473, + "grad_norm": 0.7769745376421064, + "learning_rate": 1.9431284892052146e-05, + "loss": 3.456, + "step": 35560 + }, + { + "epoch": 3.6564555921052633, + "grad_norm": 0.6835838535137431, + "learning_rate": 1.9412694675287817e-05, + "loss": 3.481, + "step": 35570 + }, + { + "epoch": 3.6574835526315788, + "grad_norm": 0.9653055421844597, + "learning_rate": 1.9394113591282384e-05, + "loss": 3.5093, + "step": 35580 + }, + { + "epoch": 3.6585115131578947, + "grad_norm": 1.140380310568068, + "learning_rate": 1.9375541647386055e-05, + "loss": 3.5691, + "step": 35590 + }, + { + "epoch": 3.6595394736842106, + "grad_norm": 1.0717889111704937, + "learning_rate": 1.9356978850945444e-05, + "loss": 3.5247, + "step": 35600 + }, + { + "epoch": 3.6605674342105265, + "grad_norm": 0.8272471543531449, + "learning_rate": 1.933842520930348e-05, + "loss": 3.4516, + "step": 35610 + }, + { + "epoch": 3.661595394736842, + "grad_norm": 0.9416472573657199, + "learning_rate": 1.9319880729799572e-05, + "loss": 3.5472, + "step": 35620 + }, + { + "epoch": 3.662623355263158, + "grad_norm": 0.7084017795329303, + "learning_rate": 1.930134541976942e-05, + "loss": 3.6218, + "step": 35630 + }, + { + "epoch": 3.6636513157894735, + "grad_norm": 0.6518172235020547, + "learning_rate": 1.928281928654513e-05, + "loss": 3.4677, + "step": 35640 + }, + { + "epoch": 3.6646792763157894, + "grad_norm": 0.8327608088900141, + "learning_rate": 1.9264302337455182e-05, + "loss": 3.4805, + "step": 35650 + }, + { + "epoch": 3.6657072368421053, + "grad_norm": 1.0317243583234441, + "learning_rate": 1.9245794579824415e-05, + "loss": 3.4884, + "step": 35660 + }, + { + "epoch": 3.6667351973684212, + "grad_norm": 0.8190575582565505, + "learning_rate": 1.922729602097403e-05, + "loss": 3.4752, + "step": 35670 + }, + { + "epoch": 3.6677631578947367, + "grad_norm": 0.9454502919554064, + "learning_rate": 1.92088066682216e-05, + "loss": 3.533, + "step": 35680 + }, + { + "epoch": 
3.6687911184210527, + "grad_norm": 0.8800667793771846, + "learning_rate": 1.9190326528881046e-05, + "loss": 3.5402, + "step": 35690 + }, + { + "epoch": 3.6698190789473686, + "grad_norm": 0.6691612173676452, + "learning_rate": 1.917185561026265e-05, + "loss": 3.5996, + "step": 35700 + }, + { + "epoch": 3.670847039473684, + "grad_norm": 0.7161105661667173, + "learning_rate": 1.915339391967304e-05, + "loss": 3.4498, + "step": 35710 + }, + { + "epoch": 3.671875, + "grad_norm": 0.8832914352976924, + "learning_rate": 1.913494146441521e-05, + "loss": 3.4933, + "step": 35720 + }, + { + "epoch": 3.672902960526316, + "grad_norm": 0.6743314991338503, + "learning_rate": 1.911649825178845e-05, + "loss": 3.4597, + "step": 35730 + }, + { + "epoch": 3.6739309210526314, + "grad_norm": 0.8040185759044843, + "learning_rate": 1.909806428908849e-05, + "loss": 3.4659, + "step": 35740 + }, + { + "epoch": 3.6749588815789473, + "grad_norm": 0.7323575411198588, + "learning_rate": 1.90796395836073e-05, + "loss": 3.5758, + "step": 35750 + }, + { + "epoch": 3.6759868421052633, + "grad_norm": 0.7950672888319575, + "learning_rate": 1.9061224142633242e-05, + "loss": 3.4244, + "step": 35760 + }, + { + "epoch": 3.6770148026315788, + "grad_norm": 0.9993202760372277, + "learning_rate": 1.9042817973451005e-05, + "loss": 3.5074, + "step": 35770 + }, + { + "epoch": 3.6780427631578947, + "grad_norm": 0.7284884613945576, + "learning_rate": 1.9024421083341612e-05, + "loss": 3.4277, + "step": 35780 + }, + { + "epoch": 3.6790707236842106, + "grad_norm": 0.9611955621828842, + "learning_rate": 1.9006033479582405e-05, + "loss": 3.5399, + "step": 35790 + }, + { + "epoch": 3.6800986842105265, + "grad_norm": 0.7647047545836837, + "learning_rate": 1.898765516944706e-05, + "loss": 3.4659, + "step": 35800 + }, + { + "epoch": 3.681126644736842, + "grad_norm": 1.0221820148447966, + "learning_rate": 1.896928616020558e-05, + "loss": 3.5028, + "step": 35810 + }, + { + "epoch": 3.682154605263158, + "grad_norm": 0.6238559026391965, + "learning_rate": 1.8950926459124278e-05, + "loss": 3.497, + "step": 35820 + }, + { + "epoch": 3.6831825657894735, + "grad_norm": 0.9059347393603073, + "learning_rate": 1.8932576073465803e-05, + "loss": 3.5798, + "step": 35830 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 0.6783061185582484, + "learning_rate": 1.8914235010489092e-05, + "loss": 3.5702, + "step": 35840 + }, + { + "epoch": 3.6852384868421053, + "grad_norm": 0.6048804347453746, + "learning_rate": 1.8895903277449418e-05, + "loss": 3.5415, + "step": 35850 + }, + { + "epoch": 3.6862664473684212, + "grad_norm": 0.8095531576844613, + "learning_rate": 1.8877580881598345e-05, + "loss": 3.4938, + "step": 35860 + }, + { + "epoch": 3.6872944078947367, + "grad_norm": 0.8358661585517471, + "learning_rate": 1.8859267830183765e-05, + "loss": 3.4737, + "step": 35870 + }, + { + "epoch": 3.6883223684210527, + "grad_norm": 0.6620764935185981, + "learning_rate": 1.8840964130449856e-05, + "loss": 3.523, + "step": 35880 + }, + { + "epoch": 3.6893503289473686, + "grad_norm": 0.571624917970239, + "learning_rate": 1.8822669789637107e-05, + "loss": 3.4835, + "step": 35890 + }, + { + "epoch": 3.690378289473684, + "grad_norm": 0.8006072064580688, + "learning_rate": 1.88043848149823e-05, + "loss": 3.524, + "step": 35900 + }, + { + "epoch": 3.69140625, + "grad_norm": 1.214289057150014, + "learning_rate": 1.8786109213718506e-05, + "loss": 3.4903, + "step": 35910 + }, + { + "epoch": 3.692434210526316, + "grad_norm": 0.7119331932898734, + "learning_rate": 1.8767842993075097e-05, 
+ "loss": 3.5515, + "step": 35920 + }, + { + "epoch": 3.6934621710526314, + "grad_norm": 0.9416457698551393, + "learning_rate": 1.8749586160277745e-05, + "loss": 3.603, + "step": 35930 + }, + { + "epoch": 3.6944901315789473, + "grad_norm": 0.8252701764880348, + "learning_rate": 1.8731338722548363e-05, + "loss": 3.4674, + "step": 35940 + }, + { + "epoch": 3.6955180921052633, + "grad_norm": 0.860168264306865, + "learning_rate": 1.871310068710522e-05, + "loss": 3.5094, + "step": 35950 + }, + { + "epoch": 3.6965460526315788, + "grad_norm": 0.5821060542225783, + "learning_rate": 1.8694872061162795e-05, + "loss": 3.5884, + "step": 35960 + }, + { + "epoch": 3.6975740131578947, + "grad_norm": 0.8460164535874898, + "learning_rate": 1.8676652851931887e-05, + "loss": 3.5437, + "step": 35970 + }, + { + "epoch": 3.6986019736842106, + "grad_norm": 0.8803534566812601, + "learning_rate": 1.8658443066619556e-05, + "loss": 3.4708, + "step": 35980 + }, + { + "epoch": 3.6996299342105265, + "grad_norm": 0.7783757935472925, + "learning_rate": 1.864024271242914e-05, + "loss": 3.5562, + "step": 35990 + }, + { + "epoch": 3.700657894736842, + "grad_norm": 0.892693710355757, + "learning_rate": 1.8622051796560234e-05, + "loss": 3.4685, + "step": 36000 + }, + { + "epoch": 3.701685855263158, + "grad_norm": 0.8697941755308108, + "learning_rate": 1.860387032620871e-05, + "loss": 3.5196, + "step": 36010 + }, + { + "epoch": 3.7027138157894735, + "grad_norm": 0.9566813549438266, + "learning_rate": 1.858569830856671e-05, + "loss": 3.5548, + "step": 36020 + }, + { + "epoch": 3.7037417763157894, + "grad_norm": 0.8618188614850827, + "learning_rate": 1.856753575082263e-05, + "loss": 3.4809, + "step": 36030 + }, + { + "epoch": 3.7047697368421053, + "grad_norm": 0.6280251722575583, + "learning_rate": 1.8549382660161107e-05, + "loss": 3.551, + "step": 36040 + }, + { + "epoch": 3.7057976973684212, + "grad_norm": 0.792556204382249, + "learning_rate": 1.8531239043763075e-05, + "loss": 3.5667, + "step": 36050 + }, + { + "epoch": 3.7068256578947367, + "grad_norm": 0.7846079947643604, + "learning_rate": 1.851310490880565e-05, + "loss": 3.5853, + "step": 36060 + }, + { + "epoch": 3.7078536184210527, + "grad_norm": 0.6437114472557636, + "learning_rate": 1.849498026246229e-05, + "loss": 3.5474, + "step": 36070 + }, + { + "epoch": 3.7088815789473686, + "grad_norm": 0.8298381862550834, + "learning_rate": 1.8476865111902614e-05, + "loss": 3.4351, + "step": 36080 + }, + { + "epoch": 3.709909539473684, + "grad_norm": 0.9684474841892917, + "learning_rate": 1.8458759464292536e-05, + "loss": 3.4629, + "step": 36090 + }, + { + "epoch": 3.7109375, + "grad_norm": 1.011953142623059, + "learning_rate": 1.8440663326794195e-05, + "loss": 3.5478, + "step": 36100 + }, + { + "epoch": 3.711965460526316, + "grad_norm": 1.164138484241434, + "learning_rate": 1.8422576706565967e-05, + "loss": 3.508, + "step": 36110 + }, + { + "epoch": 3.7129934210526314, + "grad_norm": 0.9004787972859618, + "learning_rate": 1.8404499610762465e-05, + "loss": 3.539, + "step": 36120 + }, + { + "epoch": 3.7140213815789473, + "grad_norm": 0.9271473523156305, + "learning_rate": 1.8386432046534538e-05, + "loss": 3.5005, + "step": 36130 + }, + { + "epoch": 3.7150493421052633, + "grad_norm": 0.7069415673128883, + "learning_rate": 1.8368374021029278e-05, + "loss": 3.4737, + "step": 36140 + }, + { + "epoch": 3.7160773026315788, + "grad_norm": 0.9269456904097498, + "learning_rate": 1.835032554138994e-05, + "loss": 3.4675, + "step": 36150 + }, + { + "epoch": 3.7171052631578947, + "grad_norm": 
0.6444635066642381, + "learning_rate": 1.83322866147561e-05, + "loss": 3.5599, + "step": 36160 + }, + { + "epoch": 3.7181332236842106, + "grad_norm": 0.8238506851254561, + "learning_rate": 1.831425724826347e-05, + "loss": 3.3708, + "step": 36170 + }, + { + "epoch": 3.7191611842105265, + "grad_norm": 1.0951675493264568, + "learning_rate": 1.8296237449044035e-05, + "loss": 3.5463, + "step": 36180 + }, + { + "epoch": 3.720189144736842, + "grad_norm": 0.9371069402054851, + "learning_rate": 1.8278227224225963e-05, + "loss": 3.5271, + "step": 36190 + }, + { + "epoch": 3.721217105263158, + "grad_norm": 0.6340203238006484, + "learning_rate": 1.826022658093366e-05, + "loss": 3.4576, + "step": 36200 + }, + { + "epoch": 3.7222450657894735, + "grad_norm": 0.81318972729463, + "learning_rate": 1.824223552628772e-05, + "loss": 3.5894, + "step": 36210 + }, + { + "epoch": 3.7232730263157894, + "grad_norm": 0.8551777035381323, + "learning_rate": 1.8224254067404953e-05, + "loss": 3.4512, + "step": 36220 + }, + { + "epoch": 3.7243009868421053, + "grad_norm": 0.985592649094615, + "learning_rate": 1.820628221139838e-05, + "loss": 3.501, + "step": 36230 + }, + { + "epoch": 3.7253289473684212, + "grad_norm": 1.0157996981098156, + "learning_rate": 1.8188319965377204e-05, + "loss": 3.5425, + "step": 36240 + }, + { + "epoch": 3.7263569078947367, + "grad_norm": 1.2250935693933427, + "learning_rate": 1.817036733644685e-05, + "loss": 3.5141, + "step": 36250 + }, + { + "epoch": 3.7273848684210527, + "grad_norm": 0.7345331711572558, + "learning_rate": 1.815242433170894e-05, + "loss": 3.4763, + "step": 36260 + }, + { + "epoch": 3.7284128289473686, + "grad_norm": 0.8040136089501689, + "learning_rate": 1.8134490958261234e-05, + "loss": 3.4966, + "step": 36270 + }, + { + "epoch": 3.729440789473684, + "grad_norm": 1.0333933664131505, + "learning_rate": 1.8116567223197778e-05, + "loss": 3.5217, + "step": 36280 + }, + { + "epoch": 3.73046875, + "grad_norm": 1.0871217705708374, + "learning_rate": 1.8098653133608712e-05, + "loss": 3.5321, + "step": 36290 + }, + { + "epoch": 3.731496710526316, + "grad_norm": 0.7291464848223129, + "learning_rate": 1.8080748696580417e-05, + "loss": 3.5559, + "step": 36300 + }, + { + "epoch": 3.7325246710526314, + "grad_norm": 0.9607521746807118, + "learning_rate": 1.8062853919195432e-05, + "loss": 3.4898, + "step": 36310 + }, + { + "epoch": 3.7335526315789473, + "grad_norm": 0.8989392745265646, + "learning_rate": 1.804496880853249e-05, + "loss": 3.4544, + "step": 36320 + }, + { + "epoch": 3.7345805921052633, + "grad_norm": 0.8925411291823501, + "learning_rate": 1.802709337166649e-05, + "loss": 3.533, + "step": 36330 + }, + { + "epoch": 3.7356085526315788, + "grad_norm": 0.8270524835158012, + "learning_rate": 1.8009227615668504e-05, + "loss": 3.5905, + "step": 36340 + }, + { + "epoch": 3.7366365131578947, + "grad_norm": 0.8244934708942276, + "learning_rate": 1.799137154760578e-05, + "loss": 3.5077, + "step": 36350 + }, + { + "epoch": 3.7376644736842106, + "grad_norm": 0.7605659172914867, + "learning_rate": 1.7973525174541725e-05, + "loss": 3.5185, + "step": 36360 + }, + { + "epoch": 3.7386924342105265, + "grad_norm": 0.5616365534419525, + "learning_rate": 1.7955688503535926e-05, + "loss": 3.6118, + "step": 36370 + }, + { + "epoch": 3.739720394736842, + "grad_norm": 0.5846817802504191, + "learning_rate": 1.7937861541644125e-05, + "loss": 3.4325, + "step": 36380 + }, + { + "epoch": 3.740748355263158, + "grad_norm": 1.196534573021622, + "learning_rate": 1.79200442959182e-05, + "loss": 3.457, + "step": 
36390 + }, + { + "epoch": 3.7417763157894735, + "grad_norm": 1.9864378292130396, + "learning_rate": 1.7902236773406222e-05, + "loss": 3.4873, + "step": 36400 + }, + { + "epoch": 3.7428042763157894, + "grad_norm": 0.6846714788501166, + "learning_rate": 1.7884438981152393e-05, + "loss": 3.4852, + "step": 36410 + }, + { + "epoch": 3.7438322368421053, + "grad_norm": 0.9565290729473379, + "learning_rate": 1.7866650926197086e-05, + "loss": 3.5149, + "step": 36420 + }, + { + "epoch": 3.7448601973684212, + "grad_norm": 0.9378135474713026, + "learning_rate": 1.7848872615576798e-05, + "loss": 3.5518, + "step": 36430 + }, + { + "epoch": 3.7458881578947367, + "grad_norm": 0.6944306170888337, + "learning_rate": 1.7831104056324185e-05, + "loss": 3.4516, + "step": 36440 + }, + { + "epoch": 3.7469161184210527, + "grad_norm": 0.6620086323212737, + "learning_rate": 1.7813345255468045e-05, + "loss": 3.5893, + "step": 36450 + }, + { + "epoch": 3.7479440789473686, + "grad_norm": 0.6366352190328154, + "learning_rate": 1.7795596220033317e-05, + "loss": 3.5324, + "step": 36460 + }, + { + "epoch": 3.748972039473684, + "grad_norm": 0.8807003105996395, + "learning_rate": 1.777785695704108e-05, + "loss": 3.5253, + "step": 36470 + }, + { + "epoch": 3.75, + "grad_norm": 1.0229448486817454, + "learning_rate": 1.7760127473508507e-05, + "loss": 3.5278, + "step": 36480 + }, + { + "epoch": 3.751027960526316, + "grad_norm": 0.7595475773879995, + "learning_rate": 1.7742407776448988e-05, + "loss": 3.6189, + "step": 36490 + }, + { + "epoch": 3.7520559210526314, + "grad_norm": 0.7745139597551427, + "learning_rate": 1.7724697872871953e-05, + "loss": 3.5665, + "step": 36500 + }, + { + "epoch": 3.7530838815789473, + "grad_norm": 0.7132644718198198, + "learning_rate": 1.7706997769783007e-05, + "loss": 3.4369, + "step": 36510 + }, + { + "epoch": 3.7541118421052633, + "grad_norm": 0.8702216808113435, + "learning_rate": 1.7689307474183866e-05, + "loss": 3.4524, + "step": 36520 + }, + { + "epoch": 3.7551398026315788, + "grad_norm": 0.7207857823720385, + "learning_rate": 1.7671626993072378e-05, + "loss": 3.4444, + "step": 36530 + }, + { + "epoch": 3.7561677631578947, + "grad_norm": 0.7712919357373798, + "learning_rate": 1.7653956333442482e-05, + "loss": 3.5239, + "step": 36540 + }, + { + "epoch": 3.7571957236842106, + "grad_norm": 0.7285269974135401, + "learning_rate": 1.763629550228425e-05, + "loss": 3.5473, + "step": 36550 + }, + { + "epoch": 3.7582236842105265, + "grad_norm": 0.6900078160423763, + "learning_rate": 1.7618644506583875e-05, + "loss": 3.4234, + "step": 36560 + }, + { + "epoch": 3.759251644736842, + "grad_norm": 0.8985179035863368, + "learning_rate": 1.7601003353323637e-05, + "loss": 3.564, + "step": 36570 + }, + { + "epoch": 3.760279605263158, + "grad_norm": 0.7891498932739333, + "learning_rate": 1.7583372049481942e-05, + "loss": 3.5052, + "step": 36580 + }, + { + "epoch": 3.7613075657894735, + "grad_norm": 1.3365420039034472, + "learning_rate": 1.75657506020333e-05, + "loss": 3.4715, + "step": 36590 + }, + { + "epoch": 3.7623355263157894, + "grad_norm": 0.8823186792939696, + "learning_rate": 1.754813901794827e-05, + "loss": 3.4913, + "step": 36600 + }, + { + "epoch": 3.7633634868421053, + "grad_norm": 0.616295278251519, + "learning_rate": 1.7530537304193616e-05, + "loss": 3.4862, + "step": 36610 + }, + { + "epoch": 3.7643914473684212, + "grad_norm": 1.0346245317761076, + "learning_rate": 1.7512945467732088e-05, + "loss": 3.4964, + "step": 36620 + }, + { + "epoch": 3.7654194078947367, + "grad_norm": 1.3294651226563332, + 
"learning_rate": 1.749536351552259e-05, + "loss": 3.5905, + "step": 36630 + }, + { + "epoch": 3.7664473684210527, + "grad_norm": 1.59966936569532, + "learning_rate": 1.7477791454520098e-05, + "loss": 3.5623, + "step": 36640 + }, + { + "epoch": 3.7674753289473686, + "grad_norm": 0.6781033657507998, + "learning_rate": 1.746022929167569e-05, + "loss": 3.4539, + "step": 36650 + }, + { + "epoch": 3.768503289473684, + "grad_norm": 0.7001443841733639, + "learning_rate": 1.74426770339365e-05, + "loss": 3.4728, + "step": 36660 + }, + { + "epoch": 3.76953125, + "grad_norm": 0.671704226237216, + "learning_rate": 1.7425134688245772e-05, + "loss": 3.5199, + "step": 36670 + }, + { + "epoch": 3.770559210526316, + "grad_norm": 0.6001307801927913, + "learning_rate": 1.740760226154283e-05, + "loss": 3.4632, + "step": 36680 + }, + { + "epoch": 3.7715871710526314, + "grad_norm": 0.8288579971693512, + "learning_rate": 1.739007976076302e-05, + "loss": 3.4638, + "step": 36690 + }, + { + "epoch": 3.7726151315789473, + "grad_norm": 1.0736599110123632, + "learning_rate": 1.737256719283785e-05, + "loss": 3.5062, + "step": 36700 + }, + { + "epoch": 3.7736430921052633, + "grad_norm": 0.6103616942518446, + "learning_rate": 1.7355064564694844e-05, + "loss": 3.4533, + "step": 36710 + }, + { + "epoch": 3.7746710526315788, + "grad_norm": 1.044167847283851, + "learning_rate": 1.733757188325758e-05, + "loss": 3.5005, + "step": 36720 + }, + { + "epoch": 3.7756990131578947, + "grad_norm": 0.8970175759370141, + "learning_rate": 1.7320089155445735e-05, + "loss": 3.549, + "step": 36730 + }, + { + "epoch": 3.7767269736842106, + "grad_norm": 0.9493052753211495, + "learning_rate": 1.7302616388175043e-05, + "loss": 3.5582, + "step": 36740 + }, + { + "epoch": 3.7777549342105265, + "grad_norm": 0.8756092538990271, + "learning_rate": 1.7285153588357288e-05, + "loss": 3.5254, + "step": 36750 + }, + { + "epoch": 3.778782894736842, + "grad_norm": 0.843476145064993, + "learning_rate": 1.7267700762900316e-05, + "loss": 3.5711, + "step": 36760 + }, + { + "epoch": 3.779810855263158, + "grad_norm": 0.6208417369179902, + "learning_rate": 1.725025791870803e-05, + "loss": 3.5205, + "step": 36770 + }, + { + "epoch": 3.7808388157894735, + "grad_norm": 0.7267102737276911, + "learning_rate": 1.7232825062680378e-05, + "loss": 3.5475, + "step": 36780 + }, + { + "epoch": 3.7818667763157894, + "grad_norm": 0.6245772362433294, + "learning_rate": 1.721540220171336e-05, + "loss": 3.569, + "step": 36790 + }, + { + "epoch": 3.7828947368421053, + "grad_norm": 0.6236711919762409, + "learning_rate": 1.719798934269904e-05, + "loss": 3.5247, + "step": 36800 + }, + { + "epoch": 3.7839226973684212, + "grad_norm": 1.0009248152592165, + "learning_rate": 1.7180586492525465e-05, + "loss": 3.5233, + "step": 36810 + }, + { + "epoch": 3.7849506578947367, + "grad_norm": 0.9158515701931089, + "learning_rate": 1.716319365807681e-05, + "loss": 3.5779, + "step": 36820 + }, + { + "epoch": 3.7859786184210527, + "grad_norm": 0.7954670304396144, + "learning_rate": 1.7145810846233238e-05, + "loss": 3.5282, + "step": 36830 + }, + { + "epoch": 3.7870065789473686, + "grad_norm": 0.7512750119379167, + "learning_rate": 1.7128438063870934e-05, + "loss": 3.4677, + "step": 36840 + }, + { + "epoch": 3.788034539473684, + "grad_norm": 1.0035108091744422, + "learning_rate": 1.711107531786214e-05, + "loss": 3.5575, + "step": 36850 + }, + { + "epoch": 3.7890625, + "grad_norm": 0.7459664360989463, + "learning_rate": 1.7093722615075132e-05, + "loss": 3.5318, + "step": 36860 + }, + { + "epoch": 
3.790090460526316, + "grad_norm": 0.6478129216959676, + "learning_rate": 1.70763799623742e-05, + "loss": 3.5195, + "step": 36870 + }, + { + "epoch": 3.7911184210526314, + "grad_norm": 1.0820268431205384, + "learning_rate": 1.7059047366619653e-05, + "loss": 3.5137, + "step": 36880 + }, + { + "epoch": 3.7921463815789473, + "grad_norm": 0.7413309877098948, + "learning_rate": 1.7041724834667847e-05, + "loss": 3.5408, + "step": 36890 + }, + { + "epoch": 3.7931743421052633, + "grad_norm": 1.0965420100683172, + "learning_rate": 1.7024412373371132e-05, + "loss": 3.5389, + "step": 36900 + }, + { + "epoch": 3.7942023026315788, + "grad_norm": 0.7082067170764788, + "learning_rate": 1.7007109989577886e-05, + "loss": 3.4913, + "step": 36910 + }, + { + "epoch": 3.7952302631578947, + "grad_norm": 0.644058192653773, + "learning_rate": 1.6989817690132512e-05, + "loss": 3.4897, + "step": 36920 + }, + { + "epoch": 3.7962582236842106, + "grad_norm": 0.6430696503275709, + "learning_rate": 1.697253548187538e-05, + "loss": 3.4289, + "step": 36930 + }, + { + "epoch": 3.7972861842105265, + "grad_norm": 0.7275947293484393, + "learning_rate": 1.695526337164291e-05, + "loss": 3.4864, + "step": 36940 + }, + { + "epoch": 3.798314144736842, + "grad_norm": 0.7947961206340522, + "learning_rate": 1.6938001366267525e-05, + "loss": 3.462, + "step": 36950 + }, + { + "epoch": 3.799342105263158, + "grad_norm": 0.9473017420224893, + "learning_rate": 1.6920749472577634e-05, + "loss": 3.5445, + "step": 36960 + }, + { + "epoch": 3.8003700657894735, + "grad_norm": 1.0254352087591765, + "learning_rate": 1.6903507697397653e-05, + "loss": 3.5141, + "step": 36970 + }, + { + "epoch": 3.8013980263157894, + "grad_norm": 0.8907589058849713, + "learning_rate": 1.6886276047547997e-05, + "loss": 3.5195, + "step": 36980 + }, + { + "epoch": 3.8024259868421053, + "grad_norm": 0.5757597144148392, + "learning_rate": 1.6869054529845077e-05, + "loss": 3.5107, + "step": 36990 + }, + { + "epoch": 3.8034539473684212, + "grad_norm": 0.8063919016436162, + "learning_rate": 1.6851843151101287e-05, + "loss": 3.4802, + "step": 37000 + }, + { + "epoch": 3.8044819078947367, + "grad_norm": 0.6233505417833447, + "learning_rate": 1.6834641918125038e-05, + "loss": 3.5335, + "step": 37010 + }, + { + "epoch": 3.8055098684210527, + "grad_norm": 0.8621772162116911, + "learning_rate": 1.6817450837720663e-05, + "loss": 3.4901, + "step": 37020 + }, + { + "epoch": 3.8065378289473686, + "grad_norm": 0.6228225686905029, + "learning_rate": 1.6800269916688564e-05, + "loss": 3.4885, + "step": 37030 + }, + { + "epoch": 3.807565789473684, + "grad_norm": 0.9043074539257865, + "learning_rate": 1.6783099161825075e-05, + "loss": 3.5108, + "step": 37040 + }, + { + "epoch": 3.80859375, + "grad_norm": 0.6704589203660938, + "learning_rate": 1.6765938579922495e-05, + "loss": 3.5075, + "step": 37050 + }, + { + "epoch": 3.809621710526316, + "grad_norm": 0.8675047835292357, + "learning_rate": 1.6748788177769134e-05, + "loss": 3.5211, + "step": 37060 + }, + { + "epoch": 3.8106496710526314, + "grad_norm": 0.7912823639408344, + "learning_rate": 1.6731647962149264e-05, + "loss": 3.5449, + "step": 37070 + }, + { + "epoch": 3.8116776315789473, + "grad_norm": 0.7949128771373436, + "learning_rate": 1.6714517939843117e-05, + "loss": 3.4191, + "step": 37080 + }, + { + "epoch": 3.8127055921052633, + "grad_norm": 1.2415438249913229, + "learning_rate": 1.6697398117626905e-05, + "loss": 3.5639, + "step": 37090 + }, + { + "epoch": 3.8137335526315788, + "grad_norm": 0.8530055077677964, + "learning_rate": 
1.6680288502272795e-05, + "loss": 3.48, + "step": 37100 + }, + { + "epoch": 3.8147615131578947, + "grad_norm": 1.2811451845792663, + "learning_rate": 1.6663189100548926e-05, + "loss": 3.534, + "step": 37110 + }, + { + "epoch": 3.8157894736842106, + "grad_norm": 0.8307678229606688, + "learning_rate": 1.6646099919219393e-05, + "loss": 3.4985, + "step": 37120 + }, + { + "epoch": 3.8168174342105265, + "grad_norm": 0.7705145812613612, + "learning_rate": 1.6629020965044262e-05, + "loss": 3.4656, + "step": 37130 + }, + { + "epoch": 3.817845394736842, + "grad_norm": 0.7228942479185362, + "learning_rate": 1.661195224477949e-05, + "loss": 3.4636, + "step": 37140 + }, + { + "epoch": 3.818873355263158, + "grad_norm": 0.7757442638419949, + "learning_rate": 1.6594893765177086e-05, + "loss": 3.4974, + "step": 37150 + }, + { + "epoch": 3.8199013157894735, + "grad_norm": 1.2223148051992736, + "learning_rate": 1.6577845532984947e-05, + "loss": 3.5067, + "step": 37160 + }, + { + "epoch": 3.8209292763157894, + "grad_norm": 0.9604731422474068, + "learning_rate": 1.6560807554946908e-05, + "loss": 3.4628, + "step": 37170 + }, + { + "epoch": 3.8219572368421053, + "grad_norm": 0.8260904907831843, + "learning_rate": 1.6543779837802773e-05, + "loss": 3.4964, + "step": 37180 + }, + { + "epoch": 3.8229851973684212, + "grad_norm": 0.7751911973447556, + "learning_rate": 1.6526762388288284e-05, + "loss": 3.4489, + "step": 37190 + }, + { + "epoch": 3.8240131578947367, + "grad_norm": 0.8088179199258825, + "learning_rate": 1.6509755213135112e-05, + "loss": 3.4988, + "step": 37200 + }, + { + "epoch": 3.8250411184210527, + "grad_norm": 0.8867885545318912, + "learning_rate": 1.649275831907087e-05, + "loss": 3.509, + "step": 37210 + }, + { + "epoch": 3.8260690789473686, + "grad_norm": 0.893476839270228, + "learning_rate": 1.6475771712819123e-05, + "loss": 3.5091, + "step": 37220 + }, + { + "epoch": 3.827097039473684, + "grad_norm": 0.5964643523747193, + "learning_rate": 1.64587954010993e-05, + "loss": 3.5127, + "step": 37230 + }, + { + "epoch": 3.828125, + "grad_norm": 0.8247694372457769, + "learning_rate": 1.6441829390626843e-05, + "loss": 3.4518, + "step": 37240 + }, + { + "epoch": 3.829152960526316, + "grad_norm": 0.6548873514392991, + "learning_rate": 1.642487368811309e-05, + "loss": 3.5611, + "step": 37250 + }, + { + "epoch": 3.8301809210526314, + "grad_norm": 0.8355780925153309, + "learning_rate": 1.640792830026525e-05, + "loss": 3.4489, + "step": 37260 + }, + { + "epoch": 3.8312088815789473, + "grad_norm": 1.1320208183739895, + "learning_rate": 1.6390993233786515e-05, + "loss": 3.4952, + "step": 37270 + }, + { + "epoch": 3.8322368421052633, + "grad_norm": 1.2460882491704848, + "learning_rate": 1.637406849537598e-05, + "loss": 3.4897, + "step": 37280 + }, + { + "epoch": 3.8332648026315788, + "grad_norm": 0.7654088552121716, + "learning_rate": 1.635715409172863e-05, + "loss": 3.5173, + "step": 37290 + }, + { + "epoch": 3.8342927631578947, + "grad_norm": 0.8517267961109773, + "learning_rate": 1.6340250029535395e-05, + "loss": 3.5403, + "step": 37300 + }, + { + "epoch": 3.8353207236842106, + "grad_norm": 0.6294129030847557, + "learning_rate": 1.6323356315483086e-05, + "loss": 3.4988, + "step": 37310 + }, + { + "epoch": 3.8363486842105265, + "grad_norm": 0.5440487994331973, + "learning_rate": 1.630647295625444e-05, + "loss": 3.4859, + "step": 37320 + }, + { + "epoch": 3.837376644736842, + "grad_norm": 1.4584441097511074, + "learning_rate": 1.628959995852809e-05, + "loss": 3.6074, + "step": 37330 + }, + { + "epoch": 
3.838404605263158, + "grad_norm": 0.6411317123936353, + "learning_rate": 1.6272737328978573e-05, + "loss": 3.5277, + "step": 37340 + }, + { + "epoch": 3.8394325657894735, + "grad_norm": 0.6022416034380778, + "learning_rate": 1.62558850742763e-05, + "loss": 3.4743, + "step": 37350 + }, + { + "epoch": 3.8404605263157894, + "grad_norm": 1.101368176790792, + "learning_rate": 1.6239043201087627e-05, + "loss": 3.5452, + "step": 37360 + }, + { + "epoch": 3.8414884868421053, + "grad_norm": 1.0890333108807768, + "learning_rate": 1.622221171607478e-05, + "loss": 3.5726, + "step": 37370 + }, + { + "epoch": 3.8425164473684212, + "grad_norm": 0.802329636610311, + "learning_rate": 1.620539062589585e-05, + "loss": 3.3945, + "step": 37380 + }, + { + "epoch": 3.8435444078947367, + "grad_norm": 0.8726676783073785, + "learning_rate": 1.6188579937204848e-05, + "loss": 3.484, + "step": 37390 + }, + { + "epoch": 3.8445723684210527, + "grad_norm": 0.6578939373864787, + "learning_rate": 1.617177965665166e-05, + "loss": 3.4844, + "step": 37400 + }, + { + "epoch": 3.8456003289473686, + "grad_norm": 0.6570706912436998, + "learning_rate": 1.615498979088206e-05, + "loss": 3.5063, + "step": 37410 + }, + { + "epoch": 3.846628289473684, + "grad_norm": 0.9067012058252142, + "learning_rate": 1.6138210346537687e-05, + "loss": 3.542, + "step": 37420 + }, + { + "epoch": 3.84765625, + "grad_norm": 0.9670848687217409, + "learning_rate": 1.612144133025608e-05, + "loss": 3.5018, + "step": 37430 + }, + { + "epoch": 3.848684210526316, + "grad_norm": 0.798091222097584, + "learning_rate": 1.6104682748670636e-05, + "loss": 3.5242, + "step": 37440 + }, + { + "epoch": 3.8497121710526314, + "grad_norm": 0.9577670946918191, + "learning_rate": 1.6087934608410636e-05, + "loss": 3.5211, + "step": 37450 + }, + { + "epoch": 3.8507401315789473, + "grad_norm": 0.7368902999803768, + "learning_rate": 1.6071196916101225e-05, + "loss": 3.4876, + "step": 37460 + }, + { + "epoch": 3.8517680921052633, + "grad_norm": 0.8997864257867138, + "learning_rate": 1.6054469678363397e-05, + "loss": 3.5308, + "step": 37470 + }, + { + "epoch": 3.8527960526315788, + "grad_norm": 0.7402998359340933, + "learning_rate": 1.6037752901814027e-05, + "loss": 3.5289, + "step": 37480 + }, + { + "epoch": 3.8538240131578947, + "grad_norm": 0.7956677810542349, + "learning_rate": 1.6021046593065887e-05, + "loss": 3.5014, + "step": 37490 + }, + { + "epoch": 3.8548519736842106, + "grad_norm": 0.5945872878183955, + "learning_rate": 1.6004350758727542e-05, + "loss": 3.5603, + "step": 37500 + }, + { + "epoch": 3.8558799342105265, + "grad_norm": 0.6959713375462587, + "learning_rate": 1.598766540540345e-05, + "loss": 3.5382, + "step": 37510 + }, + { + "epoch": 3.856907894736842, + "grad_norm": 0.682644059030231, + "learning_rate": 1.5970990539693926e-05, + "loss": 3.5099, + "step": 37520 + }, + { + "epoch": 3.857935855263158, + "grad_norm": 0.6063021185190347, + "learning_rate": 1.595432616819512e-05, + "loss": 3.5179, + "step": 37530 + }, + { + "epoch": 3.8589638157894735, + "grad_norm": 0.6998008351029643, + "learning_rate": 1.593767229749905e-05, + "loss": 3.4754, + "step": 37540 + }, + { + "epoch": 3.8599917763157894, + "grad_norm": 0.9202543010346897, + "learning_rate": 1.5921028934193575e-05, + "loss": 3.5726, + "step": 37550 + }, + { + "epoch": 3.8610197368421053, + "grad_norm": 0.7871562479985296, + "learning_rate": 1.5904396084862356e-05, + "loss": 3.5652, + "step": 37560 + }, + { + "epoch": 3.8620476973684212, + "grad_norm": 0.7495457147598551, + "learning_rate": 
1.5887773756084967e-05, + "loss": 3.457, + "step": 37570 + }, + { + "epoch": 3.8630756578947367, + "grad_norm": 0.8235823357329759, + "learning_rate": 1.5871161954436787e-05, + "loss": 3.4381, + "step": 37580 + }, + { + "epoch": 3.8641036184210527, + "grad_norm": 0.802753903042391, + "learning_rate": 1.5854560686489002e-05, + "loss": 3.4935, + "step": 37590 + }, + { + "epoch": 3.8651315789473686, + "grad_norm": 1.0216951754472992, + "learning_rate": 1.583796995880866e-05, + "loss": 3.5478, + "step": 37600 + }, + { + "epoch": 3.866159539473684, + "grad_norm": 0.6450597965629942, + "learning_rate": 1.5821389777958668e-05, + "loss": 3.5197, + "step": 37610 + }, + { + "epoch": 3.8671875, + "grad_norm": 0.9957431550208488, + "learning_rate": 1.5804820150497697e-05, + "loss": 3.3772, + "step": 37620 + }, + { + "epoch": 3.868215460526316, + "grad_norm": 0.9892270534361509, + "learning_rate": 1.5788261082980283e-05, + "loss": 3.4761, + "step": 37630 + }, + { + "epoch": 3.8692434210526314, + "grad_norm": 0.7440611629161653, + "learning_rate": 1.577171258195678e-05, + "loss": 3.5257, + "step": 37640 + }, + { + "epoch": 3.8702713815789473, + "grad_norm": 0.6858839200932609, + "learning_rate": 1.575517465397337e-05, + "loss": 3.451, + "step": 37650 + }, + { + "epoch": 3.8712993421052633, + "grad_norm": 1.0722979208941281, + "learning_rate": 1.5738647305572032e-05, + "loss": 3.4974, + "step": 37660 + }, + { + "epoch": 3.8723273026315788, + "grad_norm": 0.9389075838543893, + "learning_rate": 1.572213054329058e-05, + "loss": 3.4754, + "step": 37670 + }, + { + "epoch": 3.8733552631578947, + "grad_norm": 0.8512263553265206, + "learning_rate": 1.57056243736626e-05, + "loss": 3.5611, + "step": 37680 + }, + { + "epoch": 3.8743832236842106, + "grad_norm": 0.7074783766040303, + "learning_rate": 1.5689128803217558e-05, + "loss": 3.4852, + "step": 37690 + }, + { + "epoch": 3.8754111842105265, + "grad_norm": 0.7461785472994521, + "learning_rate": 1.5672643838480683e-05, + "loss": 3.4838, + "step": 37700 + }, + { + "epoch": 3.876439144736842, + "grad_norm": 1.2596595423782047, + "learning_rate": 1.5656169485972993e-05, + "loss": 3.6321, + "step": 37710 + }, + { + "epoch": 3.877467105263158, + "grad_norm": 0.7352965234268325, + "learning_rate": 1.5639705752211335e-05, + "loss": 3.5488, + "step": 37720 + }, + { + "epoch": 3.8784950657894735, + "grad_norm": 0.6951920265668391, + "learning_rate": 1.5623252643708355e-05, + "loss": 3.4044, + "step": 37730 + }, + { + "epoch": 3.8795230263157894, + "grad_norm": 0.8868894034702026, + "learning_rate": 1.5606810166972483e-05, + "loss": 3.4791, + "step": 37740 + }, + { + "epoch": 3.8805509868421053, + "grad_norm": 0.7791993885683132, + "learning_rate": 1.5590378328507955e-05, + "loss": 3.4553, + "step": 37750 + }, + { + "epoch": 3.8815789473684212, + "grad_norm": 1.0936213281421365, + "learning_rate": 1.5573957134814814e-05, + "loss": 3.5759, + "step": 37760 + }, + { + "epoch": 3.8826069078947367, + "grad_norm": 1.2788998053125598, + "learning_rate": 1.5557546592388816e-05, + "loss": 3.5556, + "step": 37770 + }, + { + "epoch": 3.8836348684210527, + "grad_norm": 0.8388234527875587, + "learning_rate": 1.5541146707721616e-05, + "loss": 3.5057, + "step": 37780 + }, + { + "epoch": 3.8846628289473686, + "grad_norm": 0.8594549662058228, + "learning_rate": 1.552475748730059e-05, + "loss": 3.4926, + "step": 37790 + }, + { + "epoch": 3.885690789473684, + "grad_norm": 0.7302350573109551, + "learning_rate": 1.5508378937608876e-05, + "loss": 3.4512, + "step": 37800 + }, + { + "epoch": 
3.88671875, + "grad_norm": 0.8018231196876625, + "learning_rate": 1.549201106512542e-05, + "loss": 3.4351, + "step": 37810 + }, + { + "epoch": 3.887746710526316, + "grad_norm": 0.6553566392359834, + "learning_rate": 1.547565387632498e-05, + "loss": 3.4837, + "step": 37820 + }, + { + "epoch": 3.8887746710526314, + "grad_norm": 0.6679744385594193, + "learning_rate": 1.5459307377678007e-05, + "loss": 3.4823, + "step": 37830 + }, + { + "epoch": 3.8898026315789473, + "grad_norm": 1.084067786504326, + "learning_rate": 1.544297157565078e-05, + "loss": 3.5844, + "step": 37840 + }, + { + "epoch": 3.8908305921052633, + "grad_norm": 0.9953913886457519, + "learning_rate": 1.5426646476705336e-05, + "loss": 3.4743, + "step": 37850 + }, + { + "epoch": 3.8918585526315788, + "grad_norm": 0.7635630264174249, + "learning_rate": 1.5410332087299472e-05, + "loss": 3.4436, + "step": 37860 + }, + { + "epoch": 3.8928865131578947, + "grad_norm": 0.6822844473381642, + "learning_rate": 1.539402841388675e-05, + "loss": 3.4758, + "step": 37870 + }, + { + "epoch": 3.8939144736842106, + "grad_norm": 0.7147746235094142, + "learning_rate": 1.5377735462916515e-05, + "loss": 3.5423, + "step": 37880 + }, + { + "epoch": 3.8949424342105265, + "grad_norm": 2.065652691861823, + "learning_rate": 1.53614532408338e-05, + "loss": 3.507, + "step": 37890 + }, + { + "epoch": 3.895970394736842, + "grad_norm": 1.3863771259572564, + "learning_rate": 1.534518175407949e-05, + "loss": 3.496, + "step": 37900 + }, + { + "epoch": 3.896998355263158, + "grad_norm": 0.6580023077182247, + "learning_rate": 1.532892100909017e-05, + "loss": 3.4958, + "step": 37910 + }, + { + "epoch": 3.8980263157894735, + "grad_norm": 0.6483815640142448, + "learning_rate": 1.5312671012298167e-05, + "loss": 3.4895, + "step": 37920 + }, + { + "epoch": 3.8990542763157894, + "grad_norm": 0.7849910429425294, + "learning_rate": 1.5296431770131568e-05, + "loss": 3.4139, + "step": 37930 + }, + { + "epoch": 3.9000822368421053, + "grad_norm": 0.6464474485978543, + "learning_rate": 1.5280203289014243e-05, + "loss": 3.5675, + "step": 37940 + }, + { + "epoch": 3.9011101973684212, + "grad_norm": 0.5950672695857031, + "learning_rate": 1.5263985575365738e-05, + "loss": 3.5248, + "step": 37950 + }, + { + "epoch": 3.9021381578947367, + "grad_norm": 0.588536107760033, + "learning_rate": 1.5247778635601389e-05, + "loss": 3.5232, + "step": 37960 + }, + { + "epoch": 3.9031661184210527, + "grad_norm": 1.0109026942680697, + "learning_rate": 1.523158247613225e-05, + "loss": 3.5402, + "step": 37970 + }, + { + "epoch": 3.9041940789473686, + "grad_norm": 1.667042888458068, + "learning_rate": 1.5215397103365116e-05, + "loss": 3.5085, + "step": 37980 + }, + { + "epoch": 3.905222039473684, + "grad_norm": 0.8101656465344134, + "learning_rate": 1.5199222523702518e-05, + "loss": 3.466, + "step": 37990 + }, + { + "epoch": 3.90625, + "grad_norm": 0.670252659934321, + "learning_rate": 1.5183058743542725e-05, + "loss": 3.3898, + "step": 38000 + }, + { + "epoch": 3.907277960526316, + "grad_norm": 0.618300629245315, + "learning_rate": 1.5166905769279696e-05, + "loss": 3.5019, + "step": 38010 + }, + { + "epoch": 3.9083059210526314, + "grad_norm": 0.6422777488389232, + "learning_rate": 1.5150763607303151e-05, + "loss": 3.5072, + "step": 38020 + }, + { + "epoch": 3.9093338815789473, + "grad_norm": 1.0971469987495908, + "learning_rate": 1.5134632263998548e-05, + "loss": 3.5024, + "step": 38030 + }, + { + "epoch": 3.9103618421052633, + "grad_norm": 1.2966234353825665, + "learning_rate": 1.5118511745747014e-05, + 
"loss": 3.4072, + "step": 38040 + }, + { + "epoch": 3.9113898026315788, + "grad_norm": 1.470858744256783, + "learning_rate": 1.510240205892543e-05, + "loss": 3.498, + "step": 38050 + }, + { + "epoch": 3.9124177631578947, + "grad_norm": 0.8884163960873085, + "learning_rate": 1.5086303209906384e-05, + "loss": 3.4186, + "step": 38060 + }, + { + "epoch": 3.9134457236842106, + "grad_norm": 0.751455246217748, + "learning_rate": 1.507021520505818e-05, + "loss": 3.4882, + "step": 38070 + }, + { + "epoch": 3.9144736842105265, + "grad_norm": 0.5628303776199243, + "learning_rate": 1.5054138050744827e-05, + "loss": 3.5354, + "step": 38080 + }, + { + "epoch": 3.915501644736842, + "grad_norm": 0.8233451009661026, + "learning_rate": 1.5038071753326058e-05, + "loss": 3.5227, + "step": 38090 + }, + { + "epoch": 3.916529605263158, + "grad_norm": 0.8815936899600624, + "learning_rate": 1.502201631915726e-05, + "loss": 3.5118, + "step": 38100 + }, + { + "epoch": 3.9175575657894735, + "grad_norm": 0.948454938096647, + "learning_rate": 1.5005971754589595e-05, + "loss": 3.5442, + "step": 38110 + }, + { + "epoch": 3.9185855263157894, + "grad_norm": 0.9315347991274795, + "learning_rate": 1.4989938065969886e-05, + "loss": 3.4839, + "step": 38120 + }, + { + "epoch": 3.9196134868421053, + "grad_norm": 0.8816690044035563, + "learning_rate": 1.4973915259640644e-05, + "loss": 3.4896, + "step": 38130 + }, + { + "epoch": 3.9206414473684212, + "grad_norm": 0.8551017635976365, + "learning_rate": 1.495790334194009e-05, + "loss": 3.5305, + "step": 38140 + }, + { + "epoch": 3.9216694078947367, + "grad_norm": 0.682163267696918, + "learning_rate": 1.494190231920216e-05, + "loss": 3.5118, + "step": 38150 + }, + { + "epoch": 3.9226973684210527, + "grad_norm": 1.0612201836557023, + "learning_rate": 1.492591219775643e-05, + "loss": 3.5105, + "step": 38160 + }, + { + "epoch": 3.9237253289473686, + "grad_norm": 0.7090003061737777, + "learning_rate": 1.4909932983928203e-05, + "loss": 3.4728, + "step": 38170 + }, + { + "epoch": 3.924753289473684, + "grad_norm": 0.6355728650537866, + "learning_rate": 1.4893964684038453e-05, + "loss": 3.4812, + "step": 38180 + }, + { + "epoch": 3.92578125, + "grad_norm": 0.6253619576435551, + "learning_rate": 1.4878007304403839e-05, + "loss": 3.465, + "step": 38190 + }, + { + "epoch": 3.926809210526316, + "grad_norm": 0.6199882313165128, + "learning_rate": 1.4862060851336701e-05, + "loss": 3.5714, + "step": 38200 + }, + { + "epoch": 3.9278371710526314, + "grad_norm": 0.7574864108689092, + "learning_rate": 1.4846125331145071e-05, + "loss": 3.5167, + "step": 38210 + }, + { + "epoch": 3.9288651315789473, + "grad_norm": 0.8064593095181416, + "learning_rate": 1.4830200750132602e-05, + "loss": 3.5518, + "step": 38220 + }, + { + "epoch": 3.9298930921052633, + "grad_norm": 1.0736798963654455, + "learning_rate": 1.4814287114598692e-05, + "loss": 3.4689, + "step": 38230 + }, + { + "epoch": 3.9309210526315788, + "grad_norm": 1.0328614331540538, + "learning_rate": 1.4798384430838376e-05, + "loss": 3.5379, + "step": 38240 + }, + { + "epoch": 3.9319490131578947, + "grad_norm": 0.9563526784086612, + "learning_rate": 1.4782492705142338e-05, + "loss": 3.5025, + "step": 38250 + }, + { + "epoch": 3.9329769736842106, + "grad_norm": 0.9267655624531892, + "learning_rate": 1.4766611943796958e-05, + "loss": 3.568, + "step": 38260 + }, + { + "epoch": 3.9340049342105265, + "grad_norm": 1.1145796850031862, + "learning_rate": 1.4750742153084265e-05, + "loss": 3.4584, + "step": 38270 + }, + { + "epoch": 3.935032894736842, + 
"grad_norm": 0.614921102541493, + "learning_rate": 1.4734883339281948e-05, + "loss": 3.5362, + "step": 38280 + }, + { + "epoch": 3.936060855263158, + "grad_norm": 0.7334566258332934, + "learning_rate": 1.4719035508663354e-05, + "loss": 3.4503, + "step": 38290 + }, + { + "epoch": 3.9370888157894735, + "grad_norm": 0.7474497909651762, + "learning_rate": 1.4703198667497505e-05, + "loss": 3.4797, + "step": 38300 + }, + { + "epoch": 3.9381167763157894, + "grad_norm": 0.8663926476940652, + "learning_rate": 1.4687372822049022e-05, + "loss": 3.5099, + "step": 38310 + }, + { + "epoch": 3.9391447368421053, + "grad_norm": 0.8117527459876183, + "learning_rate": 1.4671557978578246e-05, + "loss": 3.468, + "step": 38320 + }, + { + "epoch": 3.9401726973684212, + "grad_norm": 0.7943573617965383, + "learning_rate": 1.4655754143341133e-05, + "loss": 3.571, + "step": 38330 + }, + { + "epoch": 3.9412006578947367, + "grad_norm": 0.5775523055528513, + "learning_rate": 1.4639961322589256e-05, + "loss": 3.5202, + "step": 38340 + }, + { + "epoch": 3.9422286184210527, + "grad_norm": 0.621608520806395, + "learning_rate": 1.4624179522569867e-05, + "loss": 3.4411, + "step": 38350 + }, + { + "epoch": 3.9432565789473686, + "grad_norm": 0.7409738757126852, + "learning_rate": 1.460840874952588e-05, + "loss": 3.5407, + "step": 38360 + }, + { + "epoch": 3.944284539473684, + "grad_norm": 1.2267226582339668, + "learning_rate": 1.4592649009695785e-05, + "loss": 3.4182, + "step": 38370 + }, + { + "epoch": 3.9453125, + "grad_norm": 0.9828543952781905, + "learning_rate": 1.4576900309313745e-05, + "loss": 3.479, + "step": 38380 + }, + { + "epoch": 3.946340460526316, + "grad_norm": 0.8782643665753369, + "learning_rate": 1.456116265460956e-05, + "loss": 3.4196, + "step": 38390 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 0.7182054530031005, + "learning_rate": 1.4545436051808648e-05, + "loss": 3.4759, + "step": 38400 + }, + { + "epoch": 3.9483963815789473, + "grad_norm": 0.7893278978661096, + "learning_rate": 1.4529720507132053e-05, + "loss": 3.4455, + "step": 38410 + }, + { + "epoch": 3.9494243421052633, + "grad_norm": 0.7559077915190283, + "learning_rate": 1.4514016026796469e-05, + "loss": 3.4957, + "step": 38420 + }, + { + "epoch": 3.9504523026315788, + "grad_norm": 0.8579176596854566, + "learning_rate": 1.4498322617014156e-05, + "loss": 3.3799, + "step": 38430 + }, + { + "epoch": 3.9514802631578947, + "grad_norm": 0.7034033330400736, + "learning_rate": 1.4482640283993075e-05, + "loss": 3.5578, + "step": 38440 + }, + { + "epoch": 3.9525082236842106, + "grad_norm": 1.189943510045073, + "learning_rate": 1.4466969033936748e-05, + "loss": 3.4932, + "step": 38450 + }, + { + "epoch": 3.9535361842105265, + "grad_norm": 0.703394096591181, + "learning_rate": 1.4451308873044325e-05, + "loss": 3.5378, + "step": 38460 + }, + { + "epoch": 3.954564144736842, + "grad_norm": 0.6027507935498589, + "learning_rate": 1.4435659807510555e-05, + "loss": 3.5115, + "step": 38470 + }, + { + "epoch": 3.955592105263158, + "grad_norm": 1.2370939127922698, + "learning_rate": 1.4420021843525861e-05, + "loss": 3.5735, + "step": 38480 + }, + { + "epoch": 3.9566200657894735, + "grad_norm": 0.5897784417988592, + "learning_rate": 1.4404394987276193e-05, + "loss": 3.5507, + "step": 38490 + }, + { + "epoch": 3.9576480263157894, + "grad_norm": 0.8242814098964978, + "learning_rate": 1.438877924494316e-05, + "loss": 3.5114, + "step": 38500 + }, + { + "epoch": 3.9586759868421053, + "grad_norm": 1.16683767405117, + "learning_rate": 1.4373174622703942e-05, + 
"loss": 3.5114, + "step": 38510 + }, + { + "epoch": 3.9597039473684212, + "grad_norm": 1.0223301145264316, + "learning_rate": 1.4357581126731353e-05, + "loss": 3.5252, + "step": 38520 + }, + { + "epoch": 3.9607319078947367, + "grad_norm": 0.7297608724622523, + "learning_rate": 1.434199876319378e-05, + "loss": 3.5331, + "step": 38530 + }, + { + "epoch": 3.9617598684210527, + "grad_norm": 0.6053252511559734, + "learning_rate": 1.4326427538255229e-05, + "loss": 3.5219, + "step": 38540 + }, + { + "epoch": 3.9627878289473686, + "grad_norm": 0.8293841883388271, + "learning_rate": 1.4310867458075262e-05, + "loss": 3.4741, + "step": 38550 + }, + { + "epoch": 3.963815789473684, + "grad_norm": 0.5800635852982362, + "learning_rate": 1.4295318528809058e-05, + "loss": 3.4742, + "step": 38560 + }, + { + "epoch": 3.96484375, + "grad_norm": 1.0710623682991103, + "learning_rate": 1.4279780756607417e-05, + "loss": 3.5245, + "step": 38570 + }, + { + "epoch": 3.965871710526316, + "grad_norm": 0.751937509349829, + "learning_rate": 1.426425414761666e-05, + "loss": 3.5291, + "step": 38580 + }, + { + "epoch": 3.9668996710526314, + "grad_norm": 0.660167716564386, + "learning_rate": 1.4248738707978727e-05, + "loss": 3.4673, + "step": 38590 + }, + { + "epoch": 3.9679276315789473, + "grad_norm": 1.0744544308222541, + "learning_rate": 1.4233234443831147e-05, + "loss": 3.5344, + "step": 38600 + }, + { + "epoch": 3.9689555921052633, + "grad_norm": 0.8222625079968015, + "learning_rate": 1.4217741361307013e-05, + "loss": 3.5898, + "step": 38610 + }, + { + "epoch": 3.9699835526315788, + "grad_norm": 0.9371409682386084, + "learning_rate": 1.4202259466535004e-05, + "loss": 3.5044, + "step": 38620 + }, + { + "epoch": 3.9710115131578947, + "grad_norm": 0.7654106852242863, + "learning_rate": 1.4186788765639374e-05, + "loss": 3.4897, + "step": 38630 + }, + { + "epoch": 3.9720394736842106, + "grad_norm": 0.9377347652221875, + "learning_rate": 1.4171329264739913e-05, + "loss": 3.5183, + "step": 38640 + }, + { + "epoch": 3.9730674342105265, + "grad_norm": 0.6307342924943496, + "learning_rate": 1.4155880969952047e-05, + "loss": 3.5072, + "step": 38650 + }, + { + "epoch": 3.974095394736842, + "grad_norm": 0.9577280116194629, + "learning_rate": 1.4140443887386732e-05, + "loss": 3.4765, + "step": 38660 + }, + { + "epoch": 3.975123355263158, + "grad_norm": 0.6861925480805702, + "learning_rate": 1.4125018023150471e-05, + "loss": 3.4444, + "step": 38670 + }, + { + "epoch": 3.9761513157894735, + "grad_norm": 1.0002370791736492, + "learning_rate": 1.410960338334535e-05, + "loss": 3.4992, + "step": 38680 + }, + { + "epoch": 3.9771792763157894, + "grad_norm": 1.3713346525089174, + "learning_rate": 1.409419997406904e-05, + "loss": 3.5068, + "step": 38690 + }, + { + "epoch": 3.9782072368421053, + "grad_norm": 0.8479280870935404, + "learning_rate": 1.4078807801414718e-05, + "loss": 3.4947, + "step": 38700 + }, + { + "epoch": 3.9792351973684212, + "grad_norm": 0.852943665232906, + "learning_rate": 1.4063426871471148e-05, + "loss": 3.5008, + "step": 38710 + }, + { + "epoch": 3.9802631578947367, + "grad_norm": 0.6492378856349832, + "learning_rate": 1.4048057190322644e-05, + "loss": 3.5075, + "step": 38720 + }, + { + "epoch": 3.9812911184210527, + "grad_norm": 0.9578413045761794, + "learning_rate": 1.403269876404906e-05, + "loss": 3.4839, + "step": 38730 + }, + { + "epoch": 3.9823190789473686, + "grad_norm": 0.7457864338142993, + "learning_rate": 1.401735159872581e-05, + "loss": 3.5404, + "step": 38740 + }, + { + "epoch": 3.983347039473684, + 
"grad_norm": 1.0378265487422997, + "learning_rate": 1.4002015700423855e-05, + "loss": 3.5429, + "step": 38750 + }, + { + "epoch": 3.984375, + "grad_norm": 0.785172875071773, + "learning_rate": 1.398669107520966e-05, + "loss": 3.53, + "step": 38760 + }, + { + "epoch": 3.985402960526316, + "grad_norm": 0.8347691722860551, + "learning_rate": 1.3971377729145297e-05, + "loss": 3.4899, + "step": 38770 + }, + { + "epoch": 3.9864309210526314, + "grad_norm": 1.2961640305000355, + "learning_rate": 1.3956075668288338e-05, + "loss": 3.4539, + "step": 38780 + }, + { + "epoch": 3.9874588815789473, + "grad_norm": 1.1257481206801567, + "learning_rate": 1.3940784898691879e-05, + "loss": 3.5382, + "step": 38790 + }, + { + "epoch": 3.9884868421052633, + "grad_norm": 1.0203181478393706, + "learning_rate": 1.3925505426404566e-05, + "loss": 3.4591, + "step": 38800 + }, + { + "epoch": 3.9895148026315788, + "grad_norm": 0.6091600674829704, + "learning_rate": 1.3910237257470585e-05, + "loss": 3.4799, + "step": 38810 + }, + { + "epoch": 3.9905427631578947, + "grad_norm": 1.1392808074435505, + "learning_rate": 1.3894980397929635e-05, + "loss": 3.4909, + "step": 38820 + }, + { + "epoch": 3.9915707236842106, + "grad_norm": 0.7552611672347037, + "learning_rate": 1.3879734853816946e-05, + "loss": 3.464, + "step": 38830 + }, + { + "epoch": 3.9925986842105265, + "grad_norm": 0.6249422046641852, + "learning_rate": 1.3864500631163284e-05, + "loss": 3.5514, + "step": 38840 + }, + { + "epoch": 3.993626644736842, + "grad_norm": 0.6958791159988957, + "learning_rate": 1.3849277735994902e-05, + "loss": 3.4591, + "step": 38850 + }, + { + "epoch": 3.994654605263158, + "grad_norm": 0.7176038061539661, + "learning_rate": 1.3834066174333618e-05, + "loss": 3.5183, + "step": 38860 + }, + { + "epoch": 3.9956825657894735, + "grad_norm": 0.7031656253522475, + "learning_rate": 1.3818865952196743e-05, + "loss": 3.4804, + "step": 38870 + }, + { + "epoch": 3.9967105263157894, + "grad_norm": 0.5123667218036774, + "learning_rate": 1.3803677075597088e-05, + "loss": 3.5005, + "step": 38880 + }, + { + "epoch": 3.9977384868421053, + "grad_norm": 0.9615437835766153, + "learning_rate": 1.3788499550542988e-05, + "loss": 3.4874, + "step": 38890 + }, + { + "epoch": 3.9987664473684212, + "grad_norm": 0.8075592868995797, + "learning_rate": 1.377333338303833e-05, + "loss": 3.4708, + "step": 38900 + }, + { + "epoch": 3.9997944078947367, + "grad_norm": 0.8225699833254975, + "learning_rate": 1.375817857908243e-05, + "loss": 3.4746, + "step": 38910 + }, + { + "epoch": 4.000822368421052, + "grad_norm": 0.7948624497304843, + "learning_rate": 1.374303514467016e-05, + "loss": 3.5051, + "step": 38920 + }, + { + "epoch": 4.001850328947368, + "grad_norm": 0.7145920607438893, + "learning_rate": 1.3727903085791882e-05, + "loss": 3.437, + "step": 38930 + }, + { + "epoch": 4.002878289473684, + "grad_norm": 0.8176927058838459, + "learning_rate": 1.3712782408433467e-05, + "loss": 3.5099, + "step": 38940 + }, + { + "epoch": 4.00390625, + "grad_norm": 0.6384349931993532, + "learning_rate": 1.3697673118576266e-05, + "loss": 3.5933, + "step": 38950 + }, + { + "epoch": 4.004934210526316, + "grad_norm": 0.9873370458656314, + "learning_rate": 1.3682575222197154e-05, + "loss": 3.4546, + "step": 38960 + }, + { + "epoch": 4.005962171052632, + "grad_norm": 1.2935039876073502, + "learning_rate": 1.3667488725268437e-05, + "loss": 3.3933, + "step": 38970 + }, + { + "epoch": 4.006990131578948, + "grad_norm": 0.757665442916222, + "learning_rate": 1.3652413633757997e-05, + "loss": 3.4245, + 
"step": 38980 + }, + { + "epoch": 4.008018092105263, + "grad_norm": 0.8925194251460303, + "learning_rate": 1.3637349953629158e-05, + "loss": 3.4942, + "step": 38990 + }, + { + "epoch": 4.009046052631579, + "grad_norm": 1.093389070086488, + "learning_rate": 1.3622297690840714e-05, + "loss": 3.4624, + "step": 39000 + }, + { + "epoch": 4.010074013157895, + "grad_norm": 0.7206759977533544, + "learning_rate": 1.3607256851346958e-05, + "loss": 3.59, + "step": 39010 + }, + { + "epoch": 4.011101973684211, + "grad_norm": 0.6464328572470649, + "learning_rate": 1.3592227441097707e-05, + "loss": 3.4636, + "step": 39020 + }, + { + "epoch": 4.0121299342105265, + "grad_norm": 0.6414353200293172, + "learning_rate": 1.3577209466038189e-05, + "loss": 3.4356, + "step": 39030 + }, + { + "epoch": 4.0131578947368425, + "grad_norm": 0.66289127509423, + "learning_rate": 1.3562202932109139e-05, + "loss": 3.5149, + "step": 39040 + }, + { + "epoch": 4.0141858552631575, + "grad_norm": 0.6544650872202761, + "learning_rate": 1.3547207845246782e-05, + "loss": 3.4884, + "step": 39050 + }, + { + "epoch": 4.0152138157894735, + "grad_norm": 0.8352675560913335, + "learning_rate": 1.3532224211382764e-05, + "loss": 3.4136, + "step": 39060 + }, + { + "epoch": 4.016241776315789, + "grad_norm": 0.6757953399776949, + "learning_rate": 1.3517252036444269e-05, + "loss": 3.446, + "step": 39070 + }, + { + "epoch": 4.017269736842105, + "grad_norm": 1.170953362761885, + "learning_rate": 1.3502291326353914e-05, + "loss": 3.5586, + "step": 39080 + }, + { + "epoch": 4.018297697368421, + "grad_norm": 0.6770624793337989, + "learning_rate": 1.3487342087029763e-05, + "loss": 3.5389, + "step": 39090 + }, + { + "epoch": 4.019325657894737, + "grad_norm": 0.8848220099867513, + "learning_rate": 1.3472404324385349e-05, + "loss": 3.3947, + "step": 39100 + }, + { + "epoch": 4.020353618421052, + "grad_norm": 1.2011928649557055, + "learning_rate": 1.3457478044329718e-05, + "loss": 3.4713, + "step": 39110 + }, + { + "epoch": 4.021381578947368, + "grad_norm": 0.5556332759004835, + "learning_rate": 1.3442563252767294e-05, + "loss": 3.4216, + "step": 39120 + }, + { + "epoch": 4.022409539473684, + "grad_norm": 0.8373088817878459, + "learning_rate": 1.3427659955598009e-05, + "loss": 3.4695, + "step": 39130 + }, + { + "epoch": 4.0234375, + "grad_norm": 0.8409902863820369, + "learning_rate": 1.3412768158717232e-05, + "loss": 3.4884, + "step": 39140 + }, + { + "epoch": 4.024465460526316, + "grad_norm": 0.6379067740700076, + "learning_rate": 1.3397887868015788e-05, + "loss": 3.5007, + "step": 39150 + }, + { + "epoch": 4.025493421052632, + "grad_norm": 0.7021090266981457, + "learning_rate": 1.3383019089379946e-05, + "loss": 3.4023, + "step": 39160 + }, + { + "epoch": 4.026521381578948, + "grad_norm": 0.9278316085456324, + "learning_rate": 1.3368161828691423e-05, + "loss": 3.5542, + "step": 39170 + }, + { + "epoch": 4.027549342105263, + "grad_norm": 0.8212209122150024, + "learning_rate": 1.335331609182736e-05, + "loss": 3.5208, + "step": 39180 + }, + { + "epoch": 4.028577302631579, + "grad_norm": 1.0257402677694916, + "learning_rate": 1.3338481884660378e-05, + "loss": 3.5063, + "step": 39190 + }, + { + "epoch": 4.029605263157895, + "grad_norm": 1.2348588405663532, + "learning_rate": 1.332365921305853e-05, + "loss": 3.3879, + "step": 39200 + }, + { + "epoch": 4.030633223684211, + "grad_norm": 0.8122620600975936, + "learning_rate": 1.3308848082885257e-05, + "loss": 3.5008, + "step": 39210 + }, + { + "epoch": 4.0316611842105265, + "grad_norm": 1.0653948336738628, + 
"learning_rate": 1.3294048499999486e-05, + "loss": 3.4268, + "step": 39220 + }, + { + "epoch": 4.0326891447368425, + "grad_norm": 0.9348012226370969, + "learning_rate": 1.327926047025558e-05, + "loss": 3.4918, + "step": 39230 + }, + { + "epoch": 4.0337171052631575, + "grad_norm": 1.0056632137683954, + "learning_rate": 1.3264483999503284e-05, + "loss": 3.4674, + "step": 39240 + }, + { + "epoch": 4.0347450657894735, + "grad_norm": 0.5878561523063642, + "learning_rate": 1.3249719093587818e-05, + "loss": 3.5251, + "step": 39250 + }, + { + "epoch": 4.035773026315789, + "grad_norm": 0.7934810317884574, + "learning_rate": 1.3234965758349798e-05, + "loss": 3.4719, + "step": 39260 + }, + { + "epoch": 4.036800986842105, + "grad_norm": 1.127403605584508, + "learning_rate": 1.3220223999625281e-05, + "loss": 3.4911, + "step": 39270 + }, + { + "epoch": 4.037828947368421, + "grad_norm": 0.7040178352340057, + "learning_rate": 1.320549382324573e-05, + "loss": 3.5317, + "step": 39280 + }, + { + "epoch": 4.038856907894737, + "grad_norm": 0.8481607649703967, + "learning_rate": 1.3190775235038049e-05, + "loss": 3.4552, + "step": 39290 + }, + { + "epoch": 4.039884868421052, + "grad_norm": 0.6973769796233619, + "learning_rate": 1.317606824082451e-05, + "loss": 3.4982, + "step": 39300 + }, + { + "epoch": 4.040912828947368, + "grad_norm": 0.7424902449072992, + "learning_rate": 1.3161372846422862e-05, + "loss": 3.4676, + "step": 39310 + }, + { + "epoch": 4.041940789473684, + "grad_norm": 0.7326234667231031, + "learning_rate": 1.3146689057646236e-05, + "loss": 3.4835, + "step": 39320 + }, + { + "epoch": 4.04296875, + "grad_norm": 0.77711771069968, + "learning_rate": 1.313201688030314e-05, + "loss": 3.4536, + "step": 39330 + }, + { + "epoch": 4.043996710526316, + "grad_norm": 0.6385092105260902, + "learning_rate": 1.3117356320197545e-05, + "loss": 3.4322, + "step": 39340 + }, + { + "epoch": 4.045024671052632, + "grad_norm": 0.655872917830672, + "learning_rate": 1.310270738312879e-05, + "loss": 3.502, + "step": 39350 + }, + { + "epoch": 4.046052631578948, + "grad_norm": 0.6912972746923499, + "learning_rate": 1.308807007489163e-05, + "loss": 3.4139, + "step": 39360 + }, + { + "epoch": 4.047080592105263, + "grad_norm": 0.836158366749552, + "learning_rate": 1.307344440127622e-05, + "loss": 3.5672, + "step": 39370 + }, + { + "epoch": 4.048108552631579, + "grad_norm": 0.7047108462372009, + "learning_rate": 1.3058830368068116e-05, + "loss": 3.3741, + "step": 39380 + }, + { + "epoch": 4.049136513157895, + "grad_norm": 0.6841704633408402, + "learning_rate": 1.3044227981048234e-05, + "loss": 3.4309, + "step": 39390 + }, + { + "epoch": 4.050164473684211, + "grad_norm": 0.9961632199465261, + "learning_rate": 1.3029637245992941e-05, + "loss": 3.4839, + "step": 39400 + }, + { + "epoch": 4.0511924342105265, + "grad_norm": 0.7941586066000803, + "learning_rate": 1.301505816867397e-05, + "loss": 3.358, + "step": 39410 + }, + { + "epoch": 4.0522203947368425, + "grad_norm": 0.6507716394564301, + "learning_rate": 1.3000490754858413e-05, + "loss": 3.4764, + "step": 39420 + }, + { + "epoch": 4.0532483552631575, + "grad_norm": 1.1816402214617079, + "learning_rate": 1.2985935010308775e-05, + "loss": 3.4049, + "step": 39430 + }, + { + "epoch": 4.0542763157894735, + "grad_norm": 0.9359837987080435, + "learning_rate": 1.2971390940782976e-05, + "loss": 3.4794, + "step": 39440 + }, + { + "epoch": 4.055304276315789, + "grad_norm": 0.6771081675352231, + "learning_rate": 1.2956858552034253e-05, + "loss": 3.4504, + "step": 39450 + }, + { + "epoch": 
4.056332236842105, + "grad_norm": 0.9273384936086515, + "learning_rate": 1.2942337849811262e-05, + "loss": 3.5074, + "step": 39460 + }, + { + "epoch": 4.057360197368421, + "grad_norm": 1.7890232531644632, + "learning_rate": 1.2927828839858033e-05, + "loss": 3.446, + "step": 39470 + }, + { + "epoch": 4.058388157894737, + "grad_norm": 0.7595912076116793, + "learning_rate": 1.2913331527913966e-05, + "loss": 3.4628, + "step": 39480 + }, + { + "epoch": 4.059416118421052, + "grad_norm": 0.9577436948273462, + "learning_rate": 1.2898845919713831e-05, + "loss": 3.4227, + "step": 39490 + }, + { + "epoch": 4.060444078947368, + "grad_norm": 1.0232491356281923, + "learning_rate": 1.2884372020987787e-05, + "loss": 3.4456, + "step": 39500 + }, + { + "epoch": 4.061472039473684, + "grad_norm": 1.123593510816175, + "learning_rate": 1.2869909837461307e-05, + "loss": 3.4232, + "step": 39510 + }, + { + "epoch": 4.0625, + "grad_norm": 0.8831496143622929, + "learning_rate": 1.2855459374855302e-05, + "loss": 3.5018, + "step": 39520 + }, + { + "epoch": 4.063527960526316, + "grad_norm": 1.0751167566635167, + "learning_rate": 1.2841020638886014e-05, + "loss": 3.3951, + "step": 39530 + }, + { + "epoch": 4.064555921052632, + "grad_norm": 0.5776072838862251, + "learning_rate": 1.2826593635265028e-05, + "loss": 3.4328, + "step": 39540 + }, + { + "epoch": 4.065583881578948, + "grad_norm": 0.7884681653966336, + "learning_rate": 1.2812178369699292e-05, + "loss": 3.4661, + "step": 39550 + }, + { + "epoch": 4.066611842105263, + "grad_norm": 0.9307431550656707, + "learning_rate": 1.2797774847891166e-05, + "loss": 3.4913, + "step": 39560 + }, + { + "epoch": 4.067639802631579, + "grad_norm": 0.7335114779260148, + "learning_rate": 1.2783383075538292e-05, + "loss": 3.4673, + "step": 39570 + }, + { + "epoch": 4.068667763157895, + "grad_norm": 0.6676547556154474, + "learning_rate": 1.2769003058333696e-05, + "loss": 3.6115, + "step": 39580 + }, + { + "epoch": 4.069695723684211, + "grad_norm": 1.3339109610020485, + "learning_rate": 1.2754634801965766e-05, + "loss": 3.5726, + "step": 39590 + }, + { + "epoch": 4.0707236842105265, + "grad_norm": 0.9149834973248928, + "learning_rate": 1.2740278312118196e-05, + "loss": 3.5076, + "step": 39600 + }, + { + "epoch": 4.0717516447368425, + "grad_norm": 0.8266371911380183, + "learning_rate": 1.2725933594470077e-05, + "loss": 3.5001, + "step": 39610 + }, + { + "epoch": 4.0727796052631575, + "grad_norm": 0.8422020381882898, + "learning_rate": 1.2711600654695828e-05, + "loss": 3.3556, + "step": 39620 + }, + { + "epoch": 4.0738075657894735, + "grad_norm": 1.0134901644620453, + "learning_rate": 1.2697279498465171e-05, + "loss": 3.4854, + "step": 39630 + }, + { + "epoch": 4.074835526315789, + "grad_norm": 0.6546774141904955, + "learning_rate": 1.2682970131443205e-05, + "loss": 3.4973, + "step": 39640 + }, + { + "epoch": 4.075863486842105, + "grad_norm": 0.668630528696551, + "learning_rate": 1.2668672559290385e-05, + "loss": 3.4804, + "step": 39650 + }, + { + "epoch": 4.076891447368421, + "grad_norm": 0.9070165286319304, + "learning_rate": 1.2654386787662438e-05, + "loss": 3.4987, + "step": 39660 + }, + { + "epoch": 4.077919407894737, + "grad_norm": 0.9634862763576146, + "learning_rate": 1.2640112822210466e-05, + "loss": 3.4188, + "step": 39670 + }, + { + "epoch": 4.078947368421052, + "grad_norm": 0.7964095457958201, + "learning_rate": 1.2625850668580898e-05, + "loss": 3.6032, + "step": 39680 + }, + { + "epoch": 4.079975328947368, + "grad_norm": 0.881131825741422, + "learning_rate": 
1.2611600332415481e-05, + "loss": 3.4235, + "step": 39690 + }, + { + "epoch": 4.081003289473684, + "grad_norm": 0.9328549386599096, + "learning_rate": 1.259736181935129e-05, + "loss": 3.4328, + "step": 39700 + }, + { + "epoch": 4.08203125, + "grad_norm": 0.7493348748645448, + "learning_rate": 1.2583135135020734e-05, + "loss": 3.5188, + "step": 39710 + }, + { + "epoch": 4.083059210526316, + "grad_norm": 0.7329573238070133, + "learning_rate": 1.25689202850515e-05, + "loss": 3.4987, + "step": 39720 + }, + { + "epoch": 4.084087171052632, + "grad_norm": 0.6095630160075014, + "learning_rate": 1.2554717275066662e-05, + "loss": 3.4648, + "step": 39730 + }, + { + "epoch": 4.085115131578948, + "grad_norm": 0.6637855401016098, + "learning_rate": 1.254052611068457e-05, + "loss": 3.4779, + "step": 39740 + }, + { + "epoch": 4.086143092105263, + "grad_norm": 1.1820849379343825, + "learning_rate": 1.2526346797518876e-05, + "loss": 3.4063, + "step": 39750 + }, + { + "epoch": 4.087171052631579, + "grad_norm": 0.5797044468790573, + "learning_rate": 1.2512179341178557e-05, + "loss": 3.5645, + "step": 39760 + }, + { + "epoch": 4.088199013157895, + "grad_norm": 0.8344657634274093, + "learning_rate": 1.2498023747267935e-05, + "loss": 3.461, + "step": 39770 + }, + { + "epoch": 4.089226973684211, + "grad_norm": 0.6927556377884948, + "learning_rate": 1.2483880021386577e-05, + "loss": 3.502, + "step": 39780 + }, + { + "epoch": 4.0902549342105265, + "grad_norm": 1.0253754532917774, + "learning_rate": 1.2469748169129393e-05, + "loss": 3.3956, + "step": 39790 + }, + { + "epoch": 4.0912828947368425, + "grad_norm": 0.6843455512970162, + "learning_rate": 1.2455628196086596e-05, + "loss": 3.5247, + "step": 39800 + }, + { + "epoch": 4.0923108552631575, + "grad_norm": 0.5556892965852147, + "learning_rate": 1.2441520107843692e-05, + "loss": 3.5525, + "step": 39810 + }, + { + "epoch": 4.0933388157894735, + "grad_norm": 1.6421744886004446, + "learning_rate": 1.2427423909981483e-05, + "loss": 3.4683, + "step": 39820 + }, + { + "epoch": 4.094366776315789, + "grad_norm": 0.7566166910887172, + "learning_rate": 1.2413339608076086e-05, + "loss": 3.4552, + "step": 39830 + }, + { + "epoch": 4.095394736842105, + "grad_norm": 0.6985038853785068, + "learning_rate": 1.2399267207698862e-05, + "loss": 3.5548, + "step": 39840 + }, + { + "epoch": 4.096422697368421, + "grad_norm": 0.7078234623276934, + "learning_rate": 1.2385206714416537e-05, + "loss": 3.4596, + "step": 39850 + }, + { + "epoch": 4.097450657894737, + "grad_norm": 0.6776647394557568, + "learning_rate": 1.2371158133791081e-05, + "loss": 3.5096, + "step": 39860 + }, + { + "epoch": 4.098478618421052, + "grad_norm": 0.6853521347587068, + "learning_rate": 1.2357121471379747e-05, + "loss": 3.4374, + "step": 39870 + }, + { + "epoch": 4.099506578947368, + "grad_norm": 0.8089670987128752, + "learning_rate": 1.2343096732735092e-05, + "loss": 3.4865, + "step": 39880 + }, + { + "epoch": 4.100534539473684, + "grad_norm": 0.754546531005877, + "learning_rate": 1.2329083923404953e-05, + "loss": 3.3661, + "step": 39890 + }, + { + "epoch": 4.1015625, + "grad_norm": 0.9095052920728065, + "learning_rate": 1.2315083048932448e-05, + "loss": 3.441, + "step": 39900 + }, + { + "epoch": 4.102590460526316, + "grad_norm": 1.1673914780916608, + "learning_rate": 1.2301094114855965e-05, + "loss": 3.4967, + "step": 39910 + }, + { + "epoch": 4.103618421052632, + "grad_norm": 0.7060450847389537, + "learning_rate": 1.2287117126709195e-05, + "loss": 3.4877, + "step": 39920 + }, + { + "epoch": 4.104646381578948, + 
"grad_norm": 0.67539867972165, + "learning_rate": 1.2273152090021046e-05, + "loss": 3.4315, + "step": 39930 + }, + { + "epoch": 4.105674342105263, + "grad_norm": 0.6091424547211742, + "learning_rate": 1.2259199010315767e-05, + "loss": 3.4761, + "step": 39940 + }, + { + "epoch": 4.106702302631579, + "grad_norm": 0.8000307986172248, + "learning_rate": 1.2245257893112853e-05, + "loss": 3.4396, + "step": 39950 + }, + { + "epoch": 4.107730263157895, + "grad_norm": 0.7627115284542737, + "learning_rate": 1.2231328743927029e-05, + "loss": 3.4517, + "step": 39960 + }, + { + "epoch": 4.108758223684211, + "grad_norm": 0.5885250938805332, + "learning_rate": 1.2217411568268327e-05, + "loss": 3.5624, + "step": 39970 + }, + { + "epoch": 4.1097861842105265, + "grad_norm": 0.9786049198668593, + "learning_rate": 1.2203506371642062e-05, + "loss": 3.3929, + "step": 39980 + }, + { + "epoch": 4.1108141447368425, + "grad_norm": 0.7957719386387598, + "learning_rate": 1.218961315954874e-05, + "loss": 3.5559, + "step": 39990 + }, + { + "epoch": 4.1118421052631575, + "grad_norm": 0.9570396578672442, + "learning_rate": 1.2175731937484195e-05, + "loss": 3.4733, + "step": 40000 + }, + { + "epoch": 4.1128700657894735, + "grad_norm": 1.0247284413192046, + "learning_rate": 1.2161862710939476e-05, + "loss": 3.4665, + "step": 40010 + }, + { + "epoch": 4.113898026315789, + "grad_norm": 0.973264603545175, + "learning_rate": 1.2148005485400904e-05, + "loss": 3.4367, + "step": 40020 + }, + { + "epoch": 4.114925986842105, + "grad_norm": 1.1125767670623055, + "learning_rate": 1.2134160266350054e-05, + "loss": 3.4926, + "step": 40030 + }, + { + "epoch": 4.115953947368421, + "grad_norm": 0.8216205884872204, + "learning_rate": 1.2120327059263755e-05, + "loss": 3.4404, + "step": 40040 + }, + { + "epoch": 4.116981907894737, + "grad_norm": 0.6770298838915271, + "learning_rate": 1.2106505869614046e-05, + "loss": 3.4962, + "step": 40050 + }, + { + "epoch": 4.118009868421052, + "grad_norm": 0.6557512204373019, + "learning_rate": 1.2092696702868274e-05, + "loss": 3.4557, + "step": 40060 + }, + { + "epoch": 4.119037828947368, + "grad_norm": 0.5407388302693931, + "learning_rate": 1.2078899564489002e-05, + "loss": 3.5404, + "step": 40070 + }, + { + "epoch": 4.120065789473684, + "grad_norm": 0.5609962974483955, + "learning_rate": 1.2065114459934009e-05, + "loss": 3.494, + "step": 40080 + }, + { + "epoch": 4.12109375, + "grad_norm": 0.894697781849153, + "learning_rate": 1.2051341394656331e-05, + "loss": 3.3218, + "step": 40090 + }, + { + "epoch": 4.122121710526316, + "grad_norm": 0.9951111678221234, + "learning_rate": 1.2037580374104285e-05, + "loss": 3.5224, + "step": 40100 + }, + { + "epoch": 4.123149671052632, + "grad_norm": 1.000659069789367, + "learning_rate": 1.2023831403721349e-05, + "loss": 3.4953, + "step": 40110 + }, + { + "epoch": 4.124177631578948, + "grad_norm": 0.8156708686178913, + "learning_rate": 1.2010094488946292e-05, + "loss": 3.4393, + "step": 40120 + }, + { + "epoch": 4.125205592105263, + "grad_norm": 0.82342063987887, + "learning_rate": 1.1996369635213078e-05, + "loss": 3.467, + "step": 40130 + }, + { + "epoch": 4.126233552631579, + "grad_norm": 0.7726733739052725, + "learning_rate": 1.1982656847950929e-05, + "loss": 3.4686, + "step": 40140 + }, + { + "epoch": 4.127261513157895, + "grad_norm": 0.6409152234175737, + "learning_rate": 1.1968956132584273e-05, + "loss": 3.5118, + "step": 40150 + }, + { + "epoch": 4.128289473684211, + "grad_norm": 1.5320252771956573, + "learning_rate": 1.1955267494532773e-05, + "loss": 3.4663, 
+ "step": 40160 + }, + { + "epoch": 4.1293174342105265, + "grad_norm": 0.8630289598050512, + "learning_rate": 1.1941590939211303e-05, + "loss": 3.5511, + "step": 40170 + }, + { + "epoch": 4.1303453947368425, + "grad_norm": 0.6059350709832046, + "learning_rate": 1.192792647202996e-05, + "loss": 3.4388, + "step": 40180 + }, + { + "epoch": 4.1313733552631575, + "grad_norm": 0.6270828350567871, + "learning_rate": 1.1914274098394093e-05, + "loss": 3.3973, + "step": 40190 + }, + { + "epoch": 4.1324013157894735, + "grad_norm": 0.848443610518846, + "learning_rate": 1.1900633823704206e-05, + "loss": 3.4993, + "step": 40200 + }, + { + "epoch": 4.133429276315789, + "grad_norm": 0.6000424538931186, + "learning_rate": 1.1887005653356067e-05, + "loss": 3.4699, + "step": 40210 + }, + { + "epoch": 4.134457236842105, + "grad_norm": 0.674745283810676, + "learning_rate": 1.1873389592740636e-05, + "loss": 3.3437, + "step": 40220 + }, + { + "epoch": 4.135485197368421, + "grad_norm": 0.9968914951406704, + "learning_rate": 1.1859785647244082e-05, + "loss": 3.4669, + "step": 40230 + }, + { + "epoch": 4.136513157894737, + "grad_norm": 0.6567674141133601, + "learning_rate": 1.1846193822247787e-05, + "loss": 3.5367, + "step": 40240 + }, + { + "epoch": 4.137541118421052, + "grad_norm": 0.7188192002528384, + "learning_rate": 1.1832614123128345e-05, + "loss": 3.5062, + "step": 40250 + }, + { + "epoch": 4.138569078947368, + "grad_norm": 0.6589207797747284, + "learning_rate": 1.1819046555257516e-05, + "loss": 3.3993, + "step": 40260 + }, + { + "epoch": 4.139597039473684, + "grad_norm": 0.790577595311122, + "learning_rate": 1.1805491124002318e-05, + "loss": 3.4648, + "step": 40270 + }, + { + "epoch": 4.140625, + "grad_norm": 0.5857392547930865, + "learning_rate": 1.1791947834724937e-05, + "loss": 3.4607, + "step": 40280 + }, + { + "epoch": 4.141652960526316, + "grad_norm": 0.7717728087697332, + "learning_rate": 1.1778416692782743e-05, + "loss": 3.4475, + "step": 40290 + }, + { + "epoch": 4.142680921052632, + "grad_norm": 0.8474162592631029, + "learning_rate": 1.1764897703528315e-05, + "loss": 3.3308, + "step": 40300 + }, + { + "epoch": 4.143708881578948, + "grad_norm": 0.8398336720416408, + "learning_rate": 1.1751390872309454e-05, + "loss": 3.538, + "step": 40310 + }, + { + "epoch": 4.144736842105263, + "grad_norm": 0.9922627397979894, + "learning_rate": 1.1737896204469099e-05, + "loss": 3.4539, + "step": 40320 + }, + { + "epoch": 4.145764802631579, + "grad_norm": 0.8472796539650665, + "learning_rate": 1.1724413705345406e-05, + "loss": 3.5853, + "step": 40330 + }, + { + "epoch": 4.146792763157895, + "grad_norm": 1.20608280782311, + "learning_rate": 1.1710943380271717e-05, + "loss": 3.5341, + "step": 40340 + }, + { + "epoch": 4.147820723684211, + "grad_norm": 0.785801636497209, + "learning_rate": 1.1697485234576554e-05, + "loss": 3.4917, + "step": 40350 + }, + { + "epoch": 4.1488486842105265, + "grad_norm": 0.6432915758958577, + "learning_rate": 1.1684039273583623e-05, + "loss": 3.4824, + "step": 40360 + }, + { + "epoch": 4.1498766447368425, + "grad_norm": 1.2023490285974425, + "learning_rate": 1.1670605502611812e-05, + "loss": 3.4613, + "step": 40370 + }, + { + "epoch": 4.1509046052631575, + "grad_norm": 1.2739885694149693, + "learning_rate": 1.1657183926975164e-05, + "loss": 3.5064, + "step": 40380 + }, + { + "epoch": 4.1519325657894735, + "grad_norm": 0.6193499045960986, + "learning_rate": 1.1643774551982944e-05, + "loss": 3.4543, + "step": 40390 + }, + { + "epoch": 4.152960526315789, + "grad_norm": 0.6675228839337692, 
+ "learning_rate": 1.163037738293956e-05, + "loss": 3.4778, + "step": 40400 + }, + { + "epoch": 4.153988486842105, + "grad_norm": 0.692659094262732, + "learning_rate": 1.161699242514458e-05, + "loss": 3.5289, + "step": 40410 + }, + { + "epoch": 4.155016447368421, + "grad_norm": 0.9289114461870573, + "learning_rate": 1.1603619683892769e-05, + "loss": 3.4381, + "step": 40420 + }, + { + "epoch": 4.156044407894737, + "grad_norm": 0.6793065154327673, + "learning_rate": 1.1590259164474047e-05, + "loss": 3.4745, + "step": 40430 + }, + { + "epoch": 4.157072368421052, + "grad_norm": 4.529197280532329, + "learning_rate": 1.1576910872173495e-05, + "loss": 3.5283, + "step": 40440 + }, + { + "epoch": 4.158100328947368, + "grad_norm": 0.7781751777372489, + "learning_rate": 1.156357481227137e-05, + "loss": 3.4128, + "step": 40450 + }, + { + "epoch": 4.159128289473684, + "grad_norm": 1.1879796515048253, + "learning_rate": 1.1550250990043081e-05, + "loss": 3.5516, + "step": 40460 + }, + { + "epoch": 4.16015625, + "grad_norm": 0.6252327962290831, + "learning_rate": 1.1536939410759193e-05, + "loss": 3.5113, + "step": 40470 + }, + { + "epoch": 4.161184210526316, + "grad_norm": 0.6535436137442687, + "learning_rate": 1.1523640079685441e-05, + "loss": 3.4562, + "step": 40480 + }, + { + "epoch": 4.162212171052632, + "grad_norm": 0.7819250623500333, + "learning_rate": 1.151035300208271e-05, + "loss": 3.5053, + "step": 40490 + }, + { + "epoch": 4.163240131578948, + "grad_norm": 0.524856693145704, + "learning_rate": 1.1497078183207018e-05, + "loss": 3.5468, + "step": 40500 + }, + { + "epoch": 4.164268092105263, + "grad_norm": 0.790449566281277, + "learning_rate": 1.148381562830955e-05, + "loss": 3.5272, + "step": 40510 + }, + { + "epoch": 4.165296052631579, + "grad_norm": 0.8604196960974712, + "learning_rate": 1.1470565342636668e-05, + "loss": 3.4205, + "step": 40520 + }, + { + "epoch": 4.166324013157895, + "grad_norm": 0.8135461308108133, + "learning_rate": 1.1457327331429821e-05, + "loss": 3.5511, + "step": 40530 + }, + { + "epoch": 4.167351973684211, + "grad_norm": 1.450632562449152, + "learning_rate": 1.1444101599925653e-05, + "loss": 3.4452, + "step": 40540 + }, + { + "epoch": 4.1683799342105265, + "grad_norm": 1.256472614358763, + "learning_rate": 1.1430888153355923e-05, + "loss": 3.4675, + "step": 40550 + }, + { + "epoch": 4.1694078947368425, + "grad_norm": 0.7858875342198196, + "learning_rate": 1.1417686996947546e-05, + "loss": 3.4699, + "step": 40560 + }, + { + "epoch": 4.1704358552631575, + "grad_norm": 0.7531467228839251, + "learning_rate": 1.1404498135922557e-05, + "loss": 3.4854, + "step": 40570 + }, + { + "epoch": 4.1714638157894735, + "grad_norm": 0.7464358359405103, + "learning_rate": 1.1391321575498152e-05, + "loss": 3.5108, + "step": 40580 + }, + { + "epoch": 4.172491776315789, + "grad_norm": 0.7041239106630187, + "learning_rate": 1.1378157320886641e-05, + "loss": 3.6007, + "step": 40590 + }, + { + "epoch": 4.173519736842105, + "grad_norm": 0.7703607803287651, + "learning_rate": 1.136500537729547e-05, + "loss": 3.5101, + "step": 40600 + }, + { + "epoch": 4.174547697368421, + "grad_norm": 0.6574310166102456, + "learning_rate": 1.1351865749927233e-05, + "loss": 3.5129, + "step": 40610 + }, + { + "epoch": 4.175575657894737, + "grad_norm": 0.699112186090061, + "learning_rate": 1.1338738443979615e-05, + "loss": 3.488, + "step": 40620 + }, + { + "epoch": 4.176603618421052, + "grad_norm": 0.9217704131719305, + "learning_rate": 1.1325623464645456e-05, + "loss": 3.5469, + "step": 40630 + }, + { + "epoch": 
4.177631578947368, + "grad_norm": 0.9582497138707913, + "learning_rate": 1.1312520817112732e-05, + "loss": 3.5192, + "step": 40640 + }, + { + "epoch": 4.178659539473684, + "grad_norm": 0.860090408784957, + "learning_rate": 1.1299430506564492e-05, + "loss": 3.4492, + "step": 40650 + }, + { + "epoch": 4.1796875, + "grad_norm": 0.6682604771744275, + "learning_rate": 1.1286352538178956e-05, + "loss": 3.5172, + "step": 40660 + }, + { + "epoch": 4.180715460526316, + "grad_norm": 1.16739352040887, + "learning_rate": 1.127328691712943e-05, + "loss": 3.589, + "step": 40670 + }, + { + "epoch": 4.181743421052632, + "grad_norm": 1.604462765535932, + "learning_rate": 1.1260233648584349e-05, + "loss": 3.4548, + "step": 40680 + }, + { + "epoch": 4.182771381578948, + "grad_norm": 0.9393435154535142, + "learning_rate": 1.1247192737707258e-05, + "loss": 3.4311, + "step": 40690 + }, + { + "epoch": 4.183799342105263, + "grad_norm": 0.7540874757515588, + "learning_rate": 1.1234164189656824e-05, + "loss": 3.4926, + "step": 40700 + }, + { + "epoch": 4.184827302631579, + "grad_norm": 0.6840089027804174, + "learning_rate": 1.1221148009586798e-05, + "loss": 3.511, + "step": 40710 + }, + { + "epoch": 4.185855263157895, + "grad_norm": 0.9815293916585512, + "learning_rate": 1.1208144202646047e-05, + "loss": 3.4464, + "step": 40720 + }, + { + "epoch": 4.186883223684211, + "grad_norm": 0.7173272439282816, + "learning_rate": 1.1195152773978586e-05, + "loss": 3.4049, + "step": 40730 + }, + { + "epoch": 4.1879111842105265, + "grad_norm": 0.6843960440897026, + "learning_rate": 1.1182173728723468e-05, + "loss": 3.4862, + "step": 40740 + }, + { + "epoch": 4.1889391447368425, + "grad_norm": 0.5272607243488627, + "learning_rate": 1.116920707201489e-05, + "loss": 3.5056, + "step": 40750 + }, + { + "epoch": 4.1899671052631575, + "grad_norm": 0.6462361991707344, + "learning_rate": 1.1156252808982129e-05, + "loss": 3.4399, + "step": 40760 + }, + { + "epoch": 4.1909950657894735, + "grad_norm": 0.6541619232839975, + "learning_rate": 1.1143310944749573e-05, + "loss": 3.4736, + "step": 40770 + }, + { + "epoch": 4.192023026315789, + "grad_norm": 0.8306474954445071, + "learning_rate": 1.1130381484436695e-05, + "loss": 3.4634, + "step": 40780 + }, + { + "epoch": 4.193050986842105, + "grad_norm": 0.8308495740218275, + "learning_rate": 1.1117464433158066e-05, + "loss": 3.52, + "step": 40790 + }, + { + "epoch": 4.194078947368421, + "grad_norm": 0.6627272027367435, + "learning_rate": 1.1104559796023349e-05, + "loss": 3.438, + "step": 40800 + }, + { + "epoch": 4.195106907894737, + "grad_norm": 0.9057206548137645, + "learning_rate": 1.1091667578137292e-05, + "loss": 3.4525, + "step": 40810 + }, + { + "epoch": 4.196134868421052, + "grad_norm": 0.8234925890451846, + "learning_rate": 1.1078787784599752e-05, + "loss": 3.4081, + "step": 40820 + }, + { + "epoch": 4.197162828947368, + "grad_norm": 0.8389471191409532, + "learning_rate": 1.1065920420505615e-05, + "loss": 3.4761, + "step": 40830 + }, + { + "epoch": 4.198190789473684, + "grad_norm": 0.6950028833837182, + "learning_rate": 1.1053065490944906e-05, + "loss": 3.4665, + "step": 40840 + }, + { + "epoch": 4.19921875, + "grad_norm": 0.9063508536684489, + "learning_rate": 1.1040223001002732e-05, + "loss": 3.4822, + "step": 40850 + }, + { + "epoch": 4.200246710526316, + "grad_norm": 0.6645964551445404, + "learning_rate": 1.102739295575923e-05, + "loss": 3.3592, + "step": 40860 + }, + { + "epoch": 4.201274671052632, + "grad_norm": 0.7201522748245238, + "learning_rate": 1.1014575360289661e-05, + 
"loss": 3.5316, + "step": 40870 + }, + { + "epoch": 4.202302631578948, + "grad_norm": 0.744273777060074, + "learning_rate": 1.1001770219664335e-05, + "loss": 3.5218, + "step": 40880 + }, + { + "epoch": 4.203330592105263, + "grad_norm": 0.6261198025741125, + "learning_rate": 1.0988977538948648e-05, + "loss": 3.4868, + "step": 40890 + }, + { + "epoch": 4.204358552631579, + "grad_norm": 0.5846667433560683, + "learning_rate": 1.097619732320307e-05, + "loss": 3.5126, + "step": 40900 + }, + { + "epoch": 4.205386513157895, + "grad_norm": 0.6138572660512815, + "learning_rate": 1.0963429577483126e-05, + "loss": 3.4057, + "step": 40910 + }, + { + "epoch": 4.206414473684211, + "grad_norm": 0.736816123118008, + "learning_rate": 1.0950674306839426e-05, + "loss": 3.4666, + "step": 40920 + }, + { + "epoch": 4.2074424342105265, + "grad_norm": 0.6615173483726852, + "learning_rate": 1.0937931516317612e-05, + "loss": 3.4111, + "step": 40930 + }, + { + "epoch": 4.2084703947368425, + "grad_norm": 0.6880086837569845, + "learning_rate": 1.0925201210958447e-05, + "loss": 3.4644, + "step": 40940 + }, + { + "epoch": 4.2094983552631575, + "grad_norm": 0.7260006198381243, + "learning_rate": 1.0912483395797691e-05, + "loss": 3.5079, + "step": 40950 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 0.8468677703436218, + "learning_rate": 1.08997780758662e-05, + "loss": 3.4411, + "step": 40960 + }, + { + "epoch": 4.211554276315789, + "grad_norm": 1.0533868186887914, + "learning_rate": 1.0887085256189889e-05, + "loss": 3.4727, + "step": 40970 + }, + { + "epoch": 4.212582236842105, + "grad_norm": 0.7044437122086283, + "learning_rate": 1.087440494178971e-05, + "loss": 3.4607, + "step": 40980 + }, + { + "epoch": 4.213610197368421, + "grad_norm": 0.8197386414676611, + "learning_rate": 1.0861737137681681e-05, + "loss": 3.482, + "step": 40990 + }, + { + "epoch": 4.214638157894737, + "grad_norm": 0.755073158180727, + "learning_rate": 1.0849081848876864e-05, + "loss": 3.528, + "step": 41000 + }, + { + "epoch": 4.215666118421052, + "grad_norm": 0.6776620348049499, + "learning_rate": 1.083643908038138e-05, + "loss": 3.4684, + "step": 41010 + }, + { + "epoch": 4.216694078947368, + "grad_norm": 0.8463176934624002, + "learning_rate": 1.0823808837196386e-05, + "loss": 3.4654, + "step": 41020 + }, + { + "epoch": 4.217722039473684, + "grad_norm": 0.8747139871995108, + "learning_rate": 1.0811191124318102e-05, + "loss": 3.4157, + "step": 41030 + }, + { + "epoch": 4.21875, + "grad_norm": 1.0810003775279244, + "learning_rate": 1.0798585946737759e-05, + "loss": 3.4826, + "step": 41040 + }, + { + "epoch": 4.219777960526316, + "grad_norm": 1.0226049749868513, + "learning_rate": 1.0785993309441646e-05, + "loss": 3.553, + "step": 41050 + }, + { + "epoch": 4.220805921052632, + "grad_norm": 0.6834743068431324, + "learning_rate": 1.0773413217411126e-05, + "loss": 3.577, + "step": 41060 + }, + { + "epoch": 4.221833881578948, + "grad_norm": 0.6999503203201569, + "learning_rate": 1.076084567562254e-05, + "loss": 3.5297, + "step": 41070 + }, + { + "epoch": 4.222861842105263, + "grad_norm": 0.939760788246981, + "learning_rate": 1.0748290689047299e-05, + "loss": 3.4547, + "step": 41080 + }, + { + "epoch": 4.223889802631579, + "grad_norm": 0.6626658169146423, + "learning_rate": 1.0735748262651846e-05, + "loss": 3.4975, + "step": 41090 + }, + { + "epoch": 4.224917763157895, + "grad_norm": 0.8888124794868482, + "learning_rate": 1.0723218401397647e-05, + "loss": 3.388, + "step": 41100 + }, + { + "epoch": 4.225945723684211, + "grad_norm": 
1.2747610504036515, + "learning_rate": 1.0710701110241198e-05, + "loss": 3.5209, + "step": 41110 + }, + { + "epoch": 4.2269736842105265, + "grad_norm": 0.7595875980120067, + "learning_rate": 1.0698196394134027e-05, + "loss": 3.4816, + "step": 41120 + }, + { + "epoch": 4.2280016447368425, + "grad_norm": 0.6660013728700412, + "learning_rate": 1.068570425802269e-05, + "loss": 3.48, + "step": 41130 + }, + { + "epoch": 4.2290296052631575, + "grad_norm": 0.7735661871616415, + "learning_rate": 1.0673224706848758e-05, + "loss": 3.5311, + "step": 41140 + }, + { + "epoch": 4.2300575657894735, + "grad_norm": 0.6834189709252153, + "learning_rate": 1.066075774554884e-05, + "loss": 3.4401, + "step": 41150 + }, + { + "epoch": 4.231085526315789, + "grad_norm": 1.2359894926031938, + "learning_rate": 1.0648303379054533e-05, + "loss": 3.5129, + "step": 41160 + }, + { + "epoch": 4.232113486842105, + "grad_norm": 0.993319848969695, + "learning_rate": 1.0635861612292473e-05, + "loss": 3.4518, + "step": 41170 + }, + { + "epoch": 4.233141447368421, + "grad_norm": 0.8007264281324854, + "learning_rate": 1.0623432450184338e-05, + "loss": 3.3803, + "step": 41180 + }, + { + "epoch": 4.234169407894737, + "grad_norm": 0.7033599249127807, + "learning_rate": 1.0611015897646766e-05, + "loss": 3.4384, + "step": 41190 + }, + { + "epoch": 4.235197368421052, + "grad_norm": 0.6289465044300662, + "learning_rate": 1.0598611959591448e-05, + "loss": 3.4899, + "step": 41200 + }, + { + "epoch": 4.236225328947368, + "grad_norm": 0.743185917737215, + "learning_rate": 1.058622064092506e-05, + "loss": 3.428, + "step": 41210 + }, + { + "epoch": 4.237253289473684, + "grad_norm": 1.248424968250767, + "learning_rate": 1.0573841946549307e-05, + "loss": 3.4329, + "step": 41220 + }, + { + "epoch": 4.23828125, + "grad_norm": 0.8398918670355, + "learning_rate": 1.0561475881360879e-05, + "loss": 3.5126, + "step": 41230 + }, + { + "epoch": 4.239309210526316, + "grad_norm": 1.0694915518906307, + "learning_rate": 1.0549122450251492e-05, + "loss": 3.454, + "step": 41240 + }, + { + "epoch": 4.240337171052632, + "grad_norm": 0.9909503763421508, + "learning_rate": 1.0536781658107858e-05, + "loss": 3.5362, + "step": 41250 + }, + { + "epoch": 4.241365131578948, + "grad_norm": 0.991321876969761, + "learning_rate": 1.0524453509811657e-05, + "loss": 3.4762, + "step": 41260 + }, + { + "epoch": 4.242393092105263, + "grad_norm": 0.8242614879646428, + "learning_rate": 1.051213801023963e-05, + "loss": 3.5355, + "step": 41270 + }, + { + "epoch": 4.243421052631579, + "grad_norm": 0.6293132247079573, + "learning_rate": 1.049983516426346e-05, + "loss": 3.4224, + "step": 41280 + }, + { + "epoch": 4.244449013157895, + "grad_norm": 0.7920597296911432, + "learning_rate": 1.0487544976749842e-05, + "loss": 3.4498, + "step": 41290 + }, + { + "epoch": 4.245476973684211, + "grad_norm": 0.6690417366422957, + "learning_rate": 1.0475267452560475e-05, + "loss": 3.4522, + "step": 41300 + }, + { + "epoch": 4.2465049342105265, + "grad_norm": 0.7095550678273533, + "learning_rate": 1.0463002596552037e-05, + "loss": 3.5283, + "step": 41310 + }, + { + "epoch": 4.2475328947368425, + "grad_norm": 0.5505458247473952, + "learning_rate": 1.0450750413576198e-05, + "loss": 3.48, + "step": 41320 + }, + { + "epoch": 4.2485608552631575, + "grad_norm": 0.885235558481177, + "learning_rate": 1.0438510908479612e-05, + "loss": 3.4943, + "step": 41330 + }, + { + "epoch": 4.2495888157894735, + "grad_norm": 1.0248454070394095, + "learning_rate": 1.0426284086103921e-05, + "loss": 3.4656, + "step": 41340 + 
}, + { + "epoch": 4.250616776315789, + "grad_norm": 0.9460916067110161, + "learning_rate": 1.041406995128575e-05, + "loss": 3.4064, + "step": 41350 + }, + { + "epoch": 4.251644736842105, + "grad_norm": 1.1394608187096968, + "learning_rate": 1.0401868508856715e-05, + "loss": 3.5464, + "step": 41360 + }, + { + "epoch": 4.252672697368421, + "grad_norm": 1.0782839892819276, + "learning_rate": 1.038967976364338e-05, + "loss": 3.4583, + "step": 41370 + }, + { + "epoch": 4.253700657894737, + "grad_norm": 0.9528620935772255, + "learning_rate": 1.0377503720467304e-05, + "loss": 3.4457, + "step": 41380 + }, + { + "epoch": 4.254728618421053, + "grad_norm": 0.7433520767246539, + "learning_rate": 1.0365340384145059e-05, + "loss": 3.4983, + "step": 41390 + }, + { + "epoch": 4.255756578947368, + "grad_norm": 0.7530064327248919, + "learning_rate": 1.0353189759488124e-05, + "loss": 3.4894, + "step": 41400 + }, + { + "epoch": 4.256784539473684, + "grad_norm": 0.8132862541638418, + "learning_rate": 1.034105185130299e-05, + "loss": 3.4519, + "step": 41410 + }, + { + "epoch": 4.2578125, + "grad_norm": 0.8184507769614351, + "learning_rate": 1.0328926664391112e-05, + "loss": 3.5072, + "step": 41420 + }, + { + "epoch": 4.258840460526316, + "grad_norm": 0.7774840951868952, + "learning_rate": 1.0316814203548908e-05, + "loss": 3.4017, + "step": 41430 + }, + { + "epoch": 4.259868421052632, + "grad_norm": 0.9154352818296364, + "learning_rate": 1.030471447356776e-05, + "loss": 3.4355, + "step": 41440 + }, + { + "epoch": 4.260896381578947, + "grad_norm": 0.8705299160662935, + "learning_rate": 1.0292627479234026e-05, + "loss": 3.5094, + "step": 41450 + }, + { + "epoch": 4.261924342105263, + "grad_norm": 0.7388506515148971, + "learning_rate": 1.0280553225329015e-05, + "loss": 3.4602, + "step": 41460 + }, + { + "epoch": 4.262952302631579, + "grad_norm": 0.7668361815037849, + "learning_rate": 1.0268491716628983e-05, + "loss": 3.4572, + "step": 41470 + }, + { + "epoch": 4.263980263157895, + "grad_norm": 0.8033303974092686, + "learning_rate": 1.0256442957905193e-05, + "loss": 3.4484, + "step": 41480 + }, + { + "epoch": 4.265008223684211, + "grad_norm": 0.6905347860110855, + "learning_rate": 1.0244406953923805e-05, + "loss": 3.4902, + "step": 41490 + }, + { + "epoch": 4.2660361842105265, + "grad_norm": 0.65521412792787, + "learning_rate": 1.0232383709445966e-05, + "loss": 3.5263, + "step": 41500 + }, + { + "epoch": 4.2670641447368425, + "grad_norm": 0.6591706904044561, + "learning_rate": 1.0220373229227774e-05, + "loss": 3.4208, + "step": 41510 + }, + { + "epoch": 4.2680921052631575, + "grad_norm": 0.8459610754870264, + "learning_rate": 1.020837551802027e-05, + "loss": 3.416, + "step": 41520 + }, + { + "epoch": 4.2691200657894735, + "grad_norm": 0.7686814781602573, + "learning_rate": 1.019639058056945e-05, + "loss": 3.3979, + "step": 41530 + }, + { + "epoch": 4.270148026315789, + "grad_norm": 0.779239356038325, + "learning_rate": 1.0184418421616256e-05, + "loss": 3.5129, + "step": 41540 + }, + { + "epoch": 4.271175986842105, + "grad_norm": 0.7307400123260795, + "learning_rate": 1.0172459045896573e-05, + "loss": 3.4759, + "step": 41550 + }, + { + "epoch": 4.272203947368421, + "grad_norm": 0.863164937050454, + "learning_rate": 1.0160512458141225e-05, + "loss": 3.4459, + "step": 41560 + }, + { + "epoch": 4.273231907894737, + "grad_norm": 1.0920884181581894, + "learning_rate": 1.0148578663075988e-05, + "loss": 3.5134, + "step": 41570 + }, + { + "epoch": 4.274259868421053, + "grad_norm": 0.6047954137749337, + "learning_rate": 
1.0136657665421578e-05, + "loss": 3.4872, + "step": 41580 + }, + { + "epoch": 4.275287828947368, + "grad_norm": 0.6006584557892223, + "learning_rate": 1.0124749469893627e-05, + "loss": 3.5448, + "step": 41590 + }, + { + "epoch": 4.276315789473684, + "grad_norm": 1.1079992978854658, + "learning_rate": 1.0112854081202731e-05, + "loss": 3.5661, + "step": 41600 + }, + { + "epoch": 4.27734375, + "grad_norm": 0.7618110796406367, + "learning_rate": 1.0100971504054404e-05, + "loss": 3.4973, + "step": 41610 + }, + { + "epoch": 4.278371710526316, + "grad_norm": 0.727594237827576, + "learning_rate": 1.008910174314909e-05, + "loss": 3.4445, + "step": 41620 + }, + { + "epoch": 4.279399671052632, + "grad_norm": 0.730556747929511, + "learning_rate": 1.0077244803182171e-05, + "loss": 3.4581, + "step": 41630 + }, + { + "epoch": 4.280427631578947, + "grad_norm": 0.8918133554092936, + "learning_rate": 1.0065400688843963e-05, + "loss": 3.4663, + "step": 41640 + }, + { + "epoch": 4.281455592105263, + "grad_norm": 0.7853449504567313, + "learning_rate": 1.0053569404819692e-05, + "loss": 3.4894, + "step": 41650 + }, + { + "epoch": 4.282483552631579, + "grad_norm": 0.6752986176779275, + "learning_rate": 1.004175095578952e-05, + "loss": 3.4245, + "step": 41660 + }, + { + "epoch": 4.283511513157895, + "grad_norm": 0.8682071087252466, + "learning_rate": 1.0029945346428529e-05, + "loss": 3.5215, + "step": 41670 + }, + { + "epoch": 4.284539473684211, + "grad_norm": 0.7524986369098472, + "learning_rate": 1.0018152581406722e-05, + "loss": 3.4698, + "step": 41680 + }, + { + "epoch": 4.2855674342105265, + "grad_norm": 0.6225094285886706, + "learning_rate": 1.0006372665389018e-05, + "loss": 3.4408, + "step": 41690 + }, + { + "epoch": 4.2865953947368425, + "grad_norm": 0.6275942605108711, + "learning_rate": 9.994605603035276e-06, + "loss": 3.4664, + "step": 41700 + }, + { + "epoch": 4.2876233552631575, + "grad_norm": 0.6974692167646903, + "learning_rate": 9.982851399000208e-06, + "loss": 3.4428, + "step": 41710 + }, + { + "epoch": 4.2886513157894735, + "grad_norm": 0.7973990867410505, + "learning_rate": 9.97111005793353e-06, + "loss": 3.4962, + "step": 41720 + }, + { + "epoch": 4.289679276315789, + "grad_norm": 0.7583749394653356, + "learning_rate": 9.95938158447979e-06, + "loss": 3.438, + "step": 41730 + }, + { + "epoch": 4.290707236842105, + "grad_norm": 0.8034310763268084, + "learning_rate": 9.947665983278489e-06, + "loss": 3.4665, + "step": 41740 + }, + { + "epoch": 4.291735197368421, + "grad_norm": 0.6947380221550219, + "learning_rate": 9.935963258964028e-06, + "loss": 3.5006, + "step": 41750 + }, + { + "epoch": 4.292763157894737, + "grad_norm": 0.6570880171375655, + "learning_rate": 9.924273416165705e-06, + "loss": 3.5432, + "step": 41760 + }, + { + "epoch": 4.293791118421053, + "grad_norm": 0.6626892026602165, + "learning_rate": 9.912596459507728e-06, + "loss": 3.4512, + "step": 41770 + }, + { + "epoch": 4.294819078947368, + "grad_norm": 0.878418104459064, + "learning_rate": 9.900932393609215e-06, + "loss": 3.4332, + "step": 41780 + }, + { + "epoch": 4.295847039473684, + "grad_norm": 0.9679515686234472, + "learning_rate": 9.889281223084179e-06, + "loss": 3.5465, + "step": 41790 + }, + { + "epoch": 4.296875, + "grad_norm": 0.8854849882506574, + "learning_rate": 9.877642952541512e-06, + "loss": 3.5352, + "step": 41800 + }, + { + "epoch": 4.297902960526316, + "grad_norm": 0.7406892611796745, + "learning_rate": 9.866017586585051e-06, + "loss": 3.5426, + "step": 41810 + }, + { + "epoch": 4.298930921052632, + "grad_norm": 
0.6651463359069726, + "learning_rate": 9.854405129813476e-06, + "loss": 3.4418, + "step": 41820 + }, + { + "epoch": 4.299958881578947, + "grad_norm": 0.6823112664550074, + "learning_rate": 9.84280558682039e-06, + "loss": 3.4844, + "step": 41830 + }, + { + "epoch": 4.300986842105263, + "grad_norm": 0.6465798160599726, + "learning_rate": 9.831218962194289e-06, + "loss": 3.4388, + "step": 41840 + }, + { + "epoch": 4.302014802631579, + "grad_norm": 1.0229535887995855, + "learning_rate": 9.819645260518537e-06, + "loss": 3.3107, + "step": 41850 + }, + { + "epoch": 4.303042763157895, + "grad_norm": 0.8473698356280845, + "learning_rate": 9.808084486371414e-06, + "loss": 3.3737, + "step": 41860 + }, + { + "epoch": 4.304070723684211, + "grad_norm": 0.9800577708168478, + "learning_rate": 9.796536644326064e-06, + "loss": 3.533, + "step": 41870 + }, + { + "epoch": 4.3050986842105265, + "grad_norm": 1.243145794970238, + "learning_rate": 9.785001738950523e-06, + "loss": 3.4475, + "step": 41880 + }, + { + "epoch": 4.3061266447368425, + "grad_norm": 0.8960414947731925, + "learning_rate": 9.77347977480772e-06, + "loss": 3.4486, + "step": 41890 + }, + { + "epoch": 4.3071546052631575, + "grad_norm": 0.7159643093436123, + "learning_rate": 9.761970756455445e-06, + "loss": 3.4739, + "step": 41900 + }, + { + "epoch": 4.3081825657894735, + "grad_norm": 0.6832241039861108, + "learning_rate": 9.750474688446385e-06, + "loss": 3.4487, + "step": 41910 + }, + { + "epoch": 4.309210526315789, + "grad_norm": 0.9745742699793015, + "learning_rate": 9.738991575328078e-06, + "loss": 3.5203, + "step": 41920 + }, + { + "epoch": 4.310238486842105, + "grad_norm": 0.61637506248955, + "learning_rate": 9.727521421642985e-06, + "loss": 3.4602, + "step": 41930 + }, + { + "epoch": 4.311266447368421, + "grad_norm": 0.7299347011546146, + "learning_rate": 9.716064231928388e-06, + "loss": 3.5231, + "step": 41940 + }, + { + "epoch": 4.312294407894737, + "grad_norm": 0.5562977052441379, + "learning_rate": 9.704620010716473e-06, + "loss": 3.509, + "step": 41950 + }, + { + "epoch": 4.313322368421053, + "grad_norm": 0.7927616017941367, + "learning_rate": 9.693188762534278e-06, + "loss": 3.5032, + "step": 41960 + }, + { + "epoch": 4.314350328947368, + "grad_norm": 0.9834304092195235, + "learning_rate": 9.681770491903727e-06, + "loss": 3.4785, + "step": 41970 + }, + { + "epoch": 4.315378289473684, + "grad_norm": 0.7207783398754516, + "learning_rate": 9.670365203341603e-06, + "loss": 3.4082, + "step": 41980 + }, + { + "epoch": 4.31640625, + "grad_norm": 0.8413447606421034, + "learning_rate": 9.658972901359545e-06, + "loss": 3.5987, + "step": 41990 + }, + { + "epoch": 4.317434210526316, + "grad_norm": 1.2307510778688573, + "learning_rate": 9.647593590464075e-06, + "loss": 3.4032, + "step": 42000 + }, + { + "epoch": 4.318462171052632, + "grad_norm": 0.8306020721187414, + "learning_rate": 9.636227275156537e-06, + "loss": 3.4253, + "step": 42010 + }, + { + "epoch": 4.319490131578947, + "grad_norm": 0.7987326492669719, + "learning_rate": 9.624873959933187e-06, + "loss": 3.4431, + "step": 42020 + }, + { + "epoch": 4.320518092105263, + "grad_norm": 0.7503516644317861, + "learning_rate": 9.613533649285104e-06, + "loss": 3.3962, + "step": 42030 + }, + { + "epoch": 4.321546052631579, + "grad_norm": 0.5796276359802552, + "learning_rate": 9.602206347698223e-06, + "loss": 3.4342, + "step": 42040 + }, + { + "epoch": 4.322574013157895, + "grad_norm": 0.7220318545396953, + "learning_rate": 9.59089205965334e-06, + "loss": 3.4676, + "step": 42050 + }, + { + "epoch": 
4.323601973684211, + "grad_norm": 0.8964782249223108, + "learning_rate": 9.579590789626114e-06, + "loss": 3.5032, + "step": 42060 + }, + { + "epoch": 4.3246299342105265, + "grad_norm": 0.7213813464464444, + "learning_rate": 9.568302542087037e-06, + "loss": 3.4418, + "step": 42070 + }, + { + "epoch": 4.3256578947368425, + "grad_norm": 1.1270056436290767, + "learning_rate": 9.557027321501461e-06, + "loss": 3.4792, + "step": 42080 + }, + { + "epoch": 4.3266858552631575, + "grad_norm": 1.6914039946578496, + "learning_rate": 9.545765132329572e-06, + "loss": 3.4816, + "step": 42090 + }, + { + "epoch": 4.3277138157894735, + "grad_norm": 0.7280564484640707, + "learning_rate": 9.534515979026416e-06, + "loss": 3.5691, + "step": 42100 + }, + { + "epoch": 4.328741776315789, + "grad_norm": 0.8390736442408346, + "learning_rate": 9.52327986604188e-06, + "loss": 3.5631, + "step": 42110 + }, + { + "epoch": 4.329769736842105, + "grad_norm": 0.8750075953017693, + "learning_rate": 9.512056797820688e-06, + "loss": 3.5054, + "step": 42120 + }, + { + "epoch": 4.330797697368421, + "grad_norm": 0.7848147508388893, + "learning_rate": 9.500846778802381e-06, + "loss": 3.5236, + "step": 42130 + }, + { + "epoch": 4.331825657894737, + "grad_norm": 0.5671549354221445, + "learning_rate": 9.4896498134214e-06, + "loss": 3.5374, + "step": 42140 + }, + { + "epoch": 4.332853618421053, + "grad_norm": 0.7464385908410002, + "learning_rate": 9.478465906106955e-06, + "loss": 3.5527, + "step": 42150 + }, + { + "epoch": 4.333881578947368, + "grad_norm": 1.1359287413793342, + "learning_rate": 9.467295061283128e-06, + "loss": 3.5029, + "step": 42160 + }, + { + "epoch": 4.334909539473684, + "grad_norm": 0.8559306043254493, + "learning_rate": 9.456137283368827e-06, + "loss": 3.4303, + "step": 42170 + }, + { + "epoch": 4.3359375, + "grad_norm": 1.0225980970898059, + "learning_rate": 9.444992576777783e-06, + "loss": 3.4533, + "step": 42180 + }, + { + "epoch": 4.336965460526316, + "grad_norm": 0.7936666640140555, + "learning_rate": 9.433860945918569e-06, + "loss": 3.4707, + "step": 42190 + }, + { + "epoch": 4.337993421052632, + "grad_norm": 0.6035040203077511, + "learning_rate": 9.422742395194575e-06, + "loss": 3.493, + "step": 42200 + }, + { + "epoch": 4.339021381578947, + "grad_norm": 0.8259278787087887, + "learning_rate": 9.411636929004022e-06, + "loss": 3.5153, + "step": 42210 + }, + { + "epoch": 4.340049342105263, + "grad_norm": 0.8677940309542118, + "learning_rate": 9.400544551739955e-06, + "loss": 3.4817, + "step": 42220 + }, + { + "epoch": 4.341077302631579, + "grad_norm": 0.9229393972750759, + "learning_rate": 9.389465267790238e-06, + "loss": 3.3998, + "step": 42230 + }, + { + "epoch": 4.342105263157895, + "grad_norm": 0.7545815640293093, + "learning_rate": 9.378399081537567e-06, + "loss": 3.4356, + "step": 42240 + }, + { + "epoch": 4.343133223684211, + "grad_norm": 0.8407582117202015, + "learning_rate": 9.367345997359426e-06, + "loss": 3.4688, + "step": 42250 + }, + { + "epoch": 4.3441611842105265, + "grad_norm": 0.7423240134526744, + "learning_rate": 9.35630601962817e-06, + "loss": 3.5596, + "step": 42260 + }, + { + "epoch": 4.3451891447368425, + "grad_norm": 0.7304484318646076, + "learning_rate": 9.345279152710907e-06, + "loss": 3.4547, + "step": 42270 + }, + { + "epoch": 4.3462171052631575, + "grad_norm": 1.002463049267091, + "learning_rate": 9.3342654009696e-06, + "loss": 3.5444, + "step": 42280 + }, + { + "epoch": 4.3472450657894735, + "grad_norm": 0.9929716961064367, + "learning_rate": 9.323264768761015e-06, + "loss": 
3.4425, + "step": 42290 + }, + { + "epoch": 4.348273026315789, + "grad_norm": 1.2383732806642687, + "learning_rate": 9.312277260436723e-06, + "loss": 3.4716, + "step": 42300 + }, + { + "epoch": 4.349300986842105, + "grad_norm": 0.6539772961303693, + "learning_rate": 9.301302880343115e-06, + "loss": 3.4847, + "step": 42310 + }, + { + "epoch": 4.350328947368421, + "grad_norm": 0.8964299536582004, + "learning_rate": 9.29034163282137e-06, + "loss": 3.5255, + "step": 42320 + }, + { + "epoch": 4.351356907894737, + "grad_norm": 0.98488576444458, + "learning_rate": 9.279393522207493e-06, + "loss": 3.4437, + "step": 42330 + }, + { + "epoch": 4.352384868421053, + "grad_norm": 0.7033859616366586, + "learning_rate": 9.268458552832265e-06, + "loss": 3.526, + "step": 42340 + }, + { + "epoch": 4.353412828947368, + "grad_norm": 0.5936348201062357, + "learning_rate": 9.257536729021297e-06, + "loss": 3.514, + "step": 42350 + }, + { + "epoch": 4.354440789473684, + "grad_norm": 1.8643814773268017, + "learning_rate": 9.246628055095004e-06, + "loss": 3.5177, + "step": 42360 + }, + { + "epoch": 4.35546875, + "grad_norm": 0.7467423660429604, + "learning_rate": 9.235732535368553e-06, + "loss": 3.4788, + "step": 42370 + }, + { + "epoch": 4.356496710526316, + "grad_norm": 0.6153303041299187, + "learning_rate": 9.224850174151956e-06, + "loss": 3.4834, + "step": 42380 + }, + { + "epoch": 4.357524671052632, + "grad_norm": 0.5063666962710452, + "learning_rate": 9.21398097575e-06, + "loss": 3.4506, + "step": 42390 + }, + { + "epoch": 4.358552631578947, + "grad_norm": 1.0331158375780884, + "learning_rate": 9.203124944462267e-06, + "loss": 3.5118, + "step": 42400 + }, + { + "epoch": 4.359580592105263, + "grad_norm": 1.145747037007685, + "learning_rate": 9.192282084583127e-06, + "loss": 3.4534, + "step": 42410 + }, + { + "epoch": 4.360608552631579, + "grad_norm": 1.2442664257986515, + "learning_rate": 9.18145240040175e-06, + "loss": 3.4871, + "step": 42420 + }, + { + "epoch": 4.361636513157895, + "grad_norm": 0.8416141037217919, + "learning_rate": 9.17063589620208e-06, + "loss": 3.4384, + "step": 42430 + }, + { + "epoch": 4.362664473684211, + "grad_norm": 1.0248322628754105, + "learning_rate": 9.15983257626286e-06, + "loss": 3.4694, + "step": 42440 + }, + { + "epoch": 4.3636924342105265, + "grad_norm": 0.8455264499413999, + "learning_rate": 9.14904244485762e-06, + "loss": 3.4264, + "step": 42450 + }, + { + "epoch": 4.3647203947368425, + "grad_norm": 0.7410875702772636, + "learning_rate": 9.138265506254638e-06, + "loss": 3.4647, + "step": 42460 + }, + { + "epoch": 4.3657483552631575, + "grad_norm": 0.7056853197466312, + "learning_rate": 9.12750176471703e-06, + "loss": 3.4659, + "step": 42470 + }, + { + "epoch": 4.3667763157894735, + "grad_norm": 0.5451728488956825, + "learning_rate": 9.116751224502657e-06, + "loss": 3.4901, + "step": 42480 + }, + { + "epoch": 4.367804276315789, + "grad_norm": 0.7782013729256877, + "learning_rate": 9.106013889864152e-06, + "loss": 3.5081, + "step": 42490 + }, + { + "epoch": 4.368832236842105, + "grad_norm": 1.1483430888495638, + "learning_rate": 9.095289765048937e-06, + "loss": 3.4654, + "step": 42500 + }, + { + "epoch": 4.369860197368421, + "grad_norm": 1.3759107607830128, + "learning_rate": 9.084578854299214e-06, + "loss": 3.4833, + "step": 42510 + }, + { + "epoch": 4.370888157894737, + "grad_norm": 0.7053118711915802, + "learning_rate": 9.073881161851944e-06, + "loss": 3.4478, + "step": 42520 + }, + { + "epoch": 4.371916118421053, + "grad_norm": 0.6258511707804822, + "learning_rate": 
9.06319669193887e-06, + "loss": 3.4565, + "step": 42530 + }, + { + "epoch": 4.372944078947368, + "grad_norm": 1.0152306010993306, + "learning_rate": 9.052525448786506e-06, + "loss": 3.4672, + "step": 42540 + }, + { + "epoch": 4.373972039473684, + "grad_norm": 0.668127877538781, + "learning_rate": 9.041867436616108e-06, + "loss": 3.5284, + "step": 42550 + }, + { + "epoch": 4.375, + "grad_norm": 0.640809800080096, + "learning_rate": 9.031222659643736e-06, + "loss": 3.5074, + "step": 42560 + }, + { + "epoch": 4.376027960526316, + "grad_norm": 0.6867512830854885, + "learning_rate": 9.0205911220802e-06, + "loss": 3.4319, + "step": 42570 + }, + { + "epoch": 4.377055921052632, + "grad_norm": 0.5492774780043223, + "learning_rate": 9.009972828131058e-06, + "loss": 3.4721, + "step": 42580 + }, + { + "epoch": 4.378083881578947, + "grad_norm": 0.8207986027168442, + "learning_rate": 8.99936778199664e-06, + "loss": 3.4367, + "step": 42590 + }, + { + "epoch": 4.379111842105263, + "grad_norm": 1.336275024607955, + "learning_rate": 8.988775987872045e-06, + "loss": 3.485, + "step": 42600 + }, + { + "epoch": 4.380139802631579, + "grad_norm": 1.0199980270142266, + "learning_rate": 8.978197449947112e-06, + "loss": 3.5057, + "step": 42610 + }, + { + "epoch": 4.381167763157895, + "grad_norm": 0.6467749181900955, + "learning_rate": 8.967632172406454e-06, + "loss": 3.4815, + "step": 42620 + }, + { + "epoch": 4.382195723684211, + "grad_norm": 0.775151559407599, + "learning_rate": 8.957080159429427e-06, + "loss": 3.5379, + "step": 42630 + }, + { + "epoch": 4.3832236842105265, + "grad_norm": 0.9598868914867468, + "learning_rate": 8.946541415190137e-06, + "loss": 3.5273, + "step": 42640 + }, + { + "epoch": 4.3842516447368425, + "grad_norm": 0.8582793099954216, + "learning_rate": 8.936015943857453e-06, + "loss": 3.4375, + "step": 42650 + }, + { + "epoch": 4.3852796052631575, + "grad_norm": 0.679136074484914, + "learning_rate": 8.925503749594994e-06, + "loss": 3.4972, + "step": 42660 + }, + { + "epoch": 4.3863075657894735, + "grad_norm": 0.5426288885835353, + "learning_rate": 8.915004836561097e-06, + "loss": 3.4652, + "step": 42670 + }, + { + "epoch": 4.387335526315789, + "grad_norm": 0.6211385170979481, + "learning_rate": 8.904519208908892e-06, + "loss": 3.4404, + "step": 42680 + }, + { + "epoch": 4.388363486842105, + "grad_norm": 0.6124676452576928, + "learning_rate": 8.894046870786229e-06, + "loss": 3.4411, + "step": 42690 + }, + { + "epoch": 4.389391447368421, + "grad_norm": 0.8800396879956663, + "learning_rate": 8.883587826335687e-06, + "loss": 3.498, + "step": 42700 + }, + { + "epoch": 4.390419407894737, + "grad_norm": 0.8596935028787334, + "learning_rate": 8.873142079694616e-06, + "loss": 3.4037, + "step": 42710 + }, + { + "epoch": 4.391447368421053, + "grad_norm": 0.9185036257535737, + "learning_rate": 8.86270963499508e-06, + "loss": 3.5024, + "step": 42720 + }, + { + "epoch": 4.392475328947368, + "grad_norm": 1.1037134043933123, + "learning_rate": 8.852290496363898e-06, + "loss": 3.5325, + "step": 42730 + }, + { + "epoch": 4.393503289473684, + "grad_norm": 1.2049381415923752, + "learning_rate": 8.841884667922623e-06, + "loss": 3.4503, + "step": 42740 + }, + { + "epoch": 4.39453125, + "grad_norm": 0.6443587101117711, + "learning_rate": 8.831492153787534e-06, + "loss": 3.5127, + "step": 42750 + }, + { + "epoch": 4.395559210526316, + "grad_norm": 0.7522377565375772, + "learning_rate": 8.821112958069651e-06, + "loss": 3.5207, + "step": 42760 + }, + { + "epoch": 4.396587171052632, + "grad_norm": 0.7233858291925457, 
+ "learning_rate": 8.810747084874723e-06, + "loss": 3.5089, + "step": 42770 + }, + { + "epoch": 4.397615131578947, + "grad_norm": 0.9217457114901827, + "learning_rate": 8.800394538303243e-06, + "loss": 3.465, + "step": 42780 + }, + { + "epoch": 4.398643092105263, + "grad_norm": 0.6134007088524945, + "learning_rate": 8.790055322450396e-06, + "loss": 3.4505, + "step": 42790 + }, + { + "epoch": 4.399671052631579, + "grad_norm": 1.165179286553592, + "learning_rate": 8.779729441406117e-06, + "loss": 3.4603, + "step": 42800 + }, + { + "epoch": 4.400699013157895, + "grad_norm": 0.6516887322964152, + "learning_rate": 8.769416899255091e-06, + "loss": 3.4723, + "step": 42810 + }, + { + "epoch": 4.401726973684211, + "grad_norm": 0.7034745513092511, + "learning_rate": 8.759117700076682e-06, + "loss": 3.5124, + "step": 42820 + }, + { + "epoch": 4.4027549342105265, + "grad_norm": 0.7967990016226278, + "learning_rate": 8.748831847944991e-06, + "loss": 3.5081, + "step": 42830 + }, + { + "epoch": 4.4037828947368425, + "grad_norm": 0.7603840015310641, + "learning_rate": 8.738559346928858e-06, + "loss": 3.4459, + "step": 42840 + }, + { + "epoch": 4.4048108552631575, + "grad_norm": 0.8500333933722074, + "learning_rate": 8.728300201091811e-06, + "loss": 3.5205, + "step": 42850 + }, + { + "epoch": 4.4058388157894735, + "grad_norm": 0.8678142585228318, + "learning_rate": 8.718054414492123e-06, + "loss": 3.4602, + "step": 42860 + }, + { + "epoch": 4.406866776315789, + "grad_norm": 0.6988464941571262, + "learning_rate": 8.707821991182775e-06, + "loss": 3.548, + "step": 42870 + }, + { + "epoch": 4.407894736842105, + "grad_norm": 0.6643450192122149, + "learning_rate": 8.697602935211432e-06, + "loss": 3.5065, + "step": 42880 + }, + { + "epoch": 4.408922697368421, + "grad_norm": 0.6409978711856122, + "learning_rate": 8.687397250620518e-06, + "loss": 3.445, + "step": 42890 + }, + { + "epoch": 4.409950657894737, + "grad_norm": 0.7406949746599137, + "learning_rate": 8.677204941447146e-06, + "loss": 3.567, + "step": 42900 + }, + { + "epoch": 4.410978618421053, + "grad_norm": 1.0708929562328378, + "learning_rate": 8.667026011723127e-06, + "loss": 3.5388, + "step": 42910 + }, + { + "epoch": 4.412006578947368, + "grad_norm": 0.6141177504437899, + "learning_rate": 8.656860465474993e-06, + "loss": 3.4712, + "step": 42920 + }, + { + "epoch": 4.413034539473684, + "grad_norm": 0.6226447550671035, + "learning_rate": 8.646708306723983e-06, + "loss": 3.433, + "step": 42930 + }, + { + "epoch": 4.4140625, + "grad_norm": 0.9762994640875184, + "learning_rate": 8.636569539486034e-06, + "loss": 3.4723, + "step": 42940 + }, + { + "epoch": 4.415090460526316, + "grad_norm": 1.1480939701644648, + "learning_rate": 8.626444167771792e-06, + "loss": 3.5071, + "step": 42950 + }, + { + "epoch": 4.416118421052632, + "grad_norm": 0.555576632682675, + "learning_rate": 8.616332195586589e-06, + "loss": 3.4938, + "step": 42960 + }, + { + "epoch": 4.417146381578947, + "grad_norm": 0.8957851899687685, + "learning_rate": 8.606233626930483e-06, + "loss": 3.5119, + "step": 42970 + }, + { + "epoch": 4.418174342105263, + "grad_norm": 0.943816869779282, + "learning_rate": 8.5961484657982e-06, + "loss": 3.5033, + "step": 42980 + }, + { + "epoch": 4.419202302631579, + "grad_norm": 0.669102492974591, + "learning_rate": 8.586076716179193e-06, + "loss": 3.4927, + "step": 42990 + }, + { + "epoch": 4.420230263157895, + "grad_norm": 1.434051939376148, + "learning_rate": 8.576018382057573e-06, + "loss": 3.4552, + "step": 43000 + }, + { + "epoch": 4.421258223684211, + 
"grad_norm": 0.7546695541238418, + "learning_rate": 8.56597346741218e-06, + "loss": 3.4596, + "step": 43010 + }, + { + "epoch": 4.4222861842105265, + "grad_norm": 4.028707012689281, + "learning_rate": 8.555941976216533e-06, + "loss": 3.5005, + "step": 43020 + }, + { + "epoch": 4.4233141447368425, + "grad_norm": 1.1568092849342995, + "learning_rate": 8.545923912438822e-06, + "loss": 3.4853, + "step": 43030 + }, + { + "epoch": 4.4243421052631575, + "grad_norm": 0.8930579634028027, + "learning_rate": 8.535919280041955e-06, + "loss": 3.5006, + "step": 43040 + }, + { + "epoch": 4.4253700657894735, + "grad_norm": 0.7775535108790091, + "learning_rate": 8.525928082983505e-06, + "loss": 3.3226, + "step": 43050 + }, + { + "epoch": 4.426398026315789, + "grad_norm": 0.8177307919538288, + "learning_rate": 8.515950325215746e-06, + "loss": 3.3707, + "step": 43060 + }, + { + "epoch": 4.427425986842105, + "grad_norm": 0.8800389429708919, + "learning_rate": 8.505986010685627e-06, + "loss": 3.454, + "step": 43070 + }, + { + "epoch": 4.428453947368421, + "grad_norm": 0.7475920407457828, + "learning_rate": 8.496035143334789e-06, + "loss": 3.4267, + "step": 43080 + }, + { + "epoch": 4.429481907894737, + "grad_norm": 0.6122865811093823, + "learning_rate": 8.48609772709952e-06, + "loss": 3.5018, + "step": 43090 + }, + { + "epoch": 4.430509868421053, + "grad_norm": 0.940559164385686, + "learning_rate": 8.476173765910835e-06, + "loss": 3.3604, + "step": 43100 + }, + { + "epoch": 4.431537828947368, + "grad_norm": 0.7481486597359289, + "learning_rate": 8.466263263694407e-06, + "loss": 3.377, + "step": 43110 + }, + { + "epoch": 4.432565789473684, + "grad_norm": 0.8399755684955384, + "learning_rate": 8.456366224370569e-06, + "loss": 3.4969, + "step": 43120 + }, + { + "epoch": 4.43359375, + "grad_norm": 0.7501383892174515, + "learning_rate": 8.446482651854335e-06, + "loss": 3.5389, + "step": 43130 + }, + { + "epoch": 4.434621710526316, + "grad_norm": 0.8627701018519344, + "learning_rate": 8.436612550055425e-06, + "loss": 3.503, + "step": 43140 + }, + { + "epoch": 4.435649671052632, + "grad_norm": 1.0250936604949157, + "learning_rate": 8.42675592287818e-06, + "loss": 3.469, + "step": 43150 + }, + { + "epoch": 4.436677631578947, + "grad_norm": 0.9245178035488699, + "learning_rate": 8.416912774221646e-06, + "loss": 3.4582, + "step": 43160 + }, + { + "epoch": 4.437705592105263, + "grad_norm": 0.867962671184799, + "learning_rate": 8.407083107979521e-06, + "loss": 3.4963, + "step": 43170 + }, + { + "epoch": 4.438733552631579, + "grad_norm": 0.9322294857379725, + "learning_rate": 8.397266928040173e-06, + "loss": 3.4099, + "step": 43180 + }, + { + "epoch": 4.439761513157895, + "grad_norm": 0.7150072838338896, + "learning_rate": 8.387464238286642e-06, + "loss": 3.5017, + "step": 43190 + }, + { + "epoch": 4.440789473684211, + "grad_norm": 1.1539756469933669, + "learning_rate": 8.377675042596629e-06, + "loss": 3.5378, + "step": 43200 + }, + { + "epoch": 4.4418174342105265, + "grad_norm": 0.7573420057046462, + "learning_rate": 8.367899344842473e-06, + "loss": 3.5386, + "step": 43210 + }, + { + "epoch": 4.4428453947368425, + "grad_norm": 0.703415153822959, + "learning_rate": 8.35813714889122e-06, + "loss": 3.5045, + "step": 43220 + }, + { + "epoch": 4.4438733552631575, + "grad_norm": 0.6467678681638058, + "learning_rate": 8.348388458604548e-06, + "loss": 3.4102, + "step": 43230 + }, + { + "epoch": 4.4449013157894735, + "grad_norm": 0.7811197172252741, + "learning_rate": 8.338653277838777e-06, + "loss": 3.4556, + "step": 43240 + }, + 
{ + "epoch": 4.445929276315789, + "grad_norm": 0.7749025360472548, + "learning_rate": 8.328931610444908e-06, + "loss": 3.378, + "step": 43250 + }, + { + "epoch": 4.446957236842105, + "grad_norm": 1.0675674097365773, + "learning_rate": 8.3192234602686e-06, + "loss": 3.418, + "step": 43260 + }, + { + "epoch": 4.447985197368421, + "grad_norm": 0.8804760963925509, + "learning_rate": 8.309528831150142e-06, + "loss": 3.4517, + "step": 43270 + }, + { + "epoch": 4.449013157894737, + "grad_norm": 0.6041882911188422, + "learning_rate": 8.299847726924483e-06, + "loss": 3.547, + "step": 43280 + }, + { + "epoch": 4.450041118421053, + "grad_norm": 0.6811819856271549, + "learning_rate": 8.290180151421235e-06, + "loss": 3.5189, + "step": 43290 + }, + { + "epoch": 4.451069078947368, + "grad_norm": 0.7489770429782934, + "learning_rate": 8.280526108464645e-06, + "loss": 3.5208, + "step": 43300 + }, + { + "epoch": 4.452097039473684, + "grad_norm": 0.7196734342803127, + "learning_rate": 8.270885601873609e-06, + "loss": 3.4225, + "step": 43310 + }, + { + "epoch": 4.453125, + "grad_norm": 0.738358710529568, + "learning_rate": 8.261258635461674e-06, + "loss": 3.5062, + "step": 43320 + }, + { + "epoch": 4.454152960526316, + "grad_norm": 0.6703143068658267, + "learning_rate": 8.251645213037018e-06, + "loss": 3.4647, + "step": 43330 + }, + { + "epoch": 4.455180921052632, + "grad_norm": 0.6312424999206969, + "learning_rate": 8.242045338402464e-06, + "loss": 3.457, + "step": 43340 + }, + { + "epoch": 4.456208881578947, + "grad_norm": 0.6871607333491597, + "learning_rate": 8.232459015355507e-06, + "loss": 3.4964, + "step": 43350 + }, + { + "epoch": 4.457236842105263, + "grad_norm": 0.8999390038377654, + "learning_rate": 8.222886247688238e-06, + "loss": 3.4756, + "step": 43360 + }, + { + "epoch": 4.458264802631579, + "grad_norm": 0.8158769488795742, + "learning_rate": 8.213327039187402e-06, + "loss": 3.4275, + "step": 43370 + }, + { + "epoch": 4.459292763157895, + "grad_norm": 0.5626068543561409, + "learning_rate": 8.203781393634385e-06, + "loss": 3.5214, + "step": 43380 + }, + { + "epoch": 4.460320723684211, + "grad_norm": 0.6759624569007269, + "learning_rate": 8.19424931480521e-06, + "loss": 3.3897, + "step": 43390 + }, + { + "epoch": 4.4613486842105265, + "grad_norm": 0.7204384253065651, + "learning_rate": 8.184730806470525e-06, + "loss": 3.5047, + "step": 43400 + }, + { + "epoch": 4.4623766447368425, + "grad_norm": 0.7016575228562709, + "learning_rate": 8.175225872395617e-06, + "loss": 3.5539, + "step": 43410 + }, + { + "epoch": 4.4634046052631575, + "grad_norm": 0.6806461764247362, + "learning_rate": 8.165734516340382e-06, + "loss": 3.4458, + "step": 43420 + }, + { + "epoch": 4.4644325657894735, + "grad_norm": 1.0093842898670615, + "learning_rate": 8.156256742059385e-06, + "loss": 3.5423, + "step": 43430 + }, + { + "epoch": 4.465460526315789, + "grad_norm": 0.9187878464669296, + "learning_rate": 8.146792553301794e-06, + "loss": 3.4856, + "step": 43440 + }, + { + "epoch": 4.466488486842105, + "grad_norm": 0.7679095923611403, + "learning_rate": 8.137341953811389e-06, + "loss": 3.4129, + "step": 43450 + }, + { + "epoch": 4.467516447368421, + "grad_norm": 0.7304079571074286, + "learning_rate": 8.127904947326593e-06, + "loss": 3.4507, + "step": 43460 + }, + { + "epoch": 4.468544407894737, + "grad_norm": 0.5601716862763573, + "learning_rate": 8.118481537580466e-06, + "loss": 3.4311, + "step": 43470 + }, + { + "epoch": 4.469572368421053, + "grad_norm": 1.0488397750349543, + "learning_rate": 8.109071728300655e-06, + 
"loss": 3.4811, + "step": 43480 + }, + { + "epoch": 4.470600328947368, + "grad_norm": 1.325751483787643, + "learning_rate": 8.099675523209451e-06, + "loss": 3.5261, + "step": 43490 + }, + { + "epoch": 4.471628289473684, + "grad_norm": 0.8860720322069239, + "learning_rate": 8.090292926023754e-06, + "loss": 3.477, + "step": 43500 + }, + { + "epoch": 4.47265625, + "grad_norm": 0.7806203211941168, + "learning_rate": 8.080923940455093e-06, + "loss": 3.5245, + "step": 43510 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 0.9163237096325776, + "learning_rate": 8.071568570209592e-06, + "loss": 3.3554, + "step": 43520 + }, + { + "epoch": 4.474712171052632, + "grad_norm": 0.7427634163039314, + "learning_rate": 8.062226818988013e-06, + "loss": 3.4982, + "step": 43530 + }, + { + "epoch": 4.475740131578947, + "grad_norm": 1.3279763531245607, + "learning_rate": 8.052898690485702e-06, + "loss": 3.474, + "step": 43540 + }, + { + "epoch": 4.476768092105263, + "grad_norm": 1.1900915177199656, + "learning_rate": 8.043584188392647e-06, + "loss": 3.4137, + "step": 43550 + }, + { + "epoch": 4.477796052631579, + "grad_norm": 1.2023501192138604, + "learning_rate": 8.034283316393434e-06, + "loss": 3.3931, + "step": 43560 + }, + { + "epoch": 4.478824013157895, + "grad_norm": 0.670724393655124, + "learning_rate": 8.024996078167237e-06, + "loss": 3.4622, + "step": 43570 + }, + { + "epoch": 4.479851973684211, + "grad_norm": 0.6308166442193732, + "learning_rate": 8.015722477387863e-06, + "loss": 3.5201, + "step": 43580 + }, + { + "epoch": 4.4808799342105265, + "grad_norm": 0.8049656285476777, + "learning_rate": 8.00646251772373e-06, + "loss": 3.5378, + "step": 43590 + }, + { + "epoch": 4.4819078947368425, + "grad_norm": 0.8408073043260061, + "learning_rate": 7.997216202837829e-06, + "loss": 3.4558, + "step": 43600 + }, + { + "epoch": 4.4829358552631575, + "grad_norm": 0.7225881596009849, + "learning_rate": 7.987983536387772e-06, + "loss": 3.4187, + "step": 43610 + }, + { + "epoch": 4.4839638157894735, + "grad_norm": 0.95508542672391, + "learning_rate": 7.978764522025779e-06, + "loss": 3.5201, + "step": 43620 + }, + { + "epoch": 4.484991776315789, + "grad_norm": 0.6563791874476109, + "learning_rate": 7.969559163398641e-06, + "loss": 3.3781, + "step": 43630 + }, + { + "epoch": 4.486019736842105, + "grad_norm": 0.6620458161671802, + "learning_rate": 7.96036746414779e-06, + "loss": 3.3972, + "step": 43640 + }, + { + "epoch": 4.487047697368421, + "grad_norm": 1.026149911866902, + "learning_rate": 7.951189427909228e-06, + "loss": 3.4446, + "step": 43650 + }, + { + "epoch": 4.488075657894737, + "grad_norm": 0.8942513840163145, + "learning_rate": 7.942025058313542e-06, + "loss": 3.4726, + "step": 43660 + }, + { + "epoch": 4.489103618421053, + "grad_norm": 1.1504295355493719, + "learning_rate": 7.932874358985933e-06, + "loss": 3.5552, + "step": 43670 + }, + { + "epoch": 4.490131578947368, + "grad_norm": 0.6561453826712186, + "learning_rate": 7.9237373335462e-06, + "loss": 3.5133, + "step": 43680 + }, + { + "epoch": 4.491159539473684, + "grad_norm": 0.7171606653050904, + "learning_rate": 7.914613985608707e-06, + "loss": 3.4468, + "step": 43690 + }, + { + "epoch": 4.4921875, + "grad_norm": 0.8760797106041638, + "learning_rate": 7.905504318782427e-06, + "loss": 3.428, + "step": 43700 + }, + { + "epoch": 4.493215460526316, + "grad_norm": 0.9913736144120987, + "learning_rate": 7.896408336670914e-06, + "loss": 3.4467, + "step": 43710 + }, + { + "epoch": 4.494243421052632, + "grad_norm": 0.8716483980444093, + "learning_rate": 
7.88732604287231e-06, + "loss": 3.4933, + "step": 43720 + }, + { + "epoch": 4.495271381578947, + "grad_norm": 0.9427789201531588, + "learning_rate": 7.878257440979341e-06, + "loss": 3.5123, + "step": 43730 + }, + { + "epoch": 4.496299342105263, + "grad_norm": 0.681167700809745, + "learning_rate": 7.869202534579332e-06, + "loss": 3.524, + "step": 43740 + }, + { + "epoch": 4.497327302631579, + "grad_norm": 0.7293386438195012, + "learning_rate": 7.860161327254154e-06, + "loss": 3.5191, + "step": 43750 + }, + { + "epoch": 4.498355263157895, + "grad_norm": 0.6408779063760633, + "learning_rate": 7.851133822580297e-06, + "loss": 3.4935, + "step": 43760 + }, + { + "epoch": 4.499383223684211, + "grad_norm": 0.765652675517715, + "learning_rate": 7.84212002412882e-06, + "loss": 3.4883, + "step": 43770 + }, + { + "epoch": 4.5004111842105265, + "grad_norm": 0.5868927607423443, + "learning_rate": 7.833119935465343e-06, + "loss": 3.4514, + "step": 43780 + }, + { + "epoch": 4.5014391447368425, + "grad_norm": 0.8944638489230193, + "learning_rate": 7.824133560150073e-06, + "loss": 3.3681, + "step": 43790 + }, + { + "epoch": 4.5024671052631575, + "grad_norm": 0.9352806524837999, + "learning_rate": 7.815160901737811e-06, + "loss": 3.4236, + "step": 43800 + }, + { + "epoch": 4.5034950657894735, + "grad_norm": 0.6786468695171308, + "learning_rate": 7.806201963777904e-06, + "loss": 3.5033, + "step": 43810 + }, + { + "epoch": 4.504523026315789, + "grad_norm": 1.214655478261246, + "learning_rate": 7.797256749814285e-06, + "loss": 3.4868, + "step": 43820 + }, + { + "epoch": 4.505550986842105, + "grad_norm": 1.1462377123657692, + "learning_rate": 7.788325263385455e-06, + "loss": 3.3623, + "step": 43830 + }, + { + "epoch": 4.506578947368421, + "grad_norm": 0.7943656531263495, + "learning_rate": 7.779407508024491e-06, + "loss": 3.5221, + "step": 43840 + }, + { + "epoch": 4.507606907894737, + "grad_norm": 0.8210521814511426, + "learning_rate": 7.77050348725903e-06, + "loss": 3.4557, + "step": 43850 + }, + { + "epoch": 4.508634868421053, + "grad_norm": 0.8103621306779544, + "learning_rate": 7.761613204611282e-06, + "loss": 3.4381, + "step": 43860 + }, + { + "epoch": 4.509662828947368, + "grad_norm": 1.0425791494936891, + "learning_rate": 7.752736663598016e-06, + "loss": 3.5473, + "step": 43870 + }, + { + "epoch": 4.510690789473684, + "grad_norm": 1.019128069319607, + "learning_rate": 7.743873867730558e-06, + "loss": 3.4984, + "step": 43880 + }, + { + "epoch": 4.51171875, + "grad_norm": 0.6957545643230791, + "learning_rate": 7.735024820514838e-06, + "loss": 3.4594, + "step": 43890 + }, + { + "epoch": 4.512746710526316, + "grad_norm": 0.8893078550940892, + "learning_rate": 7.72618952545129e-06, + "loss": 3.5883, + "step": 43900 + }, + { + "epoch": 4.513774671052632, + "grad_norm": 0.6547743009141074, + "learning_rate": 7.717367986034947e-06, + "loss": 3.4886, + "step": 43910 + }, + { + "epoch": 4.514802631578947, + "grad_norm": 0.9731315462630926, + "learning_rate": 7.708560205755385e-06, + "loss": 3.51, + "step": 43920 + }, + { + "epoch": 4.515830592105263, + "grad_norm": 0.947793872271626, + "learning_rate": 7.69976618809674e-06, + "loss": 3.5603, + "step": 43930 + }, + { + "epoch": 4.516858552631579, + "grad_norm": 0.8923212929800275, + "learning_rate": 7.690985936537708e-06, + "loss": 3.5369, + "step": 43940 + }, + { + "epoch": 4.517886513157895, + "grad_norm": 1.2693850858617297, + "learning_rate": 7.68221945455154e-06, + "loss": 3.5332, + "step": 43950 + }, + { + "epoch": 4.518914473684211, + "grad_norm": 
0.6122254924269513, + "learning_rate": 7.673466745606024e-06, + "loss": 3.483, + "step": 43960 + }, + { + "epoch": 4.5199424342105265, + "grad_norm": 0.5599597421293403, + "learning_rate": 7.66472781316352e-06, + "loss": 3.4397, + "step": 43970 + }, + { + "epoch": 4.5209703947368425, + "grad_norm": 0.5572990672490618, + "learning_rate": 7.65600266068094e-06, + "loss": 3.5412, + "step": 43980 + }, + { + "epoch": 4.5219983552631575, + "grad_norm": 0.7104458209084339, + "learning_rate": 7.647291291609717e-06, + "loss": 3.4721, + "step": 43990 + }, + { + "epoch": 4.5230263157894735, + "grad_norm": 0.9605746010420131, + "learning_rate": 7.638593709395856e-06, + "loss": 3.6085, + "step": 44000 + }, + { + "epoch": 4.524054276315789, + "grad_norm": 0.5868587372611668, + "learning_rate": 7.629909917479913e-06, + "loss": 3.4655, + "step": 44010 + }, + { + "epoch": 4.525082236842105, + "grad_norm": 0.8865784521486353, + "learning_rate": 7.6212399192969685e-06, + "loss": 3.4378, + "step": 44020 + }, + { + "epoch": 4.526110197368421, + "grad_norm": 0.630125699786566, + "learning_rate": 7.612583718276653e-06, + "loss": 3.3741, + "step": 44030 + }, + { + "epoch": 4.527138157894737, + "grad_norm": 0.8927542014043177, + "learning_rate": 7.603941317843152e-06, + "loss": 3.5062, + "step": 44040 + }, + { + "epoch": 4.528166118421053, + "grad_norm": 0.7598300308347662, + "learning_rate": 7.595312721415173e-06, + "loss": 3.5521, + "step": 44050 + }, + { + "epoch": 4.529194078947368, + "grad_norm": 0.6456571974577751, + "learning_rate": 7.586697932405982e-06, + "loss": 3.3991, + "step": 44060 + }, + { + "epoch": 4.530222039473684, + "grad_norm": 0.726794831241185, + "learning_rate": 7.578096954223374e-06, + "loss": 3.4758, + "step": 44070 + }, + { + "epoch": 4.53125, + "grad_norm": 0.7325405177344668, + "learning_rate": 7.569509790269659e-06, + "loss": 3.3872, + "step": 44080 + }, + { + "epoch": 4.532277960526316, + "grad_norm": 0.720149129682133, + "learning_rate": 7.56093644394173e-06, + "loss": 3.5404, + "step": 44090 + }, + { + "epoch": 4.533305921052632, + "grad_norm": 0.6659884852811528, + "learning_rate": 7.552376918630981e-06, + "loss": 3.5095, + "step": 44100 + }, + { + "epoch": 4.534333881578947, + "grad_norm": 0.7478669351037389, + "learning_rate": 7.543831217723338e-06, + "loss": 3.4365, + "step": 44110 + }, + { + "epoch": 4.535361842105263, + "grad_norm": 0.8225196210497786, + "learning_rate": 7.535299344599263e-06, + "loss": 3.4702, + "step": 44120 + }, + { + "epoch": 4.536389802631579, + "grad_norm": 0.6645043193599256, + "learning_rate": 7.5267813026337685e-06, + "loss": 3.4479, + "step": 44130 + }, + { + "epoch": 4.537417763157895, + "grad_norm": 0.7920452434111125, + "learning_rate": 7.51827709519636e-06, + "loss": 3.461, + "step": 44140 + }, + { + "epoch": 4.538445723684211, + "grad_norm": 0.7468526685241192, + "learning_rate": 7.5097867256511e-06, + "loss": 3.5002, + "step": 44150 + }, + { + "epoch": 4.5394736842105265, + "grad_norm": 0.8553276468203411, + "learning_rate": 7.501310197356566e-06, + "loss": 3.4887, + "step": 44160 + }, + { + "epoch": 4.5405016447368425, + "grad_norm": 0.8915727825888865, + "learning_rate": 7.492847513665844e-06, + "loss": 3.4432, + "step": 44170 + }, + { + "epoch": 4.5415296052631575, + "grad_norm": 0.748481723697328, + "learning_rate": 7.4843986779265795e-06, + "loss": 3.4464, + "step": 44180 + }, + { + "epoch": 4.5425575657894735, + "grad_norm": 0.9479069691815228, + "learning_rate": 7.475963693480914e-06, + "loss": 3.3514, + "step": 44190 + }, + { + "epoch": 
4.543585526315789, + "grad_norm": 1.1906389110483415, + "learning_rate": 7.467542563665513e-06, + "loss": 3.4528, + "step": 44200 + }, + { + "epoch": 4.544613486842105, + "grad_norm": 0.9370402162322319, + "learning_rate": 7.4591352918115555e-06, + "loss": 3.4311, + "step": 44210 + }, + { + "epoch": 4.545641447368421, + "grad_norm": 0.641607391934788, + "learning_rate": 7.450741881244767e-06, + "loss": 3.4334, + "step": 44220 + }, + { + "epoch": 4.546669407894737, + "grad_norm": 0.72049584031079, + "learning_rate": 7.442362335285354e-06, + "loss": 3.4499, + "step": 44230 + }, + { + "epoch": 4.547697368421053, + "grad_norm": 0.7286826752698347, + "learning_rate": 7.4339966572480624e-06, + "loss": 3.5791, + "step": 44240 + }, + { + "epoch": 4.548725328947368, + "grad_norm": 0.7772465554053983, + "learning_rate": 7.425644850442144e-06, + "loss": 3.4527, + "step": 44250 + }, + { + "epoch": 4.549753289473684, + "grad_norm": 1.0120519141048356, + "learning_rate": 7.417306918171363e-06, + "loss": 3.4415, + "step": 44260 + }, + { + "epoch": 4.55078125, + "grad_norm": 0.816476882726451, + "learning_rate": 7.408982863733993e-06, + "loss": 3.526, + "step": 44270 + }, + { + "epoch": 4.551809210526316, + "grad_norm": 0.8838245700847976, + "learning_rate": 7.4006726904228295e-06, + "loss": 3.5014, + "step": 44280 + }, + { + "epoch": 4.552837171052632, + "grad_norm": 0.7679818525777966, + "learning_rate": 7.392376401525156e-06, + "loss": 3.4202, + "step": 44290 + }, + { + "epoch": 4.553865131578947, + "grad_norm": 0.6232312925050535, + "learning_rate": 7.384094000322786e-06, + "loss": 3.4401, + "step": 44300 + }, + { + "epoch": 4.554893092105263, + "grad_norm": 0.7275702348213903, + "learning_rate": 7.375825490092033e-06, + "loss": 3.4495, + "step": 44310 + }, + { + "epoch": 4.555921052631579, + "grad_norm": 0.7938451199344978, + "learning_rate": 7.367570874103704e-06, + "loss": 3.4857, + "step": 44320 + }, + { + "epoch": 4.556949013157895, + "grad_norm": 0.5979106835211039, + "learning_rate": 7.359330155623112e-06, + "loss": 3.4407, + "step": 44330 + }, + { + "epoch": 4.557976973684211, + "grad_norm": 0.8674534913823658, + "learning_rate": 7.351103337910102e-06, + "loss": 3.4715, + "step": 44340 + }, + { + "epoch": 4.5590049342105265, + "grad_norm": 0.5475752842759106, + "learning_rate": 7.342890424218971e-06, + "loss": 3.4517, + "step": 44350 + }, + { + "epoch": 4.5600328947368425, + "grad_norm": 1.7151492461500877, + "learning_rate": 7.334691417798561e-06, + "loss": 3.5853, + "step": 44360 + }, + { + "epoch": 4.5610608552631575, + "grad_norm": 1.0467878846318328, + "learning_rate": 7.326506321892176e-06, + "loss": 3.3758, + "step": 44370 + }, + { + "epoch": 4.5620888157894735, + "grad_norm": 0.7303577685578304, + "learning_rate": 7.3183351397376475e-06, + "loss": 3.4457, + "step": 44380 + }, + { + "epoch": 4.563116776315789, + "grad_norm": 0.9264212219191714, + "learning_rate": 7.310177874567283e-06, + "loss": 3.4847, + "step": 44390 + }, + { + "epoch": 4.564144736842105, + "grad_norm": 0.7185601107990712, + "learning_rate": 7.302034529607909e-06, + "loss": 3.5025, + "step": 44400 + }, + { + "epoch": 4.565172697368421, + "grad_norm": 0.6307560744611099, + "learning_rate": 7.293905108080802e-06, + "loss": 3.4389, + "step": 44410 + }, + { + "epoch": 4.566200657894737, + "grad_norm": 0.7849883853757746, + "learning_rate": 7.285789613201763e-06, + "loss": 3.5069, + "step": 44420 + }, + { + "epoch": 4.567228618421053, + "grad_norm": 1.0200974218742251, + "learning_rate": 7.277688048181102e-06, + "loss": 
3.395, + "step": 44430 + }, + { + "epoch": 4.568256578947368, + "grad_norm": 0.834130267709282, + "learning_rate": 7.2696004162235714e-06, + "loss": 3.4963, + "step": 44440 + }, + { + "epoch": 4.569284539473684, + "grad_norm": 0.911218669808019, + "learning_rate": 7.261526720528439e-06, + "loss": 3.4387, + "step": 44450 + }, + { + "epoch": 4.5703125, + "grad_norm": 0.703450657345701, + "learning_rate": 7.253466964289458e-06, + "loss": 3.4891, + "step": 44460 + }, + { + "epoch": 4.571340460526316, + "grad_norm": 1.395679930422955, + "learning_rate": 7.245421150694871e-06, + "loss": 3.4429, + "step": 44470 + }, + { + "epoch": 4.572368421052632, + "grad_norm": 0.7150239560975747, + "learning_rate": 7.237389282927398e-06, + "loss": 3.4189, + "step": 44480 + }, + { + "epoch": 4.573396381578947, + "grad_norm": 0.830379440371545, + "learning_rate": 7.229371364164247e-06, + "loss": 3.4669, + "step": 44490 + }, + { + "epoch": 4.574424342105263, + "grad_norm": 0.6982837797995302, + "learning_rate": 7.2213673975770905e-06, + "loss": 3.432, + "step": 44500 + }, + { + "epoch": 4.575452302631579, + "grad_norm": 0.9111018364681074, + "learning_rate": 7.213377386332111e-06, + "loss": 3.37, + "step": 44510 + }, + { + "epoch": 4.576480263157895, + "grad_norm": 0.6457651061249076, + "learning_rate": 7.205401333589961e-06, + "loss": 3.4418, + "step": 44520 + }, + { + "epoch": 4.577508223684211, + "grad_norm": 0.52245808538211, + "learning_rate": 7.197439242505751e-06, + "loss": 3.4365, + "step": 44530 + }, + { + "epoch": 4.5785361842105265, + "grad_norm": 0.9879773543461438, + "learning_rate": 7.189491116229092e-06, + "loss": 3.4569, + "step": 44540 + }, + { + "epoch": 4.5795641447368425, + "grad_norm": 0.8179283372574663, + "learning_rate": 7.181556957904073e-06, + "loss": 3.4186, + "step": 44550 + }, + { + "epoch": 4.5805921052631575, + "grad_norm": 0.9701240086186075, + "learning_rate": 7.173636770669229e-06, + "loss": 3.4298, + "step": 44560 + }, + { + "epoch": 4.5816200657894735, + "grad_norm": 0.8756333170896748, + "learning_rate": 7.165730557657602e-06, + "loss": 3.4728, + "step": 44570 + }, + { + "epoch": 4.582648026315789, + "grad_norm": 0.7361785979694162, + "learning_rate": 7.157838321996685e-06, + "loss": 3.4624, + "step": 44580 + }, + { + "epoch": 4.583675986842105, + "grad_norm": 1.9875164247230013, + "learning_rate": 7.149960066808456e-06, + "loss": 3.4259, + "step": 44590 + }, + { + "epoch": 4.584703947368421, + "grad_norm": 1.050758219753719, + "learning_rate": 7.142095795209343e-06, + "loss": 3.5028, + "step": 44600 + }, + { + "epoch": 4.585731907894737, + "grad_norm": 0.920130679655742, + "learning_rate": 7.134245510310269e-06, + "loss": 3.5462, + "step": 44610 + }, + { + "epoch": 4.586759868421053, + "grad_norm": 0.6622679528927522, + "learning_rate": 7.126409215216594e-06, + "loss": 3.3715, + "step": 44620 + }, + { + "epoch": 4.587787828947368, + "grad_norm": 1.16540746627702, + "learning_rate": 7.118586913028173e-06, + "loss": 3.5659, + "step": 44630 + }, + { + "epoch": 4.588815789473684, + "grad_norm": 0.7648268061661596, + "learning_rate": 7.110778606839308e-06, + "loss": 3.4655, + "step": 44640 + }, + { + "epoch": 4.58984375, + "grad_norm": 1.0613176274403497, + "learning_rate": 7.102984299738769e-06, + "loss": 3.472, + "step": 44650 + }, + { + "epoch": 4.590871710526316, + "grad_norm": 0.914975647576208, + "learning_rate": 7.095203994809785e-06, + "loss": 3.4823, + "step": 44660 + }, + { + "epoch": 4.591899671052632, + "grad_norm": 0.8197136282383264, + "learning_rate": 
7.087437695130053e-06, + "loss": 3.4966, + "step": 44670 + }, + { + "epoch": 4.592927631578947, + "grad_norm": 0.9912737445945821, + "learning_rate": 7.079685403771725e-06, + "loss": 3.5291, + "step": 44680 + }, + { + "epoch": 4.593955592105263, + "grad_norm": 0.7618298180618029, + "learning_rate": 7.071947123801412e-06, + "loss": 3.4363, + "step": 44690 + }, + { + "epoch": 4.594983552631579, + "grad_norm": 0.7846660317264712, + "learning_rate": 7.0642228582801966e-06, + "loss": 3.568, + "step": 44700 + }, + { + "epoch": 4.596011513157895, + "grad_norm": 0.8432122035418331, + "learning_rate": 7.056512610263575e-06, + "loss": 3.5123, + "step": 44710 + }, + { + "epoch": 4.597039473684211, + "grad_norm": 0.5701141535072843, + "learning_rate": 7.048816382801555e-06, + "loss": 3.437, + "step": 44720 + }, + { + "epoch": 4.5980674342105265, + "grad_norm": 0.7203956092667838, + "learning_rate": 7.041134178938567e-06, + "loss": 3.4955, + "step": 44730 + }, + { + "epoch": 4.5990953947368425, + "grad_norm": 0.8372870630038217, + "learning_rate": 7.033466001713489e-06, + "loss": 3.4574, + "step": 44740 + }, + { + "epoch": 4.6001233552631575, + "grad_norm": 0.7234153266293479, + "learning_rate": 7.0258118541596595e-06, + "loss": 3.5122, + "step": 44750 + }, + { + "epoch": 4.6011513157894735, + "grad_norm": 0.7128392031165351, + "learning_rate": 7.018171739304879e-06, + "loss": 3.4625, + "step": 44760 + }, + { + "epoch": 4.602179276315789, + "grad_norm": 0.5791660935469978, + "learning_rate": 7.010545660171376e-06, + "loss": 3.4432, + "step": 44770 + }, + { + "epoch": 4.603207236842105, + "grad_norm": 1.245954069243154, + "learning_rate": 7.002933619775837e-06, + "loss": 3.4606, + "step": 44780 + }, + { + "epoch": 4.604235197368421, + "grad_norm": 0.6922504637521366, + "learning_rate": 6.995335621129393e-06, + "loss": 3.4681, + "step": 44790 + }, + { + "epoch": 4.605263157894737, + "grad_norm": 0.7111415360650907, + "learning_rate": 6.987751667237627e-06, + "loss": 3.5133, + "step": 44800 + }, + { + "epoch": 4.606291118421053, + "grad_norm": 0.9229575121576288, + "learning_rate": 6.980181761100557e-06, + "loss": 3.5282, + "step": 44810 + }, + { + "epoch": 4.607319078947368, + "grad_norm": 0.68960660355442, + "learning_rate": 6.972625905712656e-06, + "loss": 3.4031, + "step": 44820 + }, + { + "epoch": 4.608347039473684, + "grad_norm": 0.7197644496764452, + "learning_rate": 6.96508410406281e-06, + "loss": 3.46, + "step": 44830 + }, + { + "epoch": 4.609375, + "grad_norm": 0.8437849673336967, + "learning_rate": 6.95755635913439e-06, + "loss": 3.5164, + "step": 44840 + }, + { + "epoch": 4.610402960526316, + "grad_norm": 0.8686288762902957, + "learning_rate": 6.950042673905174e-06, + "loss": 3.5703, + "step": 44850 + }, + { + "epoch": 4.611430921052632, + "grad_norm": 0.99436681306491, + "learning_rate": 6.942543051347386e-06, + "loss": 3.4826, + "step": 44860 + }, + { + "epoch": 4.612458881578947, + "grad_norm": 0.8133853710117797, + "learning_rate": 6.935057494427681e-06, + "loss": 3.4688, + "step": 44870 + }, + { + "epoch": 4.613486842105263, + "grad_norm": 0.6475545547950783, + "learning_rate": 6.927586006107178e-06, + "loss": 3.4139, + "step": 44880 + }, + { + "epoch": 4.614514802631579, + "grad_norm": 0.638451914575553, + "learning_rate": 6.920128589341393e-06, + "loss": 3.5242, + "step": 44890 + }, + { + "epoch": 4.615542763157895, + "grad_norm": 0.6436052780063568, + "learning_rate": 6.912685247080298e-06, + "loss": 3.5123, + "step": 44900 + }, + { + "epoch": 4.616570723684211, + "grad_norm": 
0.8450495989762491, + "learning_rate": 6.905255982268294e-06, + "loss": 3.6044, + "step": 44910 + }, + { + "epoch": 4.6175986842105265, + "grad_norm": 0.8108743764311274, + "learning_rate": 6.8978407978442106e-06, + "loss": 3.4187, + "step": 44920 + }, + { + "epoch": 4.6186266447368425, + "grad_norm": 0.671009193395805, + "learning_rate": 6.890439696741309e-06, + "loss": 3.4612, + "step": 44930 + }, + { + "epoch": 4.6196546052631575, + "grad_norm": 0.6300028605244803, + "learning_rate": 6.883052681887282e-06, + "loss": 3.4129, + "step": 44940 + }, + { + "epoch": 4.6206825657894735, + "grad_norm": 1.2196027755534318, + "learning_rate": 6.87567975620424e-06, + "loss": 3.474, + "step": 44950 + }, + { + "epoch": 4.621710526315789, + "grad_norm": 0.8054329835515409, + "learning_rate": 6.868320922608726e-06, + "loss": 3.5274, + "step": 44960 + }, + { + "epoch": 4.622738486842105, + "grad_norm": 0.8366615510938675, + "learning_rate": 6.86097618401173e-06, + "loss": 3.4472, + "step": 44970 + }, + { + "epoch": 4.623766447368421, + "grad_norm": 0.5346793953009877, + "learning_rate": 6.8536455433186255e-06, + "loss": 3.4474, + "step": 44980 + }, + { + "epoch": 4.624794407894737, + "grad_norm": 0.6768904606892987, + "learning_rate": 6.846329003429238e-06, + "loss": 3.4586, + "step": 44990 + }, + { + "epoch": 4.625822368421053, + "grad_norm": 0.8933130489959663, + "learning_rate": 6.839026567237801e-06, + "loss": 3.4228, + "step": 45000 + }, + { + "epoch": 4.626850328947368, + "grad_norm": 1.059129755220844, + "learning_rate": 6.831738237632984e-06, + "loss": 3.3999, + "step": 45010 + }, + { + "epoch": 4.627878289473684, + "grad_norm": 0.8659612904436222, + "learning_rate": 6.824464017497864e-06, + "loss": 3.4714, + "step": 45020 + }, + { + "epoch": 4.62890625, + "grad_norm": 0.7348892359411539, + "learning_rate": 6.817203909709943e-06, + "loss": 3.4513, + "step": 45030 + }, + { + "epoch": 4.629934210526316, + "grad_norm": 0.6626177386260387, + "learning_rate": 6.809957917141125e-06, + "loss": 3.4438, + "step": 45040 + }, + { + "epoch": 4.630962171052632, + "grad_norm": 0.7849491662374148, + "learning_rate": 6.802726042657755e-06, + "loss": 3.4503, + "step": 45050 + }, + { + "epoch": 4.631990131578947, + "grad_norm": 0.764635887384037, + "learning_rate": 6.795508289120587e-06, + "loss": 3.4606, + "step": 45060 + }, + { + "epoch": 4.633018092105263, + "grad_norm": 1.0014829964110847, + "learning_rate": 6.788304659384765e-06, + "loss": 3.4947, + "step": 45070 + }, + { + "epoch": 4.634046052631579, + "grad_norm": 0.853555708932547, + "learning_rate": 6.781115156299869e-06, + "loss": 3.4562, + "step": 45080 + }, + { + "epoch": 4.635074013157895, + "grad_norm": 1.1820038544580704, + "learning_rate": 6.773939782709906e-06, + "loss": 3.4994, + "step": 45090 + }, + { + "epoch": 4.636101973684211, + "grad_norm": 0.9669781755708525, + "learning_rate": 6.7667785414532475e-06, + "loss": 3.5761, + "step": 45100 + }, + { + "epoch": 4.6371299342105265, + "grad_norm": 0.6686747874096131, + "learning_rate": 6.759631435362713e-06, + "loss": 3.4607, + "step": 45110 + }, + { + "epoch": 4.6381578947368425, + "grad_norm": 0.8166664840412584, + "learning_rate": 6.752498467265523e-06, + "loss": 3.4878, + "step": 45120 + }, + { + "epoch": 4.6391858552631575, + "grad_norm": 0.916344524623498, + "learning_rate": 6.745379639983287e-06, + "loss": 3.5434, + "step": 45130 + }, + { + "epoch": 4.6402138157894735, + "grad_norm": 0.8025821439348507, + "learning_rate": 6.7382749563320485e-06, + "loss": 3.4754, + "step": 45140 + }, + { + 
"epoch": 4.641241776315789, + "grad_norm": 0.5700029552115411, + "learning_rate": 6.73118441912224e-06, + "loss": 3.4739, + "step": 45150 + }, + { + "epoch": 4.642269736842105, + "grad_norm": 0.7863699419146659, + "learning_rate": 6.7241080311586846e-06, + "loss": 3.506, + "step": 45160 + }, + { + "epoch": 4.643297697368421, + "grad_norm": 0.9224816928380576, + "learning_rate": 6.717045795240641e-06, + "loss": 3.4068, + "step": 45170 + }, + { + "epoch": 4.644325657894737, + "grad_norm": 0.6321476634622576, + "learning_rate": 6.7099977141617554e-06, + "loss": 3.4517, + "step": 45180 + }, + { + "epoch": 4.645353618421053, + "grad_norm": 0.6136526938876053, + "learning_rate": 6.702963790710054e-06, + "loss": 3.5567, + "step": 45190 + }, + { + "epoch": 4.646381578947368, + "grad_norm": 0.6959761763241527, + "learning_rate": 6.695944027667993e-06, + "loss": 3.451, + "step": 45200 + }, + { + "epoch": 4.647409539473684, + "grad_norm": 0.6287926991944012, + "learning_rate": 6.688938427812414e-06, + "loss": 3.4034, + "step": 45210 + }, + { + "epoch": 4.6484375, + "grad_norm": 0.8016643957620075, + "learning_rate": 6.68194699391455e-06, + "loss": 3.5334, + "step": 45220 + }, + { + "epoch": 4.649465460526316, + "grad_norm": 0.6235610371045146, + "learning_rate": 6.674969728740046e-06, + "loss": 3.446, + "step": 45230 + }, + { + "epoch": 4.650493421052632, + "grad_norm": 1.1058280010617063, + "learning_rate": 6.6680066350489305e-06, + "loss": 3.4551, + "step": 45240 + }, + { + "epoch": 4.651521381578947, + "grad_norm": 0.8982440906163588, + "learning_rate": 6.661057715595615e-06, + "loss": 3.4637, + "step": 45250 + }, + { + "epoch": 4.652549342105263, + "grad_norm": 0.8790938452682994, + "learning_rate": 6.654122973128937e-06, + "loss": 3.4989, + "step": 45260 + }, + { + "epoch": 4.653577302631579, + "grad_norm": 0.6419038104209711, + "learning_rate": 6.647202410392104e-06, + "loss": 3.4432, + "step": 45270 + }, + { + "epoch": 4.654605263157895, + "grad_norm": 0.5658541389127221, + "learning_rate": 6.640296030122706e-06, + "loss": 3.4945, + "step": 45280 + }, + { + "epoch": 4.655633223684211, + "grad_norm": 0.6635616657790429, + "learning_rate": 6.633403835052734e-06, + "loss": 3.4834, + "step": 45290 + }, + { + "epoch": 4.6566611842105265, + "grad_norm": 0.6919200733668659, + "learning_rate": 6.626525827908588e-06, + "loss": 3.4671, + "step": 45300 + }, + { + "epoch": 4.6576891447368425, + "grad_norm": 0.5426549210382473, + "learning_rate": 6.61966201141101e-06, + "loss": 3.3666, + "step": 45310 + }, + { + "epoch": 4.6587171052631575, + "grad_norm": 0.790103452343116, + "learning_rate": 6.612812388275166e-06, + "loss": 3.4357, + "step": 45320 + }, + { + "epoch": 4.6597450657894735, + "grad_norm": 0.5250719373100315, + "learning_rate": 6.605976961210596e-06, + "loss": 3.4894, + "step": 45330 + }, + { + "epoch": 4.660773026315789, + "grad_norm": 0.5730041581597168, + "learning_rate": 6.599155732921219e-06, + "loss": 3.4944, + "step": 45340 + }, + { + "epoch": 4.661800986842105, + "grad_norm": 0.9803588015244266, + "learning_rate": 6.592348706105341e-06, + "loss": 3.553, + "step": 45350 + }, + { + "epoch": 4.662828947368421, + "grad_norm": 0.7711021152340186, + "learning_rate": 6.585555883455663e-06, + "loss": 3.5142, + "step": 45360 + }, + { + "epoch": 4.663856907894737, + "grad_norm": 1.0957491995904058, + "learning_rate": 6.5787772676592324e-06, + "loss": 3.447, + "step": 45370 + }, + { + "epoch": 4.664884868421053, + "grad_norm": 0.677394792862074, + "learning_rate": 6.572012861397521e-06, + "loss": 
3.4906, + "step": 45380 + }, + { + "epoch": 4.665912828947368, + "grad_norm": 0.6719292463537812, + "learning_rate": 6.565262667346352e-06, + "loss": 3.4434, + "step": 45390 + }, + { + "epoch": 4.666940789473684, + "grad_norm": 0.7676575985987313, + "learning_rate": 6.55852668817593e-06, + "loss": 3.5253, + "step": 45400 + }, + { + "epoch": 4.66796875, + "grad_norm": 0.8279180088105927, + "learning_rate": 6.551804926550832e-06, + "loss": 3.4192, + "step": 45410 + }, + { + "epoch": 4.668996710526316, + "grad_norm": 1.3072703031742205, + "learning_rate": 6.54509738513004e-06, + "loss": 3.416, + "step": 45420 + }, + { + "epoch": 4.670024671052632, + "grad_norm": 0.6966877741407165, + "learning_rate": 6.538404066566869e-06, + "loss": 3.5066, + "step": 45430 + }, + { + "epoch": 4.671052631578947, + "grad_norm": 0.8990214564740625, + "learning_rate": 6.531724973509035e-06, + "loss": 3.455, + "step": 45440 + }, + { + "epoch": 4.672080592105263, + "grad_norm": 0.8498373142397819, + "learning_rate": 6.525060108598619e-06, + "loss": 3.4566, + "step": 45450 + }, + { + "epoch": 4.673108552631579, + "grad_norm": 0.817897365774401, + "learning_rate": 6.518409474472078e-06, + "loss": 3.5012, + "step": 45460 + }, + { + "epoch": 4.674136513157895, + "grad_norm": 0.5731712440379307, + "learning_rate": 6.511773073760232e-06, + "loss": 3.5096, + "step": 45470 + }, + { + "epoch": 4.675164473684211, + "grad_norm": 0.7024852385276673, + "learning_rate": 6.505150909088277e-06, + "loss": 3.4631, + "step": 45480 + }, + { + "epoch": 4.6761924342105265, + "grad_norm": 1.3703850112573015, + "learning_rate": 6.498542983075774e-06, + "loss": 3.524, + "step": 45490 + }, + { + "epoch": 4.6772203947368425, + "grad_norm": 0.9531966323199954, + "learning_rate": 6.491949298336644e-06, + "loss": 3.371, + "step": 45500 + }, + { + "epoch": 4.6782483552631575, + "grad_norm": 0.5200585292466987, + "learning_rate": 6.485369857479205e-06, + "loss": 3.4261, + "step": 45510 + }, + { + "epoch": 4.6792763157894735, + "grad_norm": 1.1250940389536124, + "learning_rate": 6.4788046631060996e-06, + "loss": 3.4919, + "step": 45520 + }, + { + "epoch": 4.680304276315789, + "grad_norm": 0.656733992892307, + "learning_rate": 6.47225371781436e-06, + "loss": 3.5164, + "step": 45530 + }, + { + "epoch": 4.681332236842105, + "grad_norm": 0.671367431363584, + "learning_rate": 6.465717024195374e-06, + "loss": 3.4901, + "step": 45540 + }, + { + "epoch": 4.682360197368421, + "grad_norm": 0.7143248044966115, + "learning_rate": 6.459194584834899e-06, + "loss": 3.4358, + "step": 45550 + }, + { + "epoch": 4.683388157894737, + "grad_norm": 0.684347249692116, + "learning_rate": 6.452686402313042e-06, + "loss": 3.4146, + "step": 45560 + }, + { + "epoch": 4.684416118421053, + "grad_norm": 0.7740144985625175, + "learning_rate": 6.44619247920429e-06, + "loss": 3.4791, + "step": 45570 + }, + { + "epoch": 4.685444078947368, + "grad_norm": 0.6820820469417881, + "learning_rate": 6.439712818077454e-06, + "loss": 3.4013, + "step": 45580 + }, + { + "epoch": 4.686472039473684, + "grad_norm": 1.240052601867582, + "learning_rate": 6.4332474214957375e-06, + "loss": 3.5186, + "step": 45590 + }, + { + "epoch": 4.6875, + "grad_norm": 0.6757345018038524, + "learning_rate": 6.426796292016699e-06, + "loss": 3.4282, + "step": 45600 + }, + { + "epoch": 4.688527960526316, + "grad_norm": 0.814418801149302, + "learning_rate": 6.420359432192229e-06, + "loss": 3.3887, + "step": 45610 + }, + { + "epoch": 4.689555921052632, + "grad_norm": 0.8029787011985762, + "learning_rate": 
6.413936844568587e-06, + "loss": 3.4669, + "step": 45620 + }, + { + "epoch": 4.690583881578947, + "grad_norm": 0.7456246943764967, + "learning_rate": 6.4075285316864e-06, + "loss": 3.4488, + "step": 45630 + }, + { + "epoch": 4.691611842105263, + "grad_norm": 0.9496085226164193, + "learning_rate": 6.401134496080625e-06, + "loss": 3.5427, + "step": 45640 + }, + { + "epoch": 4.692639802631579, + "grad_norm": 0.9922572734659411, + "learning_rate": 6.394754740280589e-06, + "loss": 3.4626, + "step": 45650 + }, + { + "epoch": 4.693667763157895, + "grad_norm": 0.7525902365129005, + "learning_rate": 6.388389266809958e-06, + "loss": 3.5117, + "step": 45660 + }, + { + "epoch": 4.694695723684211, + "grad_norm": 0.7480497434874832, + "learning_rate": 6.382038078186757e-06, + "loss": 3.4923, + "step": 45670 + }, + { + "epoch": 4.6957236842105265, + "grad_norm": 0.8111890340270763, + "learning_rate": 6.3757011769233555e-06, + "loss": 3.4417, + "step": 45680 + }, + { + "epoch": 4.6967516447368425, + "grad_norm": 0.6350162656833436, + "learning_rate": 6.369378565526478e-06, + "loss": 3.5471, + "step": 45690 + }, + { + "epoch": 4.6977796052631575, + "grad_norm": 0.6526399042969915, + "learning_rate": 6.363070246497173e-06, + "loss": 3.3841, + "step": 45700 + }, + { + "epoch": 4.6988075657894735, + "grad_norm": 0.7661869166856097, + "learning_rate": 6.356776222330878e-06, + "loss": 3.4464, + "step": 45710 + }, + { + "epoch": 4.699835526315789, + "grad_norm": 0.9741325358624678, + "learning_rate": 6.350496495517339e-06, + "loss": 3.4472, + "step": 45720 + }, + { + "epoch": 4.700863486842105, + "grad_norm": 0.8745479437792277, + "learning_rate": 6.344231068540656e-06, + "loss": 3.4933, + "step": 45730 + }, + { + "epoch": 4.701891447368421, + "grad_norm": 0.8549544647985454, + "learning_rate": 6.337979943879275e-06, + "loss": 3.358, + "step": 45740 + }, + { + "epoch": 4.702919407894737, + "grad_norm": 0.7441735448088678, + "learning_rate": 6.331743124005987e-06, + "loss": 3.478, + "step": 45750 + }, + { + "epoch": 4.703947368421053, + "grad_norm": 0.8281920064052609, + "learning_rate": 6.325520611387925e-06, + "loss": 3.4864, + "step": 45760 + }, + { + "epoch": 4.704975328947368, + "grad_norm": 1.3206888599183852, + "learning_rate": 6.319312408486553e-06, + "loss": 3.5079, + "step": 45770 + }, + { + "epoch": 4.706003289473684, + "grad_norm": 1.0313969131784515, + "learning_rate": 6.3131185177576836e-06, + "loss": 3.4728, + "step": 45780 + }, + { + "epoch": 4.70703125, + "grad_norm": 0.8811951058788209, + "learning_rate": 6.3069389416514635e-06, + "loss": 3.4976, + "step": 45790 + }, + { + "epoch": 4.708059210526316, + "grad_norm": 0.5791116491664775, + "learning_rate": 6.300773682612376e-06, + "loss": 3.4858, + "step": 45800 + }, + { + "epoch": 4.709087171052632, + "grad_norm": 0.7387539596580592, + "learning_rate": 6.294622743079251e-06, + "loss": 3.4268, + "step": 45810 + }, + { + "epoch": 4.710115131578947, + "grad_norm": 0.8177586355753417, + "learning_rate": 6.2884861254852325e-06, + "loss": 3.4565, + "step": 45820 + }, + { + "epoch": 4.711143092105263, + "grad_norm": 0.7001754992034709, + "learning_rate": 6.2823638322578185e-06, + "loss": 3.4287, + "step": 45830 + }, + { + "epoch": 4.712171052631579, + "grad_norm": 0.5195765654799445, + "learning_rate": 6.276255865818844e-06, + "loss": 3.4939, + "step": 45840 + }, + { + "epoch": 4.713199013157895, + "grad_norm": 0.613611411862146, + "learning_rate": 6.270162228584456e-06, + "loss": 3.3687, + "step": 45850 + }, + { + "epoch": 4.714226973684211, + 
"grad_norm": 0.7302122022568917, + "learning_rate": 6.2640829229651445e-06, + "loss": 3.4265, + "step": 45860 + }, + { + "epoch": 4.7152549342105265, + "grad_norm": 0.7899807034924407, + "learning_rate": 6.258017951365738e-06, + "loss": 3.4627, + "step": 45870 + }, + { + "epoch": 4.7162828947368425, + "grad_norm": 1.075900284131549, + "learning_rate": 6.251967316185379e-06, + "loss": 3.4917, + "step": 45880 + }, + { + "epoch": 4.7173108552631575, + "grad_norm": 0.6802245024725573, + "learning_rate": 6.245931019817553e-06, + "loss": 3.4732, + "step": 45890 + }, + { + "epoch": 4.7183388157894735, + "grad_norm": 0.7481857051456596, + "learning_rate": 6.239909064650068e-06, + "loss": 3.4765, + "step": 45900 + }, + { + "epoch": 4.719366776315789, + "grad_norm": 0.8397953463068183, + "learning_rate": 6.23390145306505e-06, + "loss": 3.4323, + "step": 45910 + }, + { + "epoch": 4.720394736842105, + "grad_norm": 0.6554638604401186, + "learning_rate": 6.227908187438965e-06, + "loss": 3.4961, + "step": 45920 + }, + { + "epoch": 4.721422697368421, + "grad_norm": 0.8868748195613062, + "learning_rate": 6.221929270142608e-06, + "loss": 3.5365, + "step": 45930 + }, + { + "epoch": 4.722450657894737, + "grad_norm": 0.7419384588724309, + "learning_rate": 6.215964703541072e-06, + "loss": 3.5213, + "step": 45940 + }, + { + "epoch": 4.723478618421053, + "grad_norm": 0.6860901291294936, + "learning_rate": 6.2100144899937935e-06, + "loss": 3.4288, + "step": 45950 + }, + { + "epoch": 4.724506578947368, + "grad_norm": 0.8408442725335445, + "learning_rate": 6.204078631854539e-06, + "loss": 3.4543, + "step": 45960 + }, + { + "epoch": 4.725534539473684, + "grad_norm": 1.222803661490946, + "learning_rate": 6.198157131471373e-06, + "loss": 3.4928, + "step": 45970 + }, + { + "epoch": 4.7265625, + "grad_norm": 0.8785128581952075, + "learning_rate": 6.192249991186701e-06, + "loss": 3.5426, + "step": 45980 + }, + { + "epoch": 4.727590460526316, + "grad_norm": 0.7585586335434964, + "learning_rate": 6.186357213337232e-06, + "loss": 3.4844, + "step": 45990 + }, + { + "epoch": 4.728618421052632, + "grad_norm": 0.6142950971534732, + "learning_rate": 6.180478800254006e-06, + "loss": 3.4697, + "step": 46000 + }, + { + "epoch": 4.729646381578947, + "grad_norm": 1.1789827073004355, + "learning_rate": 6.174614754262377e-06, + "loss": 3.4146, + "step": 46010 + }, + { + "epoch": 4.730674342105263, + "grad_norm": 0.6710600901145899, + "learning_rate": 6.168765077682012e-06, + "loss": 3.5535, + "step": 46020 + }, + { + "epoch": 4.731702302631579, + "grad_norm": 0.8037388218507303, + "learning_rate": 6.162929772826895e-06, + "loss": 3.3989, + "step": 46030 + }, + { + "epoch": 4.732730263157895, + "grad_norm": 0.5610375200957191, + "learning_rate": 6.157108842005324e-06, + "loss": 3.479, + "step": 46040 + }, + { + "epoch": 4.733758223684211, + "grad_norm": 0.7725440365236309, + "learning_rate": 6.15130228751992e-06, + "loss": 3.4396, + "step": 46050 + }, + { + "epoch": 4.7347861842105265, + "grad_norm": 0.6301130716715487, + "learning_rate": 6.14551011166761e-06, + "loss": 3.4567, + "step": 46060 + }, + { + "epoch": 4.7358141447368425, + "grad_norm": 0.610617105321696, + "learning_rate": 6.139732316739626e-06, + "loss": 3.4, + "step": 46070 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 1.0192326952635087, + "learning_rate": 6.133968905021527e-06, + "loss": 3.5222, + "step": 46080 + }, + { + "epoch": 4.7378700657894735, + "grad_norm": 0.817235753401692, + "learning_rate": 6.128219878793169e-06, + "loss": 3.5094, + "step": 46090 + }, 
+ { + "epoch": 4.738898026315789, + "grad_norm": 0.7482374461478596, + "learning_rate": 6.122485240328721e-06, + "loss": 3.4389, + "step": 46100 + }, + { + "epoch": 4.739925986842105, + "grad_norm": 0.7372313608249788, + "learning_rate": 6.116764991896668e-06, + "loss": 3.486, + "step": 46110 + }, + { + "epoch": 4.740953947368421, + "grad_norm": 1.0108401338600626, + "learning_rate": 6.111059135759795e-06, + "loss": 3.5489, + "step": 46120 + }, + { + "epoch": 4.741981907894737, + "grad_norm": 0.8056387232639285, + "learning_rate": 6.105367674175193e-06, + "loss": 3.5217, + "step": 46130 + }, + { + "epoch": 4.743009868421053, + "grad_norm": 0.7385052941480587, + "learning_rate": 6.099690609394269e-06, + "loss": 3.4103, + "step": 46140 + }, + { + "epoch": 4.744037828947368, + "grad_norm": 0.9505965041605344, + "learning_rate": 6.094027943662715e-06, + "loss": 3.4678, + "step": 46150 + }, + { + "epoch": 4.745065789473684, + "grad_norm": 0.7857531276304436, + "learning_rate": 6.088379679220543e-06, + "loss": 3.4459, + "step": 46160 + }, + { + "epoch": 4.74609375, + "grad_norm": 0.6265743216448169, + "learning_rate": 6.082745818302078e-06, + "loss": 3.477, + "step": 46170 + }, + { + "epoch": 4.747121710526316, + "grad_norm": 0.83435557767858, + "learning_rate": 6.0771263631359164e-06, + "loss": 3.4819, + "step": 46180 + }, + { + "epoch": 4.748149671052632, + "grad_norm": 0.7146046973576127, + "learning_rate": 6.071521315944985e-06, + "loss": 3.4734, + "step": 46190 + }, + { + "epoch": 4.749177631578947, + "grad_norm": 1.2810992175442797, + "learning_rate": 6.065930678946492e-06, + "loss": 3.4947, + "step": 46200 + }, + { + "epoch": 4.750205592105263, + "grad_norm": 0.7363863651303216, + "learning_rate": 6.060354454351958e-06, + "loss": 3.5585, + "step": 46210 + }, + { + "epoch": 4.751233552631579, + "grad_norm": 0.8119437808061897, + "learning_rate": 6.054792644367199e-06, + "loss": 3.4957, + "step": 46220 + }, + { + "epoch": 4.752261513157895, + "grad_norm": 0.6709191153947578, + "learning_rate": 6.0492452511923235e-06, + "loss": 3.4561, + "step": 46230 + }, + { + "epoch": 4.753289473684211, + "grad_norm": 0.9101408928488286, + "learning_rate": 6.043712277021745e-06, + "loss": 3.3391, + "step": 46240 + }, + { + "epoch": 4.7543174342105265, + "grad_norm": 0.9568062342596124, + "learning_rate": 6.038193724044162e-06, + "loss": 3.4997, + "step": 46250 + }, + { + "epoch": 4.7553453947368425, + "grad_norm": 0.7074154394858652, + "learning_rate": 6.032689594442591e-06, + "loss": 3.4513, + "step": 46260 + }, + { + "epoch": 4.7563733552631575, + "grad_norm": 1.0832232578303636, + "learning_rate": 6.027199890394308e-06, + "loss": 3.4815, + "step": 46270 + }, + { + "epoch": 4.7574013157894735, + "grad_norm": 0.996853856307949, + "learning_rate": 6.021724614070916e-06, + "loss": 3.3988, + "step": 46280 + }, + { + "epoch": 4.758429276315789, + "grad_norm": 0.7670235608248371, + "learning_rate": 6.016263767638288e-06, + "loss": 3.3934, + "step": 46290 + }, + { + "epoch": 4.759457236842105, + "grad_norm": 0.8905578052537022, + "learning_rate": 6.0108173532566085e-06, + "loss": 3.4707, + "step": 46300 + }, + { + "epoch": 4.760485197368421, + "grad_norm": 0.6662049010581226, + "learning_rate": 6.005385373080332e-06, + "loss": 3.4212, + "step": 46310 + }, + { + "epoch": 4.761513157894737, + "grad_norm": 0.6157462760636121, + "learning_rate": 5.99996782925822e-06, + "loss": 3.4159, + "step": 46320 + }, + { + "epoch": 4.762541118421053, + "grad_norm": 0.9072752776273408, + "learning_rate": 5.994564723933313e-06, 
+ "loss": 3.5028, + "step": 46330 + }, + { + "epoch": 4.763569078947368, + "grad_norm": 1.0570589319905437, + "learning_rate": 5.989176059242948e-06, + "loss": 3.4449, + "step": 46340 + }, + { + "epoch": 4.764597039473684, + "grad_norm": 1.0454449921763291, + "learning_rate": 5.983801837318745e-06, + "loss": 3.5171, + "step": 46350 + }, + { + "epoch": 4.765625, + "grad_norm": 0.7734490210705796, + "learning_rate": 5.97844206028661e-06, + "loss": 3.5338, + "step": 46360 + }, + { + "epoch": 4.766652960526316, + "grad_norm": 0.6309238318237683, + "learning_rate": 5.973096730266731e-06, + "loss": 3.4843, + "step": 46370 + }, + { + "epoch": 4.767680921052632, + "grad_norm": 1.0599118855578464, + "learning_rate": 5.967765849373602e-06, + "loss": 3.4719, + "step": 46380 + }, + { + "epoch": 4.768708881578947, + "grad_norm": 0.7184628866853319, + "learning_rate": 5.962449419715967e-06, + "loss": 3.489, + "step": 46390 + }, + { + "epoch": 4.769736842105263, + "grad_norm": 0.6007561419217085, + "learning_rate": 5.957147443396879e-06, + "loss": 3.449, + "step": 46400 + }, + { + "epoch": 4.770764802631579, + "grad_norm": 0.7164458368943297, + "learning_rate": 5.9518599225136745e-06, + "loss": 3.4366, + "step": 46410 + }, + { + "epoch": 4.771792763157895, + "grad_norm": 0.9831692713659712, + "learning_rate": 5.946586859157955e-06, + "loss": 3.4567, + "step": 46420 + }, + { + "epoch": 4.772820723684211, + "grad_norm": 0.7892924058824097, + "learning_rate": 5.941328255415615e-06, + "loss": 3.4829, + "step": 46430 + }, + { + "epoch": 4.7738486842105265, + "grad_norm": 0.8066770162062971, + "learning_rate": 5.93608411336683e-06, + "loss": 3.5158, + "step": 46440 + }, + { + "epoch": 4.7748766447368425, + "grad_norm": 0.6676126612908863, + "learning_rate": 5.930854435086044e-06, + "loss": 3.4221, + "step": 46450 + }, + { + "epoch": 4.7759046052631575, + "grad_norm": 0.7126271636319801, + "learning_rate": 5.925639222641992e-06, + "loss": 3.4351, + "step": 46460 + }, + { + "epoch": 4.7769325657894735, + "grad_norm": 0.5622541367127187, + "learning_rate": 5.92043847809768e-06, + "loss": 3.5641, + "step": 46470 + }, + { + "epoch": 4.777960526315789, + "grad_norm": 0.6950995730006246, + "learning_rate": 5.915252203510392e-06, + "loss": 3.4215, + "step": 46480 + }, + { + "epoch": 4.778988486842105, + "grad_norm": 0.6844191227803071, + "learning_rate": 5.910080400931683e-06, + "loss": 3.4592, + "step": 46490 + }, + { + "epoch": 4.780016447368421, + "grad_norm": 0.7829338999894081, + "learning_rate": 5.904923072407401e-06, + "loss": 3.5139, + "step": 46500 + }, + { + "epoch": 4.781044407894737, + "grad_norm": 0.8798709915163038, + "learning_rate": 5.899780219977641e-06, + "loss": 3.4423, + "step": 46510 + }, + { + "epoch": 4.782072368421053, + "grad_norm": 0.6795405524997075, + "learning_rate": 5.894651845676793e-06, + "loss": 3.5281, + "step": 46520 + }, + { + "epoch": 4.783100328947368, + "grad_norm": 0.8312526874929022, + "learning_rate": 5.889537951533515e-06, + "loss": 3.4107, + "step": 46530 + }, + { + "epoch": 4.784128289473684, + "grad_norm": 0.7840380701095863, + "learning_rate": 5.884438539570737e-06, + "loss": 3.5139, + "step": 46540 + }, + { + "epoch": 4.78515625, + "grad_norm": 1.0331073565167908, + "learning_rate": 5.879353611805651e-06, + "loss": 3.5205, + "step": 46550 + }, + { + "epoch": 4.786184210526316, + "grad_norm": 1.1654720604917674, + "learning_rate": 5.874283170249734e-06, + "loss": 3.5328, + "step": 46560 + }, + { + "epoch": 4.787212171052632, + "grad_norm": 0.6764213529307516, + 
"learning_rate": 5.869227216908724e-06, + "loss": 3.4922, + "step": 46570 + }, + { + "epoch": 4.788240131578947, + "grad_norm": 0.6601993388723595, + "learning_rate": 5.864185753782621e-06, + "loss": 3.5036, + "step": 46580 + }, + { + "epoch": 4.789268092105263, + "grad_norm": 0.6938432106772278, + "learning_rate": 5.859158782865717e-06, + "loss": 3.4696, + "step": 46590 + }, + { + "epoch": 4.790296052631579, + "grad_norm": 0.7744520065036326, + "learning_rate": 5.854146306146543e-06, + "loss": 3.5069, + "step": 46600 + }, + { + "epoch": 4.791324013157895, + "grad_norm": 2.870014512028706, + "learning_rate": 5.849148325607908e-06, + "loss": 3.46, + "step": 46610 + }, + { + "epoch": 4.792351973684211, + "grad_norm": 0.8306920911274455, + "learning_rate": 5.8441648432268985e-06, + "loss": 3.4216, + "step": 46620 + }, + { + "epoch": 4.7933799342105265, + "grad_norm": 0.9247661449614283, + "learning_rate": 5.839195860974849e-06, + "loss": 3.5071, + "step": 46630 + }, + { + "epoch": 4.7944078947368425, + "grad_norm": 0.9320743918526456, + "learning_rate": 5.83424138081736e-06, + "loss": 3.4975, + "step": 46640 + }, + { + "epoch": 4.7954358552631575, + "grad_norm": 0.9611395646687355, + "learning_rate": 5.82930140471431e-06, + "loss": 3.4064, + "step": 46650 + }, + { + "epoch": 4.7964638157894735, + "grad_norm": 1.2118475171062124, + "learning_rate": 5.82437593461982e-06, + "loss": 3.472, + "step": 46660 + }, + { + "epoch": 4.797491776315789, + "grad_norm": 0.8204371766303128, + "learning_rate": 5.819464972482287e-06, + "loss": 3.4439, + "step": 46670 + }, + { + "epoch": 4.798519736842105, + "grad_norm": 1.0820218853596903, + "learning_rate": 5.814568520244369e-06, + "loss": 3.4503, + "step": 46680 + }, + { + "epoch": 4.799547697368421, + "grad_norm": 0.6484041779930075, + "learning_rate": 5.809686579842972e-06, + "loss": 3.3851, + "step": 46690 + }, + { + "epoch": 4.800575657894737, + "grad_norm": 0.7658215873725163, + "learning_rate": 5.804819153209265e-06, + "loss": 3.5402, + "step": 46700 + }, + { + "epoch": 4.801603618421053, + "grad_norm": 0.5713130848064378, + "learning_rate": 5.7999662422686895e-06, + "loss": 3.5556, + "step": 46710 + }, + { + "epoch": 4.802631578947368, + "grad_norm": 0.6377973956759188, + "learning_rate": 5.795127848940934e-06, + "loss": 3.4644, + "step": 46720 + }, + { + "epoch": 4.803659539473684, + "grad_norm": 0.8056899557865586, + "learning_rate": 5.790303975139943e-06, + "loss": 3.5015, + "step": 46730 + }, + { + "epoch": 4.8046875, + "grad_norm": 0.7144529598146209, + "learning_rate": 5.78549462277392e-06, + "loss": 3.453, + "step": 46740 + }, + { + "epoch": 4.805715460526316, + "grad_norm": 0.8356197858603525, + "learning_rate": 5.780699793745328e-06, + "loss": 3.4033, + "step": 46750 + }, + { + "epoch": 4.806743421052632, + "grad_norm": 0.6962535148203153, + "learning_rate": 5.775919489950873e-06, + "loss": 3.5264, + "step": 46760 + }, + { + "epoch": 4.807771381578947, + "grad_norm": 0.9174824501978504, + "learning_rate": 5.77115371328153e-06, + "loss": 3.4674, + "step": 46770 + }, + { + "epoch": 4.808799342105263, + "grad_norm": 0.7154447186864743, + "learning_rate": 5.766402465622523e-06, + "loss": 3.4831, + "step": 46780 + }, + { + "epoch": 4.809827302631579, + "grad_norm": 1.1983689269897482, + "learning_rate": 5.761665748853319e-06, + "loss": 3.471, + "step": 46790 + }, + { + "epoch": 4.810855263157895, + "grad_norm": 0.9469905493203267, + "learning_rate": 5.75694356484765e-06, + "loss": 3.5155, + "step": 46800 + }, + { + "epoch": 4.811883223684211, + 
"grad_norm": 0.855685898903384, + "learning_rate": 5.752235915473488e-06, + "loss": 3.3961, + "step": 46810 + }, + { + "epoch": 4.8129111842105265, + "grad_norm": 0.934999118437326, + "learning_rate": 5.747542802593067e-06, + "loss": 3.4739, + "step": 46820 + }, + { + "epoch": 4.8139391447368425, + "grad_norm": 0.7576766924736702, + "learning_rate": 5.742864228062857e-06, + "loss": 3.4314, + "step": 46830 + }, + { + "epoch": 4.8149671052631575, + "grad_norm": 0.6273694661381483, + "learning_rate": 5.738200193733596e-06, + "loss": 3.4638, + "step": 46840 + }, + { + "epoch": 4.8159950657894735, + "grad_norm": 0.6569683366879546, + "learning_rate": 5.733550701450248e-06, + "loss": 3.4305, + "step": 46850 + }, + { + "epoch": 4.817023026315789, + "grad_norm": 0.6498958779349426, + "learning_rate": 5.728915753052045e-06, + "loss": 3.4343, + "step": 46860 + }, + { + "epoch": 4.818050986842105, + "grad_norm": 0.6358685111600942, + "learning_rate": 5.724295350372449e-06, + "loss": 3.457, + "step": 46870 + }, + { + "epoch": 4.819078947368421, + "grad_norm": 0.7346378018515044, + "learning_rate": 5.719689495239176e-06, + "loss": 3.4704, + "step": 46880 + }, + { + "epoch": 4.820106907894737, + "grad_norm": 0.7206722759477869, + "learning_rate": 5.71509818947419e-06, + "loss": 3.4625, + "step": 46890 + }, + { + "epoch": 4.821134868421053, + "grad_norm": 0.8395368148247252, + "learning_rate": 5.710521434893702e-06, + "loss": 3.4701, + "step": 46900 + }, + { + "epoch": 4.822162828947368, + "grad_norm": 0.8062501567278569, + "learning_rate": 5.7059592333081446e-06, + "loss": 3.4674, + "step": 46910 + }, + { + "epoch": 4.823190789473684, + "grad_norm": 0.5894411999688763, + "learning_rate": 5.701411586522228e-06, + "loss": 3.4403, + "step": 46920 + }, + { + "epoch": 4.82421875, + "grad_norm": 0.6502880210247137, + "learning_rate": 5.696878496334875e-06, + "loss": 3.4291, + "step": 46930 + }, + { + "epoch": 4.825246710526316, + "grad_norm": 0.8517659845377831, + "learning_rate": 5.692359964539268e-06, + "loss": 3.4355, + "step": 46940 + }, + { + "epoch": 4.826274671052632, + "grad_norm": 0.8343247873577535, + "learning_rate": 5.6878559929228236e-06, + "loss": 3.4919, + "step": 46950 + }, + { + "epoch": 4.827302631578947, + "grad_norm": 0.7445378882641581, + "learning_rate": 5.683366583267196e-06, + "loss": 3.5305, + "step": 46960 + }, + { + "epoch": 4.828330592105263, + "grad_norm": 0.6726264078607236, + "learning_rate": 5.678891737348292e-06, + "loss": 3.4124, + "step": 46970 + }, + { + "epoch": 4.829358552631579, + "grad_norm": 0.8898712783587759, + "learning_rate": 5.674431456936243e-06, + "loss": 3.5257, + "step": 46980 + }, + { + "epoch": 4.830386513157895, + "grad_norm": 1.1793206257053725, + "learning_rate": 5.669985743795422e-06, + "loss": 3.4121, + "step": 46990 + }, + { + "epoch": 4.831414473684211, + "grad_norm": 0.7999694430952782, + "learning_rate": 5.665554599684446e-06, + "loss": 3.445, + "step": 47000 + }, + { + "epoch": 4.8324424342105265, + "grad_norm": 0.7186988729454026, + "learning_rate": 5.661138026356167e-06, + "loss": 3.4707, + "step": 47010 + }, + { + "epoch": 4.8334703947368425, + "grad_norm": 0.607163377322909, + "learning_rate": 5.656736025557661e-06, + "loss": 3.3926, + "step": 47020 + }, + { + "epoch": 4.8344983552631575, + "grad_norm": 0.774645930594856, + "learning_rate": 5.652348599030251e-06, + "loss": 3.4945, + "step": 47030 + }, + { + "epoch": 4.8355263157894735, + "grad_norm": 0.5454853111950293, + "learning_rate": 5.647975748509504e-06, + "loss": 3.4509, + "step": 47040 
+ }, + { + "epoch": 4.836554276315789, + "grad_norm": 0.5798036844468071, + "learning_rate": 5.643617475725196e-06, + "loss": 3.4865, + "step": 47050 + }, + { + "epoch": 4.837582236842105, + "grad_norm": 0.7301610206662029, + "learning_rate": 5.639273782401361e-06, + "loss": 3.4521, + "step": 47060 + }, + { + "epoch": 4.838610197368421, + "grad_norm": 0.6584906473603528, + "learning_rate": 5.634944670256248e-06, + "loss": 3.4488, + "step": 47070 + }, + { + "epoch": 4.839638157894737, + "grad_norm": 0.6311267250854948, + "learning_rate": 5.630630141002351e-06, + "loss": 3.4541, + "step": 47080 + }, + { + "epoch": 4.840666118421053, + "grad_norm": 0.6072393063262859, + "learning_rate": 5.626330196346382e-06, + "loss": 3.4276, + "step": 47090 + }, + { + "epoch": 4.841694078947368, + "grad_norm": 1.3352679532345257, + "learning_rate": 5.6220448379893e-06, + "loss": 3.506, + "step": 47100 + }, + { + "epoch": 4.842722039473684, + "grad_norm": 0.8480734435724901, + "learning_rate": 5.617774067626281e-06, + "loss": 3.4339, + "step": 47110 + }, + { + "epoch": 4.84375, + "grad_norm": 0.5462951310672773, + "learning_rate": 5.61351788694673e-06, + "loss": 3.5437, + "step": 47120 + }, + { + "epoch": 4.844777960526316, + "grad_norm": 0.8727878809378596, + "learning_rate": 5.609276297634294e-06, + "loss": 3.4942, + "step": 47130 + }, + { + "epoch": 4.845805921052632, + "grad_norm": 0.8420206998219701, + "learning_rate": 5.605049301366835e-06, + "loss": 3.4721, + "step": 47140 + }, + { + "epoch": 4.846833881578947, + "grad_norm": 0.6910020541189553, + "learning_rate": 5.600836899816444e-06, + "loss": 3.5123, + "step": 47150 + }, + { + "epoch": 4.847861842105263, + "grad_norm": 0.6631445879834094, + "learning_rate": 5.596639094649448e-06, + "loss": 3.4542, + "step": 47160 + }, + { + "epoch": 4.848889802631579, + "grad_norm": 0.5795716151879563, + "learning_rate": 5.59245588752639e-06, + "loss": 3.4753, + "step": 47170 + }, + { + "epoch": 4.849917763157895, + "grad_norm": 0.8880378557440023, + "learning_rate": 5.588287280102039e-06, + "loss": 3.4235, + "step": 47180 + }, + { + "epoch": 4.850945723684211, + "grad_norm": 0.9044654969107895, + "learning_rate": 5.584133274025395e-06, + "loss": 3.4427, + "step": 47190 + }, + { + "epoch": 4.8519736842105265, + "grad_norm": 0.5712306326957444, + "learning_rate": 5.579993870939677e-06, + "loss": 3.5571, + "step": 47200 + }, + { + "epoch": 4.8530016447368425, + "grad_norm": 0.7223244395017306, + "learning_rate": 5.575869072482333e-06, + "loss": 3.4977, + "step": 47210 + }, + { + "epoch": 4.8540296052631575, + "grad_norm": 0.7495523547361715, + "learning_rate": 5.571758880285023e-06, + "loss": 3.5356, + "step": 47220 + }, + { + "epoch": 4.8550575657894735, + "grad_norm": 1.0132719511548074, + "learning_rate": 5.567663295973643e-06, + "loss": 3.405, + "step": 47230 + }, + { + "epoch": 4.856085526315789, + "grad_norm": 0.5848241528288477, + "learning_rate": 5.563582321168294e-06, + "loss": 3.463, + "step": 47240 + }, + { + "epoch": 4.857113486842105, + "grad_norm": 0.682237970020361, + "learning_rate": 5.5595159574833204e-06, + "loss": 3.472, + "step": 47250 + }, + { + "epoch": 4.858141447368421, + "grad_norm": 0.7891037429757539, + "learning_rate": 5.555464206527266e-06, + "loss": 3.4191, + "step": 47260 + }, + { + "epoch": 4.859169407894737, + "grad_norm": 1.0774379144116013, + "learning_rate": 5.551427069902897e-06, + "loss": 3.5119, + "step": 47270 + }, + { + "epoch": 4.860197368421053, + "grad_norm": 0.9498348669787309, + "learning_rate": 5.547404549207211e-06, + 
"loss": 3.4323, + "step": 47280 + }, + { + "epoch": 4.861225328947368, + "grad_norm": 1.081800911414365, + "learning_rate": 5.543396646031414e-06, + "loss": 3.4542, + "step": 47290 + }, + { + "epoch": 4.862253289473684, + "grad_norm": 0.9048129889071952, + "learning_rate": 5.539403361960933e-06, + "loss": 3.4564, + "step": 47300 + }, + { + "epoch": 4.86328125, + "grad_norm": 0.6798912488353146, + "learning_rate": 5.5354246985754115e-06, + "loss": 3.4571, + "step": 47310 + }, + { + "epoch": 4.864309210526316, + "grad_norm": 0.8070417612135393, + "learning_rate": 5.531460657448707e-06, + "loss": 3.4385, + "step": 47320 + }, + { + "epoch": 4.865337171052632, + "grad_norm": 0.5899986164836503, + "learning_rate": 5.527511240148892e-06, + "loss": 3.4157, + "step": 47330 + }, + { + "epoch": 4.866365131578947, + "grad_norm": 0.6120501023973607, + "learning_rate": 5.52357644823826e-06, + "loss": 3.4605, + "step": 47340 + }, + { + "epoch": 4.867393092105263, + "grad_norm": 1.0019373009456456, + "learning_rate": 5.51965628327332e-06, + "loss": 3.5184, + "step": 47350 + }, + { + "epoch": 4.868421052631579, + "grad_norm": 1.2225998448045265, + "learning_rate": 5.515750746804788e-06, + "loss": 3.4525, + "step": 47360 + }, + { + "epoch": 4.869449013157895, + "grad_norm": 0.8703396188561833, + "learning_rate": 5.511859840377592e-06, + "loss": 3.4506, + "step": 47370 + }, + { + "epoch": 4.870476973684211, + "grad_norm": 0.6761354168826266, + "learning_rate": 5.507983565530879e-06, + "loss": 3.4044, + "step": 47380 + }, + { + "epoch": 4.8715049342105265, + "grad_norm": 0.789869783087549, + "learning_rate": 5.504121923798013e-06, + "loss": 3.5248, + "step": 47390 + }, + { + "epoch": 4.8725328947368425, + "grad_norm": 0.7111636210920896, + "learning_rate": 5.50027491670655e-06, + "loss": 3.4883, + "step": 47400 + }, + { + "epoch": 4.8735608552631575, + "grad_norm": 0.7110582605439922, + "learning_rate": 5.49644254577828e-06, + "loss": 3.4399, + "step": 47410 + }, + { + "epoch": 4.8745888157894735, + "grad_norm": 0.9236895919064749, + "learning_rate": 5.492624812529192e-06, + "loss": 3.4465, + "step": 47420 + }, + { + "epoch": 4.875616776315789, + "grad_norm": 0.5682860651617235, + "learning_rate": 5.48882171846948e-06, + "loss": 3.5526, + "step": 47430 + }, + { + "epoch": 4.876644736842105, + "grad_norm": 0.5948002210582022, + "learning_rate": 5.485033265103556e-06, + "loss": 3.4419, + "step": 47440 + }, + { + "epoch": 4.877672697368421, + "grad_norm": 0.7757648554641787, + "learning_rate": 5.481259453930037e-06, + "loss": 3.4941, + "step": 47450 + }, + { + "epoch": 4.878700657894737, + "grad_norm": 0.8138099527614306, + "learning_rate": 5.477500286441747e-06, + "loss": 3.5208, + "step": 47460 + }, + { + "epoch": 4.879728618421053, + "grad_norm": 1.0394709651868255, + "learning_rate": 5.473755764125721e-06, + "loss": 3.4571, + "step": 47470 + }, + { + "epoch": 4.880756578947368, + "grad_norm": 0.8687168759953905, + "learning_rate": 5.470025888463193e-06, + "loss": 3.4279, + "step": 47480 + }, + { + "epoch": 4.881784539473684, + "grad_norm": 0.572701194009124, + "learning_rate": 5.4663106609296125e-06, + "loss": 3.4566, + "step": 47490 + }, + { + "epoch": 4.8828125, + "grad_norm": 1.4660780119129986, + "learning_rate": 5.462610082994628e-06, + "loss": 3.4759, + "step": 47500 + }, + { + "epoch": 4.883840460526316, + "grad_norm": 0.6863521183099334, + "learning_rate": 5.4589241561220984e-06, + "loss": 3.4842, + "step": 47510 + }, + { + "epoch": 4.884868421052632, + "grad_norm": 0.5734609677036832, + 
"learning_rate": 5.455252881770083e-06, + "loss": 3.4991, + "step": 47520 + }, + { + "epoch": 4.885896381578947, + "grad_norm": 1.2010935688080693, + "learning_rate": 5.45159626139084e-06, + "loss": 3.393, + "step": 47530 + }, + { + "epoch": 4.886924342105263, + "grad_norm": 0.8002216151362689, + "learning_rate": 5.447954296430846e-06, + "loss": 3.4481, + "step": 47540 + }, + { + "epoch": 4.887952302631579, + "grad_norm": 0.6941573802705412, + "learning_rate": 5.444326988330766e-06, + "loss": 3.4287, + "step": 47550 + }, + { + "epoch": 4.888980263157895, + "grad_norm": 0.6695498665321302, + "learning_rate": 5.440714338525474e-06, + "loss": 3.4735, + "step": 47560 + }, + { + "epoch": 4.890008223684211, + "grad_norm": 0.5725048463524663, + "learning_rate": 5.4371163484440405e-06, + "loss": 3.4317, + "step": 47570 + }, + { + "epoch": 4.8910361842105265, + "grad_norm": 0.711573389746526, + "learning_rate": 5.433533019509745e-06, + "loss": 3.4228, + "step": 47580 + }, + { + "epoch": 4.8920641447368425, + "grad_norm": 0.8345446524858917, + "learning_rate": 5.4299643531400555e-06, + "loss": 3.5171, + "step": 47590 + }, + { + "epoch": 4.8930921052631575, + "grad_norm": 0.7279404320319228, + "learning_rate": 5.426410350746652e-06, + "loss": 3.4762, + "step": 47600 + }, + { + "epoch": 4.8941200657894735, + "grad_norm": 0.990309281685518, + "learning_rate": 5.422871013735405e-06, + "loss": 3.5202, + "step": 47610 + }, + { + "epoch": 4.895148026315789, + "grad_norm": 0.5167968323261362, + "learning_rate": 5.419346343506393e-06, + "loss": 3.5238, + "step": 47620 + }, + { + "epoch": 4.896175986842105, + "grad_norm": 0.7985430444283712, + "learning_rate": 5.415836341453884e-06, + "loss": 3.476, + "step": 47630 + }, + { + "epoch": 4.897203947368421, + "grad_norm": 0.9916689561502223, + "learning_rate": 5.41234100896635e-06, + "loss": 3.5051, + "step": 47640 + }, + { + "epoch": 4.898231907894737, + "grad_norm": 1.489800506268263, + "learning_rate": 5.4088603474264526e-06, + "loss": 3.4356, + "step": 47650 + }, + { + "epoch": 4.899259868421053, + "grad_norm": 0.6739852722027574, + "learning_rate": 5.405394358211051e-06, + "loss": 3.517, + "step": 47660 + }, + { + "epoch": 4.900287828947368, + "grad_norm": 0.6156370026246194, + "learning_rate": 5.401943042691209e-06, + "loss": 3.4786, + "step": 47670 + }, + { + "epoch": 4.901315789473684, + "grad_norm": 0.7567260268604059, + "learning_rate": 5.398506402232182e-06, + "loss": 3.48, + "step": 47680 + }, + { + "epoch": 4.90234375, + "grad_norm": 0.7605972641689299, + "learning_rate": 5.395084438193416e-06, + "loss": 3.5277, + "step": 47690 + }, + { + "epoch": 4.903371710526316, + "grad_norm": 0.562581612706349, + "learning_rate": 5.391677151928556e-06, + "loss": 3.4046, + "step": 47700 + }, + { + "epoch": 4.904399671052632, + "grad_norm": 0.9232432346648751, + "learning_rate": 5.388284544785433e-06, + "loss": 3.4351, + "step": 47710 + }, + { + "epoch": 4.905427631578947, + "grad_norm": 0.588855623096892, + "learning_rate": 5.384906618106082e-06, + "loss": 3.4311, + "step": 47720 + }, + { + "epoch": 4.906455592105263, + "grad_norm": 0.7017408947029469, + "learning_rate": 5.3815433732267284e-06, + "loss": 3.4781, + "step": 47730 + }, + { + "epoch": 4.907483552631579, + "grad_norm": 0.5878952381041754, + "learning_rate": 5.378194811477786e-06, + "loss": 3.4424, + "step": 47740 + }, + { + "epoch": 4.908511513157895, + "grad_norm": 0.9813142755437816, + "learning_rate": 5.374860934183859e-06, + "loss": 3.4558, + "step": 47750 + }, + { + "epoch": 4.909539473684211, + 
"grad_norm": 1.1536698808883366, + "learning_rate": 5.371541742663745e-06, + "loss": 3.4327, + "step": 47760 + }, + { + "epoch": 4.9105674342105265, + "grad_norm": 0.711458455463211, + "learning_rate": 5.368237238230441e-06, + "loss": 3.5513, + "step": 47770 + }, + { + "epoch": 4.9115953947368425, + "grad_norm": 0.7196407603409438, + "learning_rate": 5.364947422191111e-06, + "loss": 3.4584, + "step": 47780 + }, + { + "epoch": 4.9126233552631575, + "grad_norm": 0.8026922730034626, + "learning_rate": 5.361672295847144e-06, + "loss": 3.5271, + "step": 47790 + }, + { + "epoch": 4.9136513157894735, + "grad_norm": 0.7686667451513846, + "learning_rate": 5.358411860494078e-06, + "loss": 3.5326, + "step": 47800 + }, + { + "epoch": 4.914679276315789, + "grad_norm": 0.7583069908541229, + "learning_rate": 5.355166117421675e-06, + "loss": 3.4644, + "step": 47810 + }, + { + "epoch": 4.915707236842105, + "grad_norm": 0.8325017753788343, + "learning_rate": 5.3519350679138586e-06, + "loss": 3.512, + "step": 47820 + }, + { + "epoch": 4.916735197368421, + "grad_norm": 0.990501412946684, + "learning_rate": 5.348718713248755e-06, + "loss": 3.4269, + "step": 47830 + }, + { + "epoch": 4.917763157894737, + "grad_norm": 0.9339147997332831, + "learning_rate": 5.3455170546986736e-06, + "loss": 3.4404, + "step": 47840 + }, + { + "epoch": 4.918791118421053, + "grad_norm": 1.0309878507392254, + "learning_rate": 5.34233009353011e-06, + "loss": 3.5519, + "step": 47850 + }, + { + "epoch": 4.919819078947368, + "grad_norm": 0.86767117869928, + "learning_rate": 5.339157831003747e-06, + "loss": 3.4805, + "step": 47860 + }, + { + "epoch": 4.920847039473684, + "grad_norm": 0.825975218830653, + "learning_rate": 5.3360002683744546e-06, + "loss": 3.4729, + "step": 47870 + }, + { + "epoch": 4.921875, + "grad_norm": 0.6118933651343265, + "learning_rate": 5.3328574068912795e-06, + "loss": 3.4265, + "step": 47880 + }, + { + "epoch": 4.922902960526316, + "grad_norm": 0.9734024188484846, + "learning_rate": 5.329729247797464e-06, + "loss": 3.4692, + "step": 47890 + }, + { + "epoch": 4.923930921052632, + "grad_norm": 0.593866186318648, + "learning_rate": 5.326615792330423e-06, + "loss": 3.4612, + "step": 47900 + }, + { + "epoch": 4.924958881578947, + "grad_norm": 0.837635810715113, + "learning_rate": 5.323517041721767e-06, + "loss": 3.4251, + "step": 47910 + }, + { + "epoch": 4.925986842105263, + "grad_norm": 0.5852598467849501, + "learning_rate": 5.320432997197283e-06, + "loss": 3.4215, + "step": 47920 + }, + { + "epoch": 4.927014802631579, + "grad_norm": 0.9742773556920067, + "learning_rate": 5.3173636599769395e-06, + "loss": 3.48, + "step": 47930 + }, + { + "epoch": 4.928042763157895, + "grad_norm": 0.6421175758551955, + "learning_rate": 5.314309031274891e-06, + "loss": 3.4217, + "step": 47940 + }, + { + "epoch": 4.929070723684211, + "grad_norm": 0.8265009736380176, + "learning_rate": 5.311269112299472e-06, + "loss": 3.3808, + "step": 47950 + }, + { + "epoch": 4.9300986842105265, + "grad_norm": 0.6967956925497705, + "learning_rate": 5.308243904253197e-06, + "loss": 3.4121, + "step": 47960 + }, + { + "epoch": 4.9311266447368425, + "grad_norm": 0.7680168936182697, + "learning_rate": 5.305233408332766e-06, + "loss": 3.4106, + "step": 47970 + }, + { + "epoch": 4.9321546052631575, + "grad_norm": 0.6650050486466207, + "learning_rate": 5.302237625729053e-06, + "loss": 3.5178, + "step": 47980 + }, + { + "epoch": 4.9331825657894735, + "grad_norm": 0.7577988928133622, + "learning_rate": 5.299256557627113e-06, + "loss": 3.5103, + "step": 47990 + 
}, + { + "epoch": 4.934210526315789, + "grad_norm": 0.9379599078929465, + "learning_rate": 5.296290205206183e-06, + "loss": 3.4997, + "step": 48000 + }, + { + "epoch": 4.935238486842105, + "grad_norm": 0.8320315723687206, + "learning_rate": 5.293338569639679e-06, + "loss": 3.4393, + "step": 48010 + }, + { + "epoch": 4.936266447368421, + "grad_norm": 0.7080075599406878, + "learning_rate": 5.290401652095194e-06, + "loss": 3.4747, + "step": 48020 + }, + { + "epoch": 4.937294407894737, + "grad_norm": 0.8092911886764278, + "learning_rate": 5.287479453734497e-06, + "loss": 3.4573, + "step": 48030 + }, + { + "epoch": 4.938322368421053, + "grad_norm": 0.6735316633344369, + "learning_rate": 5.284571975713534e-06, + "loss": 3.4384, + "step": 48040 + }, + { + "epoch": 4.939350328947368, + "grad_norm": 0.6643061022772074, + "learning_rate": 5.281679219182436e-06, + "loss": 3.4978, + "step": 48050 + }, + { + "epoch": 4.940378289473684, + "grad_norm": 0.9416182979607038, + "learning_rate": 5.278801185285505e-06, + "loss": 3.4669, + "step": 48060 + }, + { + "epoch": 4.94140625, + "grad_norm": 0.7808212248054346, + "learning_rate": 5.275937875161212e-06, + "loss": 3.5124, + "step": 48070 + }, + { + "epoch": 4.942434210526316, + "grad_norm": 1.0490286012137853, + "learning_rate": 5.273089289942218e-06, + "loss": 3.4198, + "step": 48080 + }, + { + "epoch": 4.943462171052632, + "grad_norm": 0.6715620332186405, + "learning_rate": 5.270255430755346e-06, + "loss": 3.422, + "step": 48090 + }, + { + "epoch": 4.944490131578947, + "grad_norm": 0.6183401281912294, + "learning_rate": 5.267436298721611e-06, + "loss": 3.4395, + "step": 48100 + }, + { + "epoch": 4.945518092105263, + "grad_norm": 0.9033208073795097, + "learning_rate": 5.264631894956175e-06, + "loss": 3.422, + "step": 48110 + }, + { + "epoch": 4.946546052631579, + "grad_norm": 0.6570285311720905, + "learning_rate": 5.2618422205683985e-06, + "loss": 3.4737, + "step": 48120 + }, + { + "epoch": 4.947574013157895, + "grad_norm": 0.5967350511104833, + "learning_rate": 5.259067276661807e-06, + "loss": 3.3961, + "step": 48130 + }, + { + "epoch": 4.948601973684211, + "grad_norm": 0.724962667622134, + "learning_rate": 5.256307064334096e-06, + "loss": 3.441, + "step": 48140 + }, + { + "epoch": 4.9496299342105265, + "grad_norm": 1.0080163439038732, + "learning_rate": 5.253561584677137e-06, + "loss": 3.4448, + "step": 48150 + }, + { + "epoch": 4.9506578947368425, + "grad_norm": 0.7634644287581671, + "learning_rate": 5.250830838776975e-06, + "loss": 3.5398, + "step": 48160 + }, + { + "epoch": 4.9516858552631575, + "grad_norm": 0.9422421048917334, + "learning_rate": 5.248114827713821e-06, + "loss": 3.4398, + "step": 48170 + }, + { + "epoch": 4.9527138157894735, + "grad_norm": 0.8766361334841785, + "learning_rate": 5.245413552562069e-06, + "loss": 3.4067, + "step": 48180 + }, + { + "epoch": 4.953741776315789, + "grad_norm": 0.6083150105351576, + "learning_rate": 5.242727014390264e-06, + "loss": 3.4221, + "step": 48190 + }, + { + "epoch": 4.954769736842105, + "grad_norm": 0.674259059050243, + "learning_rate": 5.2400552142611415e-06, + "loss": 3.493, + "step": 48200 + }, + { + "epoch": 4.955797697368421, + "grad_norm": 0.5841614062561724, + "learning_rate": 5.237398153231593e-06, + "loss": 3.5016, + "step": 48210 + }, + { + "epoch": 4.956825657894737, + "grad_norm": 1.2213627124262374, + "learning_rate": 5.23475583235269e-06, + "loss": 3.4488, + "step": 48220 + }, + { + "epoch": 4.957853618421053, + "grad_norm": 0.7895539519011012, + "learning_rate": 
5.232128252669664e-06, + "loss": 3.4384, + "step": 48230 + }, + { + "epoch": 4.958881578947368, + "grad_norm": 0.7344163517774003, + "learning_rate": 5.229515415221925e-06, + "loss": 3.408, + "step": 48240 + }, + { + "epoch": 4.959909539473684, + "grad_norm": 0.6619073340106897, + "learning_rate": 5.2269173210430415e-06, + "loss": 3.4492, + "step": 48250 + }, + { + "epoch": 4.9609375, + "grad_norm": 0.6220507895671396, + "learning_rate": 5.224333971160757e-06, + "loss": 3.418, + "step": 48260 + }, + { + "epoch": 4.961965460526316, + "grad_norm": 1.2637624339653213, + "learning_rate": 5.221765366596978e-06, + "loss": 3.4773, + "step": 48270 + }, + { + "epoch": 4.962993421052632, + "grad_norm": 0.5549016727087507, + "learning_rate": 5.219211508367777e-06, + "loss": 3.4531, + "step": 48280 + }, + { + "epoch": 4.964021381578947, + "grad_norm": 0.7396044705732912, + "learning_rate": 5.216672397483404e-06, + "loss": 3.4507, + "step": 48290 + }, + { + "epoch": 4.965049342105263, + "grad_norm": 0.7937311725936709, + "learning_rate": 5.214148034948265e-06, + "loss": 3.4312, + "step": 48300 + }, + { + "epoch": 4.966077302631579, + "grad_norm": 0.8042150054660661, + "learning_rate": 5.211638421760932e-06, + "loss": 3.454, + "step": 48310 + }, + { + "epoch": 4.967105263157895, + "grad_norm": 0.9376462504434236, + "learning_rate": 5.209143558914144e-06, + "loss": 3.4744, + "step": 48320 + }, + { + "epoch": 4.968133223684211, + "grad_norm": 0.53688463537698, + "learning_rate": 5.206663447394811e-06, + "loss": 3.4571, + "step": 48330 + }, + { + "epoch": 4.9691611842105265, + "grad_norm": 0.7821570662012177, + "learning_rate": 5.204198088183998e-06, + "loss": 3.4037, + "step": 48340 + }, + { + "epoch": 4.9701891447368425, + "grad_norm": 0.9929459744054161, + "learning_rate": 5.201747482256942e-06, + "loss": 3.3606, + "step": 48350 + }, + { + "epoch": 4.9712171052631575, + "grad_norm": 0.713859004901377, + "learning_rate": 5.199311630583044e-06, + "loss": 3.4692, + "step": 48360 + }, + { + "epoch": 4.9722450657894735, + "grad_norm": 0.8606468499318094, + "learning_rate": 5.1968905341258605e-06, + "loss": 3.5376, + "step": 48370 + }, + { + "epoch": 4.973273026315789, + "grad_norm": 1.0353077327803897, + "learning_rate": 5.194484193843117e-06, + "loss": 3.4332, + "step": 48380 + }, + { + "epoch": 4.974300986842105, + "grad_norm": 0.7227576519730283, + "learning_rate": 5.192092610686703e-06, + "loss": 3.4719, + "step": 48390 + }, + { + "epoch": 4.975328947368421, + "grad_norm": 0.5524994853176982, + "learning_rate": 5.189715785602671e-06, + "loss": 3.5053, + "step": 48400 + }, + { + "epoch": 4.976356907894737, + "grad_norm": 0.7972197815248151, + "learning_rate": 5.1873537195312294e-06, + "loss": 3.4281, + "step": 48410 + }, + { + "epoch": 4.977384868421053, + "grad_norm": 0.601836625100832, + "learning_rate": 5.185006413406753e-06, + "loss": 3.4337, + "step": 48420 + }, + { + "epoch": 4.978412828947368, + "grad_norm": 0.8110714237709619, + "learning_rate": 5.18267386815778e-06, + "loss": 3.4195, + "step": 48430 + }, + { + "epoch": 4.979440789473684, + "grad_norm": 1.0428774218631163, + "learning_rate": 5.180356084707006e-06, + "loss": 3.5294, + "step": 48440 + }, + { + "epoch": 4.98046875, + "grad_norm": 0.9376566755983226, + "learning_rate": 5.178053063971284e-06, + "loss": 3.5363, + "step": 48450 + }, + { + "epoch": 4.981496710526316, + "grad_norm": 0.5209150027776025, + "learning_rate": 5.175764806861639e-06, + "loss": 3.3905, + "step": 48460 + }, + { + "epoch": 4.982524671052632, + "grad_norm": 
0.8808296689186155, + "learning_rate": 5.173491314283242e-06, + "loss": 3.4001, + "step": 48470 + }, + { + "epoch": 4.983552631578947, + "grad_norm": 0.7685629857547165, + "learning_rate": 5.171232587135427e-06, + "loss": 3.5326, + "step": 48480 + }, + { + "epoch": 4.984580592105263, + "grad_norm": 0.837825888242539, + "learning_rate": 5.168988626311696e-06, + "loss": 3.4783, + "step": 48490 + }, + { + "epoch": 4.985608552631579, + "grad_norm": 0.8087349736086858, + "learning_rate": 5.166759432699703e-06, + "loss": 3.4873, + "step": 48500 + }, + { + "epoch": 4.986636513157895, + "grad_norm": 0.6909364300205844, + "learning_rate": 5.164545007181256e-06, + "loss": 3.4407, + "step": 48510 + }, + { + "epoch": 4.987664473684211, + "grad_norm": 0.7755011568632665, + "learning_rate": 5.162345350632337e-06, + "loss": 3.3955, + "step": 48520 + }, + { + "epoch": 4.9886924342105265, + "grad_norm": 0.7373579599701129, + "learning_rate": 5.1601604639230605e-06, + "loss": 3.5239, + "step": 48530 + }, + { + "epoch": 4.9897203947368425, + "grad_norm": 0.76343764983967, + "learning_rate": 5.157990347917724e-06, + "loss": 3.4104, + "step": 48540 + }, + { + "epoch": 4.9907483552631575, + "grad_norm": 0.5848872117595741, + "learning_rate": 5.155835003474764e-06, + "loss": 3.4463, + "step": 48550 + }, + { + "epoch": 4.9917763157894735, + "grad_norm": 0.7074572296828954, + "learning_rate": 5.153694431446787e-06, + "loss": 3.4729, + "step": 48560 + }, + { + "epoch": 4.992804276315789, + "grad_norm": 0.9673071853463104, + "learning_rate": 5.151568632680545e-06, + "loss": 3.4422, + "step": 48570 + }, + { + "epoch": 4.993832236842105, + "grad_norm": 0.9673867323957939, + "learning_rate": 5.1494576080169525e-06, + "loss": 3.4507, + "step": 48580 + }, + { + "epoch": 4.994860197368421, + "grad_norm": 0.9566610437708657, + "learning_rate": 5.147361358291075e-06, + "loss": 3.4842, + "step": 48590 + }, + { + "epoch": 4.995888157894737, + "grad_norm": 1.220946606576944, + "learning_rate": 5.14527988433214e-06, + "loss": 3.5248, + "step": 48600 + }, + { + "epoch": 4.996916118421053, + "grad_norm": 0.7897430733077789, + "learning_rate": 5.143213186963525e-06, + "loss": 3.4686, + "step": 48610 + }, + { + "epoch": 4.997944078947368, + "grad_norm": 1.0468514283216726, + "learning_rate": 5.141161267002764e-06, + "loss": 3.4654, + "step": 48620 + }, + { + "epoch": 4.998972039473684, + "grad_norm": 0.7225310758487619, + "learning_rate": 5.1391241252615475e-06, + "loss": 3.4665, + "step": 48630 + }, + { + "epoch": 5.0, + "grad_norm": 0.6168580685024387, + "learning_rate": 5.137101762545713e-06, + "loss": 3.5104, + "step": 48640 + }, + { + "epoch": 5.001027960526316, + "grad_norm": 0.7618077935234525, + "learning_rate": 5.135094179655256e-06, + "loss": 3.4124, + "step": 48650 + }, + { + "epoch": 5.002055921052632, + "grad_norm": 0.629553728116564, + "learning_rate": 5.1331013773843305e-06, + "loss": 3.4185, + "step": 48660 + }, + { + "epoch": 5.003083881578948, + "grad_norm": 0.7388391556490675, + "learning_rate": 5.131123356521241e-06, + "loss": 3.4099, + "step": 48670 + }, + { + "epoch": 5.004111842105263, + "grad_norm": 0.8359382843299109, + "learning_rate": 5.1291601178484306e-06, + "loss": 3.4726, + "step": 48680 + }, + { + "epoch": 5.005139802631579, + "grad_norm": 1.1621304550182376, + "learning_rate": 5.127211662142522e-06, + "loss": 3.4225, + "step": 48690 + }, + { + "epoch": 5.006167763157895, + "grad_norm": 1.1272831636830882, + "learning_rate": 5.125277990174267e-06, + "loss": 3.3909, + "step": 48700 + }, + { + "epoch": 
5.007195723684211, + "grad_norm": 0.92537851064022, + "learning_rate": 5.12335910270858e-06, + "loss": 3.3957, + "step": 48710 + }, + { + "epoch": 5.0082236842105265, + "grad_norm": 0.6823918921746445, + "learning_rate": 5.121455000504522e-06, + "loss": 3.4062, + "step": 48720 + }, + { + "epoch": 5.0092516447368425, + "grad_norm": 0.9207458369657858, + "learning_rate": 5.119565684315316e-06, + "loss": 3.4136, + "step": 48730 + }, + { + "epoch": 5.0102796052631575, + "grad_norm": 0.5314285875721825, + "learning_rate": 5.117691154888318e-06, + "loss": 3.3581, + "step": 48740 + }, + { + "epoch": 5.0113075657894735, + "grad_norm": 0.6492015978662473, + "learning_rate": 5.11583141296505e-06, + "loss": 3.419, + "step": 48750 + }, + { + "epoch": 5.012335526315789, + "grad_norm": 0.7837358979047022, + "learning_rate": 5.113986459281181e-06, + "loss": 3.3961, + "step": 48760 + }, + { + "epoch": 5.013363486842105, + "grad_norm": 0.6891951686273006, + "learning_rate": 5.112156294566524e-06, + "loss": 3.4087, + "step": 48770 + }, + { + "epoch": 5.014391447368421, + "grad_norm": 0.7777441709122808, + "learning_rate": 5.1103409195450495e-06, + "loss": 3.439, + "step": 48780 + }, + { + "epoch": 5.015419407894737, + "grad_norm": 0.9362964852358056, + "learning_rate": 5.108540334934875e-06, + "loss": 3.4744, + "step": 48790 + }, + { + "epoch": 5.016447368421052, + "grad_norm": 0.80818294833678, + "learning_rate": 5.10675454144826e-06, + "loss": 3.4929, + "step": 48800 + }, + { + "epoch": 5.017475328947368, + "grad_norm": 0.6341780540038777, + "learning_rate": 5.10498353979163e-06, + "loss": 3.4693, + "step": 48810 + }, + { + "epoch": 5.018503289473684, + "grad_norm": 0.9249237853948089, + "learning_rate": 5.103227330665539e-06, + "loss": 3.4792, + "step": 48820 + }, + { + "epoch": 5.01953125, + "grad_norm": 0.6411555117246673, + "learning_rate": 5.1014859147647044e-06, + "loss": 3.4494, + "step": 48830 + }, + { + "epoch": 5.020559210526316, + "grad_norm": 0.7233730163837062, + "learning_rate": 5.0997592927779845e-06, + "loss": 3.474, + "step": 48840 + }, + { + "epoch": 5.021587171052632, + "grad_norm": 0.9915163067752629, + "learning_rate": 5.098047465388396e-06, + "loss": 3.4861, + "step": 48850 + }, + { + "epoch": 5.022615131578948, + "grad_norm": 0.5538710105457888, + "learning_rate": 5.096350433273079e-06, + "loss": 3.3951, + "step": 48860 + }, + { + "epoch": 5.023643092105263, + "grad_norm": 0.7178722495229439, + "learning_rate": 5.0946681971033484e-06, + "loss": 3.4274, + "step": 48870 + }, + { + "epoch": 5.024671052631579, + "grad_norm": 0.6047314492353403, + "learning_rate": 5.0930007575446535e-06, + "loss": 3.4587, + "step": 48880 + }, + { + "epoch": 5.025699013157895, + "grad_norm": 0.9393752733992878, + "learning_rate": 5.091348115256593e-06, + "loss": 3.4444, + "step": 48890 + }, + { + "epoch": 5.026726973684211, + "grad_norm": 0.5587714119463226, + "learning_rate": 5.089710270892901e-06, + "loss": 3.4643, + "step": 48900 + }, + { + "epoch": 5.0277549342105265, + "grad_norm": 1.6492207912502912, + "learning_rate": 5.088087225101476e-06, + "loss": 3.5639, + "step": 48910 + }, + { + "epoch": 5.0287828947368425, + "grad_norm": 1.0837418568447108, + "learning_rate": 5.0864789785243575e-06, + "loss": 3.4758, + "step": 48920 + }, + { + "epoch": 5.0298108552631575, + "grad_norm": 1.0106837453854398, + "learning_rate": 5.08488553179772e-06, + "loss": 3.5116, + "step": 48930 + }, + { + "epoch": 5.0308388157894735, + "grad_norm": 0.6548372457040622, + "learning_rate": 5.083306885551894e-06, + "loss": 
3.4066, + "step": 48940 + }, + { + "epoch": 5.031866776315789, + "grad_norm": 0.7393163458807732, + "learning_rate": 5.081743040411347e-06, + "loss": 3.4395, + "step": 48950 + }, + { + "epoch": 5.032894736842105, + "grad_norm": 0.6366247095975844, + "learning_rate": 5.080193996994705e-06, + "loss": 3.4732, + "step": 48960 + }, + { + "epoch": 5.033922697368421, + "grad_norm": 0.5772620797625997, + "learning_rate": 5.078659755914733e-06, + "loss": 3.4461, + "step": 48970 + }, + { + "epoch": 5.034950657894737, + "grad_norm": 0.7168895443253235, + "learning_rate": 5.077140317778326e-06, + "loss": 3.3854, + "step": 48980 + }, + { + "epoch": 5.035978618421052, + "grad_norm": 0.732253154394897, + "learning_rate": 5.075635683186544e-06, + "loss": 3.4965, + "step": 48990 + }, + { + "epoch": 5.037006578947368, + "grad_norm": 1.048969848832251, + "learning_rate": 5.074145852734584e-06, + "loss": 3.4387, + "step": 49000 + }, + { + "epoch": 5.038034539473684, + "grad_norm": 0.7392510397435792, + "learning_rate": 5.072670827011779e-06, + "loss": 3.4839, + "step": 49010 + }, + { + "epoch": 5.0390625, + "grad_norm": 0.8523022999112227, + "learning_rate": 5.07121060660162e-06, + "loss": 3.5101, + "step": 49020 + }, + { + "epoch": 5.040090460526316, + "grad_norm": 0.6431577207806962, + "learning_rate": 5.069765192081726e-06, + "loss": 3.4864, + "step": 49030 + }, + { + "epoch": 5.041118421052632, + "grad_norm": 0.9681160298295769, + "learning_rate": 5.0683345840238706e-06, + "loss": 3.4573, + "step": 49040 + }, + { + "epoch": 5.042146381578948, + "grad_norm": 0.7304221561687658, + "learning_rate": 5.066918782993969e-06, + "loss": 3.5417, + "step": 49050 + }, + { + "epoch": 5.043174342105263, + "grad_norm": 0.6514344645333116, + "learning_rate": 5.06551778955207e-06, + "loss": 3.4043, + "step": 49060 + }, + { + "epoch": 5.044202302631579, + "grad_norm": 0.7395646983919696, + "learning_rate": 5.064131604252378e-06, + "loss": 3.4694, + "step": 49070 + }, + { + "epoch": 5.045230263157895, + "grad_norm": 0.7985256153911542, + "learning_rate": 5.062760227643227e-06, + "loss": 3.3978, + "step": 49080 + }, + { + "epoch": 5.046258223684211, + "grad_norm": 0.8078360947016805, + "learning_rate": 5.061403660267108e-06, + "loss": 3.4437, + "step": 49090 + }, + { + "epoch": 5.0472861842105265, + "grad_norm": 0.8295247106820698, + "learning_rate": 5.060061902660637e-06, + "loss": 3.4266, + "step": 49100 + }, + { + "epoch": 5.0483141447368425, + "grad_norm": 0.7424569293415866, + "learning_rate": 5.0587349553545775e-06, + "loss": 3.4544, + "step": 49110 + }, + { + "epoch": 5.0493421052631575, + "grad_norm": 1.0706395672788631, + "learning_rate": 5.057422818873845e-06, + "loss": 3.3576, + "step": 49120 + }, + { + "epoch": 5.0503700657894735, + "grad_norm": 0.7565002878444449, + "learning_rate": 5.056125493737486e-06, + "loss": 3.3928, + "step": 49130 + }, + { + "epoch": 5.051398026315789, + "grad_norm": 0.8344872274181308, + "learning_rate": 5.054842980458688e-06, + "loss": 3.4662, + "step": 49140 + }, + { + "epoch": 5.052425986842105, + "grad_norm": 0.5669738293499982, + "learning_rate": 5.053575279544778e-06, + "loss": 3.4905, + "step": 49150 + }, + { + "epoch": 5.053453947368421, + "grad_norm": 0.7411391232245537, + "learning_rate": 5.052322391497231e-06, + "loss": 3.4536, + "step": 49160 + }, + { + "epoch": 5.054481907894737, + "grad_norm": 1.051223831848963, + "learning_rate": 5.051084316811655e-06, + "loss": 3.4295, + "step": 49170 + }, + { + "epoch": 5.055509868421052, + "grad_norm": 0.8180275109871562, + 
"learning_rate": 5.0498610559778026e-06, + "loss": 3.5133, + "step": 49180 + }, + { + "epoch": 5.056537828947368, + "grad_norm": 0.8529552304771713, + "learning_rate": 5.048652609479567e-06, + "loss": 3.4583, + "step": 49190 + }, + { + "epoch": 5.057565789473684, + "grad_norm": 0.7007891319622652, + "learning_rate": 5.047458977794979e-06, + "loss": 3.4995, + "step": 49200 + }, + { + "epoch": 5.05859375, + "grad_norm": 1.0319245761951332, + "learning_rate": 5.046280161396209e-06, + "loss": 3.4696, + "step": 49210 + }, + { + "epoch": 5.059621710526316, + "grad_norm": 0.6557943260737931, + "learning_rate": 5.04511616074956e-06, + "loss": 3.4986, + "step": 49220 + }, + { + "epoch": 5.060649671052632, + "grad_norm": 0.5878707781004777, + "learning_rate": 5.043966976315493e-06, + "loss": 3.3795, + "step": 49230 + }, + { + "epoch": 5.061677631578948, + "grad_norm": 0.8818645282558902, + "learning_rate": 5.042832608548591e-06, + "loss": 3.391, + "step": 49240 + }, + { + "epoch": 5.062705592105263, + "grad_norm": 0.7935399596086687, + "learning_rate": 5.04171305789758e-06, + "loss": 3.4574, + "step": 49250 + }, + { + "epoch": 5.063733552631579, + "grad_norm": 0.5801589087546832, + "learning_rate": 5.040608324805329e-06, + "loss": 3.4426, + "step": 49260 + }, + { + "epoch": 5.064761513157895, + "grad_norm": 0.7544530358205338, + "learning_rate": 5.039518409708845e-06, + "loss": 3.3682, + "step": 49270 + }, + { + "epoch": 5.065789473684211, + "grad_norm": 0.8712173168997452, + "learning_rate": 5.038443313039261e-06, + "loss": 3.3685, + "step": 49280 + }, + { + "epoch": 5.0668174342105265, + "grad_norm": 0.6810037211499939, + "learning_rate": 5.037383035221871e-06, + "loss": 3.4269, + "step": 49290 + }, + { + "epoch": 5.0678453947368425, + "grad_norm": 0.5790743379160013, + "learning_rate": 5.036337576676087e-06, + "loss": 3.397, + "step": 49300 + }, + { + "epoch": 5.0688733552631575, + "grad_norm": 0.7109304574470894, + "learning_rate": 5.035306937815465e-06, + "loss": 3.3861, + "step": 49310 + }, + { + "epoch": 5.0699013157894735, + "grad_norm": 0.6030804938858797, + "learning_rate": 5.0342911190477005e-06, + "loss": 3.5133, + "step": 49320 + }, + { + "epoch": 5.070929276315789, + "grad_norm": 0.69561882986261, + "learning_rate": 5.033290120774633e-06, + "loss": 3.4467, + "step": 49330 + }, + { + "epoch": 5.071957236842105, + "grad_norm": 1.16029547167467, + "learning_rate": 5.0323039433922225e-06, + "loss": 3.4526, + "step": 49340 + }, + { + "epoch": 5.072985197368421, + "grad_norm": 1.1929453429678134, + "learning_rate": 5.0313325872905875e-06, + "loss": 3.4073, + "step": 49350 + }, + { + "epoch": 5.074013157894737, + "grad_norm": 0.7143350677810358, + "learning_rate": 5.030376052853958e-06, + "loss": 3.392, + "step": 49360 + }, + { + "epoch": 5.075041118421052, + "grad_norm": 0.5814758569538636, + "learning_rate": 5.029434340460726e-06, + "loss": 3.3708, + "step": 49370 + }, + { + "epoch": 5.076069078947368, + "grad_norm": 0.9284644215884684, + "learning_rate": 5.028507450483401e-06, + "loss": 3.4553, + "step": 49380 + }, + { + "epoch": 5.077097039473684, + "grad_norm": 0.7320067997027734, + "learning_rate": 5.027595383288648e-06, + "loss": 3.5027, + "step": 49390 + }, + { + "epoch": 5.078125, + "grad_norm": 0.9362226048779207, + "learning_rate": 5.0266981392372495e-06, + "loss": 3.4139, + "step": 49400 + }, + { + "epoch": 5.079152960526316, + "grad_norm": 0.838779715794751, + "learning_rate": 5.025815718684135e-06, + "loss": 3.4277, + "step": 49410 + }, + { + "epoch": 5.080180921052632, + 
"grad_norm": 0.8521636101200173, + "learning_rate": 5.024948121978369e-06, + "loss": 3.4277, + "step": 49420 + }, + { + "epoch": 5.081208881578948, + "grad_norm": 0.8854987873066935, + "learning_rate": 5.024095349463145e-06, + "loss": 3.4701, + "step": 49430 + }, + { + "epoch": 5.082236842105263, + "grad_norm": 0.9142835300921849, + "learning_rate": 5.0232574014758095e-06, + "loss": 3.4019, + "step": 49440 + }, + { + "epoch": 5.083264802631579, + "grad_norm": 0.606073360665629, + "learning_rate": 5.022434278347828e-06, + "loss": 3.4792, + "step": 49450 + }, + { + "epoch": 5.084292763157895, + "grad_norm": 0.8845733029345945, + "learning_rate": 5.021625980404804e-06, + "loss": 3.4562, + "step": 49460 + }, + { + "epoch": 5.085320723684211, + "grad_norm": 0.9897168855728146, + "learning_rate": 5.020832507966483e-06, + "loss": 3.4553, + "step": 49470 + }, + { + "epoch": 5.0863486842105265, + "grad_norm": 0.9682492530932157, + "learning_rate": 5.020053861346743e-06, + "loss": 3.4277, + "step": 49480 + }, + { + "epoch": 5.0873766447368425, + "grad_norm": 0.7201913396588469, + "learning_rate": 5.019290040853597e-06, + "loss": 3.3722, + "step": 49490 + }, + { + "epoch": 5.0884046052631575, + "grad_norm": 0.7623537595623049, + "learning_rate": 5.018541046789194e-06, + "loss": 3.5106, + "step": 49500 + }, + { + "epoch": 5.0894325657894735, + "grad_norm": 0.6314003812483663, + "learning_rate": 5.0178068794498165e-06, + "loss": 3.3477, + "step": 49510 + }, + { + "epoch": 5.090460526315789, + "grad_norm": 0.7378892614817504, + "learning_rate": 5.017087539125881e-06, + "loss": 3.4337, + "step": 49520 + }, + { + "epoch": 5.091488486842105, + "grad_norm": 1.6697004204860053, + "learning_rate": 5.01638302610194e-06, + "loss": 3.4438, + "step": 49530 + }, + { + "epoch": 5.092516447368421, + "grad_norm": 0.8523640491225761, + "learning_rate": 5.015693340656687e-06, + "loss": 3.4596, + "step": 49540 + }, + { + "epoch": 5.093544407894737, + "grad_norm": 0.6730668342274373, + "learning_rate": 5.015018483062941e-06, + "loss": 3.3523, + "step": 49550 + }, + { + "epoch": 5.094572368421052, + "grad_norm": 0.6581758216152931, + "learning_rate": 5.014358453587654e-06, + "loss": 3.4191, + "step": 49560 + }, + { + "epoch": 5.095600328947368, + "grad_norm": 0.6969163591573583, + "learning_rate": 5.0137132524919256e-06, + "loss": 3.3615, + "step": 49570 + }, + { + "epoch": 5.096628289473684, + "grad_norm": 0.82724582139381, + "learning_rate": 5.0130828800309735e-06, + "loss": 3.3611, + "step": 49580 + }, + { + "epoch": 5.09765625, + "grad_norm": 0.9286587898062285, + "learning_rate": 5.0124673364541595e-06, + "loss": 3.4474, + "step": 49590 + }, + { + "epoch": 5.098684210526316, + "grad_norm": 0.6138343746434719, + "learning_rate": 5.011866622004985e-06, + "loss": 3.4289, + "step": 49600 + }, + { + "epoch": 5.099712171052632, + "grad_norm": 0.6125815220512286, + "learning_rate": 5.0112807369210655e-06, + "loss": 3.3989, + "step": 49610 + }, + { + "epoch": 5.100740131578948, + "grad_norm": 0.68611097901026, + "learning_rate": 5.010709681434168e-06, + "loss": 3.4779, + "step": 49620 + }, + { + "epoch": 5.101768092105263, + "grad_norm": 0.6869792049389631, + "learning_rate": 5.010153455770186e-06, + "loss": 3.4447, + "step": 49630 + }, + { + "epoch": 5.102796052631579, + "grad_norm": 0.5932729963321743, + "learning_rate": 5.009612060149152e-06, + "loss": 3.384, + "step": 49640 + }, + { + "epoch": 5.103824013157895, + "grad_norm": 0.884324862942736, + "learning_rate": 5.009085494785228e-06, + "loss": 3.469, + "step": 49650 + 
}, + { + "epoch": 5.104851973684211, + "grad_norm": 0.6206737513760976, + "learning_rate": 5.008573759886709e-06, + "loss": 3.3794, + "step": 49660 + }, + { + "epoch": 5.1058799342105265, + "grad_norm": 0.7306225864576118, + "learning_rate": 5.008076855656021e-06, + "loss": 3.4478, + "step": 49670 + }, + { + "epoch": 5.1069078947368425, + "grad_norm": 0.6658685919036361, + "learning_rate": 5.0075947822897285e-06, + "loss": 3.4273, + "step": 49680 + }, + { + "epoch": 5.1079358552631575, + "grad_norm": 0.8741039047281429, + "learning_rate": 5.007127539978534e-06, + "loss": 3.4622, + "step": 49690 + }, + { + "epoch": 5.1089638157894735, + "grad_norm": 1.2423682890992016, + "learning_rate": 5.006675128907256e-06, + "loss": 3.4046, + "step": 49700 + }, + { + "epoch": 5.109991776315789, + "grad_norm": 0.8882498273433278, + "learning_rate": 5.006237549254869e-06, + "loss": 3.4221, + "step": 49710 + }, + { + "epoch": 5.111019736842105, + "grad_norm": 0.9622597320195325, + "learning_rate": 5.005814801194458e-06, + "loss": 3.3802, + "step": 49720 + }, + { + "epoch": 5.112047697368421, + "grad_norm": 0.8634757258720288, + "learning_rate": 5.005406884893257e-06, + "loss": 3.4158, + "step": 49730 + }, + { + "epoch": 5.113075657894737, + "grad_norm": 1.0682595047656234, + "learning_rate": 5.005013800512628e-06, + "loss": 3.442, + "step": 49740 + }, + { + "epoch": 5.114103618421052, + "grad_norm": 0.6921332034478641, + "learning_rate": 5.004635548208064e-06, + "loss": 3.3169, + "step": 49750 + }, + { + "epoch": 5.115131578947368, + "grad_norm": 0.8732541582809592, + "learning_rate": 5.004272128129186e-06, + "loss": 3.5164, + "step": 49760 + }, + { + "epoch": 5.116159539473684, + "grad_norm": 0.7293022756269304, + "learning_rate": 5.003923540419764e-06, + "loss": 3.4293, + "step": 49770 + }, + { + "epoch": 5.1171875, + "grad_norm": 0.6158223082863662, + "learning_rate": 5.003589785217689e-06, + "loss": 3.3778, + "step": 49780 + }, + { + "epoch": 5.118215460526316, + "grad_norm": 0.6837557355511161, + "learning_rate": 5.003270862654975e-06, + "loss": 3.3806, + "step": 49790 + }, + { + "epoch": 5.119243421052632, + "grad_norm": 0.6107970542160926, + "learning_rate": 5.002966772857795e-06, + "loss": 3.3917, + "step": 49800 + }, + { + "epoch": 5.120271381578948, + "grad_norm": 1.9894248206886938, + "learning_rate": 5.00267751594643e-06, + "loss": 3.4706, + "step": 49810 + }, + { + "epoch": 5.121299342105263, + "grad_norm": 0.6419569218776172, + "learning_rate": 5.002403092035306e-06, + "loss": 3.4219, + "step": 49820 + }, + { + "epoch": 5.122327302631579, + "grad_norm": 0.8118674677146276, + "learning_rate": 5.0021435012329765e-06, + "loss": 3.4042, + "step": 49830 + }, + { + "epoch": 5.123355263157895, + "grad_norm": 0.7159247640928055, + "learning_rate": 5.001898743642132e-06, + "loss": 3.4331, + "step": 49840 + }, + { + "epoch": 5.124383223684211, + "grad_norm": 0.8178331599761715, + "learning_rate": 5.00166881935959e-06, + "loss": 3.4512, + "step": 49850 + }, + { + "epoch": 5.1254111842105265, + "grad_norm": 0.6578789883535635, + "learning_rate": 5.001453728476302e-06, + "loss": 3.4819, + "step": 49860 + }, + { + "epoch": 5.1264391447368425, + "grad_norm": 0.7020914747028358, + "learning_rate": 5.0012534710773555e-06, + "loss": 3.4328, + "step": 49870 + }, + { + "epoch": 5.1274671052631575, + "grad_norm": 0.7772915693242293, + "learning_rate": 5.00106804724196e-06, + "loss": 3.4888, + "step": 49880 + }, + { + "epoch": 5.1284950657894735, + "grad_norm": 0.824860779257147, + "learning_rate": 
5.000897457043477e-06, + "loss": 3.4881, + "step": 49890 + }, + { + "epoch": 5.129523026315789, + "grad_norm": 0.5924059563054651, + "learning_rate": 5.000741700549378e-06, + "loss": 3.4165, + "step": 49900 + }, + { + "epoch": 5.130550986842105, + "grad_norm": 0.8105366533852852, + "learning_rate": 5.0006007778212795e-06, + "loss": 3.39, + "step": 49910 + }, + { + "epoch": 5.131578947368421, + "grad_norm": 1.0329246217750374, + "learning_rate": 5.0004746889149255e-06, + "loss": 3.4446, + "step": 49920 + }, + { + "epoch": 5.132606907894737, + "grad_norm": 0.8026845503517257, + "learning_rate": 5.0003634338802e-06, + "loss": 3.4791, + "step": 49930 + }, + { + "epoch": 5.133634868421052, + "grad_norm": 0.8809323502654283, + "learning_rate": 5.000267012761102e-06, + "loss": 3.4106, + "step": 49940 + }, + { + "epoch": 5.134662828947368, + "grad_norm": 0.7270864012980474, + "learning_rate": 5.000185425595778e-06, + "loss": 3.4471, + "step": 49950 + }, + { + "epoch": 5.135690789473684, + "grad_norm": 0.5525225985121401, + "learning_rate": 5.000118672416506e-06, + "loss": 3.4354, + "step": 49960 + }, + { + "epoch": 5.13671875, + "grad_norm": 0.7228475781351282, + "learning_rate": 5.000066753249691e-06, + "loss": 3.4191, + "step": 49970 + }, + { + "epoch": 5.137746710526316, + "grad_norm": 0.7938120825210554, + "learning_rate": 5.000029668115865e-06, + "loss": 3.4416, + "step": 49980 + }, + { + "epoch": 5.138774671052632, + "grad_norm": 0.6443587101117711, + "learning_rate": 5.0000074170297e-06, + "loss": 3.5288, + "step": 49990 + }, + { + "epoch": 5.139802631578948, + "grad_norm": 0.6156145162706346, + "learning_rate": 5e-06, + "loss": 3.4188, + "step": 50000 + }, + { + "epoch": 5.139802631578948, + "step": 50000, + "total_flos": 3.2714589732864e+16, + "train_loss": 1.3955855757904052, + "train_runtime": 264657.9405, + "train_samples_per_second": 12.091, + "train_steps_per_second": 0.189 + } + ], + "logging_steps": 10, + "max_steps": 50000, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.2714589732864e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}
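
The file above is the usual Hugging Face `Trainer` state dump: `log_history` holds one record every `logging_steps` (10) steps plus a closing summary record, and the trailer captures the run configuration (`max_steps`, `save_steps`, `num_train_epochs`, `train_batch_size`, callback flags). As a minimal sketch of how such a file can be consumed, assuming only a local copy named `trainer_state.json` and treating matplotlib as an optional extra (neither is part of this run's tooling), the following Python reads the log and summarizes the loss and learning-rate curves by step:

import json

# Minimal sketch: load a Trainer state file and summarize the logged curves.
# "trainer_state.json" is an assumed local path, not a path taken from this run.
with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step entries carry "loss" and "learning_rate"; the closing summary record
# (train_loss / train_runtime / total_flos) does not, so it is filtered out here.
logs = [e for e in state["log_history"] if "loss" in e and "learning_rate" in e]
steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]
lrs = [e["learning_rate"] for e in logs]

print(f"{len(logs)} logged points; final step {steps[-1]}, final loss {losses[-1]:.4f}")

try:
    import matplotlib.pyplot as plt
    fig, ax_loss = plt.subplots()
    ax_loss.plot(steps, losses, color="tab:blue")
    ax_loss.set_xlabel("step")
    ax_loss.set_ylabel("loss")
    ax_lr = ax_loss.twinx()  # second y-axis for the learning-rate schedule
    ax_lr.plot(steps, lrs, color="tab:orange")
    ax_lr.set_ylabel("learning_rate")
    fig.tight_layout()
    fig.savefig("training_curves.png")
except ImportError:
    pass  # plotting is optional; the printed summary above is enough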