diff --git "a/checkpoint-12000/trainer_state.json" "b/checkpoint-12000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-12000/trainer_state.json"
@@ -0,0 +1,84208 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.33110641277242986,
+  "eval_steps": 500,
+  "global_step": 12000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0,
+      "eval_runtime": 27.428,
+      "eval_samples_per_second": 1.167,
+      "eval_steps_per_second": 0.146,
+      "step": 0
+    },
+    {
+      "epoch": 2.7592201064369157e-05,
+      "grad_norm": 3.6326730251312256,
+      "learning_rate": 0.001,
+      "loss": 0.4698,
+      "step": 1
+    },
+    {
+      "epoch": 5.5184402128738314e-05,
+      "grad_norm": 0.062148336321115494,
+      "learning_rate": 0.001,
+      "loss": 0.4517,
+      "step": 2
+    },
+    {
+      "epoch": 8.277660319310746e-05,
+      "grad_norm": 0.0332246758043766,
+      "learning_rate": 0.001,
+      "loss": 0.4677,
+      "step": 3
+    },
+    {
+      "epoch": 0.00011036880425747663,
+      "grad_norm": 0.10165657848119736,
+      "learning_rate": 0.001,
+      "loss": 0.4603,
+      "step": 4
+    },
+    {
+      "epoch": 0.00013796100532184578,
+      "grad_norm": 0.04767799377441406,
+      "learning_rate": 0.001,
+      "loss": 0.4623,
+      "step": 5
+    },
+    {
+      "epoch": 0.00016555320638621493,
+      "grad_norm": 0.023448018357157707,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 6
+    },
+    {
+      "epoch": 0.0001931454074505841,
+      "grad_norm": 0.013601069338619709,
+      "learning_rate": 0.001,
+      "loss": 0.4374,
+      "step": 7
+    },
+    {
+      "epoch": 0.00022073760851495325,
+      "grad_norm": 0.009370611980557442,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 8
+    },
+    {
+      "epoch": 0.0002483298095793224,
+      "grad_norm": 0.008329728618264198,
+      "learning_rate": 0.001,
+      "loss": 0.4436,
+      "step": 9
+    },
+    {
+      "epoch": 0.00027592201064369155,
+      "grad_norm": 0.004966976586729288,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 10
+    },
+    {
+      "epoch": 0.0003035142117080607,
+      "grad_norm": 0.004952501505613327,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 11
+    },
+    {
+      "epoch": 0.00033110641277242985,
+      "grad_norm": 0.008474660106003284,
+      "learning_rate": 0.001,
+      "loss": 0.4487,
+      "step": 12
+    },
+    {
+      "epoch": 0.000358698613836799,
+      "grad_norm": 0.004617814905941486,
+      "learning_rate": 0.001,
+      "loss": 0.3752,
+      "step": 13
+    },
+    {
+      "epoch": 0.0003862908149011682,
+      "grad_norm": 0.0045386566780507565,
+      "learning_rate": 0.001,
+      "loss": 0.4456,
+      "step": 14
+    },
+    {
+      "epoch": 0.00041388301596553736,
+      "grad_norm": 0.0031611656304448843,
+      "learning_rate": 0.001,
+      "loss": 0.3454,
+      "step": 15
+    },
+    {
+      "epoch": 0.0004414752170299065,
+      "grad_norm": 0.0026714960113167763,
+      "learning_rate": 0.001,
+      "loss": 0.4236,
+      "step": 16
+    },
+    {
+      "epoch": 0.00046906741809427566,
+      "grad_norm": 0.0028185201808810234,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 17
+    },
+    {
+      "epoch": 0.0004966596191586448,
+      "grad_norm": 0.002944110194221139,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 18
+    },
+    {
+      "epoch": 0.000524251820223014,
+      "grad_norm": 0.004909531679004431,
+      "learning_rate": 0.001,
+      "loss": 0.4356,
+      "step": 19
+    },
+    {
+      "epoch": 0.0005518440212873831,
+      "grad_norm": 0.0034190011210739613,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 20
+    },
+    {
+      "epoch": 0.0005794362223517523,
+      "grad_norm": 0.003598715178668499,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 21
+    },
+    {
+      "epoch": 0.0006070284234161214,
+      "grad_norm": 0.004229442682117224,
+      "learning_rate": 0.001,
+      "loss": 0.461,
+      "step": 22
+    },
+    {
+      "epoch": 0.0006346206244804906,
+      "grad_norm": 0.0024963070172816515,
+      "learning_rate": 0.001,
+      "loss": 0.3674,
+      "step": 23
+    },
+    {
+      "epoch": 0.0006622128255448597,
+      "grad_norm": 0.002500817645341158,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 24
+    },
+    {
+      "epoch": 0.0006898050266092289,
+      "grad_norm": 0.002520238049328327,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 25
+    },
+    {
+      "epoch": 0.000717397227673598,
+      "grad_norm": 0.002308758907020092,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 26
+    },
+    {
+      "epoch": 0.0007449894287379673,
+      "grad_norm": 0.0020286778453737497,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 27
+    },
+    {
+      "epoch": 0.0007725816298023364,
+      "grad_norm": 0.0013328269124031067,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 28
+    },
+    {
+      "epoch": 0.0008001738308667056,
+      "grad_norm": 0.003233084687963128,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 29
+    },
+    {
+      "epoch": 0.0008277660319310747,
+      "grad_norm": 0.0017853466561064124,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 30
+    },
+    {
+      "epoch": 0.0008553582329954439,
+      "grad_norm": 0.0021520040463656187,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 31
+    },
+    {
+      "epoch": 0.000882950434059813,
+      "grad_norm": 0.002464903285726905,
+      "learning_rate": 0.001,
+      "loss": 0.4404,
+      "step": 32
+    },
+    {
+      "epoch": 0.0009105426351241822,
+      "grad_norm": 0.0017728271195665002,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 33
+    },
+    {
+      "epoch": 0.0009381348361885513,
+      "grad_norm": 0.001393413869664073,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 34
+    },
+    {
+      "epoch": 0.0009657270372529205,
+      "grad_norm": 0.001682270085439086,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 35
+    },
+    {
+      "epoch": 0.0009933192383172896,
+      "grad_norm": 0.0018528420478105545,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 36
+    },
+    {
+      "epoch": 0.0010209114393816589,
+      "grad_norm": 0.0014731371775269508,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 37
+    },
+    {
+      "epoch": 0.001048503640446028,
+      "grad_norm": 0.0012595922453328967,
+      "learning_rate": 0.001,
+      "loss": 0.4372,
+      "step": 38
+    },
+    {
+      "epoch": 0.0010760958415103972,
+      "grad_norm": 0.0011791549623012543,
+      "learning_rate": 0.001,
+      "loss": 0.4218,
+      "step": 39
+    },
+    {
+      "epoch": 0.0011036880425747662,
+      "grad_norm": 0.001127090072259307,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 40
+    },
+    {
+      "epoch": 0.0011312802436391355,
+      "grad_norm": 0.0015457386616617441,
+      "learning_rate": 0.001,
+      "loss": 0.4273,
+      "step": 41
+    },
+    {
+      "epoch": 0.0011588724447035045,
+      "grad_norm": 0.0013682112330570817,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 42
+    },
+    {
+      "epoch": 0.0011864646457678738,
+      "grad_norm": 0.0012363678542897105,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 43
+    },
+    {
+      "epoch": 0.0012140568468322428,
+      "grad_norm": 0.0017921420512720942,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 44
+    },
+    {
+      "epoch": 0.001241649047896612,
+      "grad_norm": 0.001728672650642693,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 45
+    },
+    {
+      "epoch": 0.0012692412489609811,
+      "grad_norm": 0.0038352590054273605,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 46
+    },
+    {
+      "epoch": 0.0012968334500253504,
+      "grad_norm": 0.001102472422644496,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 47
+    },
+    {
+      "epoch": 0.0013244256510897194,
+      "grad_norm": 0.0060236481949687,
+      "learning_rate": 0.001,
+      "loss": 0.4381,
+      "step": 48
+    },
+    {
+      "epoch": 0.0013520178521540887,
+      "grad_norm": 0.0013644751161336899,
+      "learning_rate": 0.001,
+      "loss": 0.4319,
+      "step": 49
+    },
+    {
+      "epoch": 0.0013796100532184577,
+      "grad_norm": 0.0019051303388550878,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 50
+    },
+    {
+      "epoch": 0.001407202254282827,
+      "grad_norm": 0.0021409685723483562,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 51
+    },
+    {
+      "epoch": 0.001434794455347196,
+      "grad_norm": 0.0015203810762614012,
+      "learning_rate": 0.001,
+      "loss": 0.3805,
+      "step": 52
+    },
+    {
+      "epoch": 0.0014623866564115653,
+      "grad_norm": 0.0018806306179612875,
+      "learning_rate": 0.001,
+      "loss": 0.4277,
+      "step": 53
+    },
+    {
+      "epoch": 0.0014899788574759345,
+      "grad_norm": 0.0024247588589787483,
+      "learning_rate": 0.001,
+      "loss": 0.4347,
+      "step": 54
+    },
+    {
+      "epoch": 0.0015175710585403036,
+      "grad_norm": 0.000940295634791255,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 55
+    },
+    {
+      "epoch": 0.0015451632596046728,
+      "grad_norm": 0.0012640036875382066,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 56
+    },
+    {
+      "epoch": 0.0015727554606690419,
+      "grad_norm": 0.00419093482196331,
+      "learning_rate": 0.001,
+      "loss": 0.3778,
+      "step": 57
+    },
+    {
+      "epoch": 0.0016003476617334111,
+      "grad_norm": 0.002213024301454425,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 58
+    },
+    {
+      "epoch": 0.0016279398627977802,
+      "grad_norm": 0.004215582739561796,
+      "learning_rate": 0.001,
+      "loss": 0.4603,
+      "step": 59
+    },
+    {
+      "epoch": 0.0016555320638621494,
+      "grad_norm": 0.0016123323002830148,
+      "learning_rate": 0.001,
+      "loss": 0.4395,
+      "step": 60
+    },
+    {
+      "epoch": 0.0016831242649265185,
+      "grad_norm": 0.0014739107573404908,
+      "learning_rate": 0.001,
+      "loss": 0.3739,
+      "step": 61
+    },
+    {
+      "epoch": 0.0017107164659908877,
+      "grad_norm": 0.0014588399790227413,
+      "learning_rate": 0.001,
+      "loss": 0.4476,
+      "step": 62
+    },
+    {
+      "epoch": 0.0017383086670552568,
+      "grad_norm": 0.0037299874238669872,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 63
+    },
+    {
+      "epoch": 0.001765900868119626,
+      "grad_norm": 0.0027305828407406807,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 64
+    },
+    {
+      "epoch": 0.001793493069183995,
+      "grad_norm": 0.0027416169177740812,
+      "learning_rate": 0.001,
+      "loss": 0.4318,
+      "step": 65
+    },
+    {
+      "epoch": 0.0018210852702483643,
+      "grad_norm": 0.0017876614583656192,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 66
+    },
+    {
+      "epoch": 0.0018486774713127334,
+      "grad_norm": 0.0019625681452453136,
+      "learning_rate": 0.001,
+      "loss": 0.443,
+      "step": 67
+    },
+    {
+      "epoch": 0.0018762696723771026,
+      "grad_norm": 0.0032296774443238974,
+      "learning_rate": 0.001,
+      "loss": 0.4341,
+      "step": 68
+    },
+    {
+      "epoch": 0.0019038618734414717,
+      "grad_norm": 0.0022506825625896454,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 69
+    },
+    {
+      "epoch": 0.001931454074505841,
+      "grad_norm": 0.002406027168035507,
+      "learning_rate": 0.001,
+      "loss": 0.4453,
+      "step": 70
+    },
+    {
+      "epoch": 0.00195904627557021,
+      "grad_norm": 0.0014062536647543311,
+      "learning_rate": 0.001,
+      "loss": 0.3747,
+      "step": 71
+    },
+    {
+      "epoch": 0.0019866384766345792,
+      "grad_norm": 0.0020411391742527485,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 72
+    },
+    {
+      "epoch": 0.0020142306776989483,
+      "grad_norm": 0.0023991591297090054,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 73
+    },
+    {
+      "epoch": 0.0020418228787633178,
+      "grad_norm": 0.002641907660290599,
+      "learning_rate": 0.001,
+      "loss": 0.4308,
+      "step": 74
+    },
+    {
+      "epoch": 0.002069415079827687,
+      "grad_norm": 0.001838643685914576,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 75
+    },
+    {
+      "epoch": 0.002097007280892056,
+      "grad_norm": 0.0014234330737963319,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 76
+    },
+    {
+      "epoch": 0.002124599481956425,
+      "grad_norm": 0.0020837297197431326,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 77
+    },
+    {
+      "epoch": 0.0021521916830207944,
+      "grad_norm": 0.002444372745230794,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 78
+    },
+    {
+      "epoch": 0.0021797838840851634,
+      "grad_norm": 0.001574559137225151,
+      "learning_rate": 0.001,
+      "loss": 0.447,
+      "step": 79
+    },
+    {
+      "epoch": 0.0022073760851495324,
+      "grad_norm": 0.002465339843183756,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 80
+    },
+    {
+      "epoch": 0.0022349682862139015,
+      "grad_norm": 0.0014318680623546243,
+      "learning_rate": 0.001,
+      "loss": 0.3763,
+      "step": 81
+    },
+    {
+      "epoch": 0.002262560487278271,
+      "grad_norm": 0.0018497896380722523,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 82
+    },
+    {
+      "epoch": 0.00229015268834264,
+      "grad_norm": 0.0023651847150176764,
+      "learning_rate": 0.001,
+      "loss": 0.4341,
+      "step": 83
+    },
+    {
+      "epoch": 0.002317744889407009,
+      "grad_norm": 0.0019378148717805743,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 84
+    },
+    {
+      "epoch": 0.002345337090471378,
+      "grad_norm": 0.0015967771178111434,
+      "learning_rate": 0.001,
+      "loss": 0.3456,
+      "step": 85
+    },
+    {
+      "epoch": 0.0023729292915357476,
+      "grad_norm": 0.0013114806497469544,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 86
+    },
+    {
+      "epoch": 0.0024005214926001166,
+      "grad_norm": 0.0018527540378272533,
+      "learning_rate": 0.001,
+      "loss": 0.4442,
+      "step": 87
+    },
+    {
+      "epoch": 0.0024281136936644856,
+      "grad_norm": 0.0018121429020538926,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 88
+    },
+    {
+      "epoch": 0.0024557058947288547,
+      "grad_norm": 0.0017777991015464067,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 89
+    },
+    {
+      "epoch": 0.002483298095793224,
+      "grad_norm": 0.0014042413095012307,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 90
+    },
+    {
+      "epoch": 0.002510890296857593,
+      "grad_norm": 0.0023180118296295404,
+      "learning_rate": 0.001,
+      "loss": 0.3691,
+      "step": 91
+    },
+    {
+      "epoch": 0.0025384824979219622,
+      "grad_norm": 0.0018260233337059617,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 92
+    },
+    {
+      "epoch": 0.0025660746989863317,
+      "grad_norm": 0.0020317393355071545,
+      "learning_rate": 0.001,
+      "loss": 0.43,
+      "step": 93
+    },
+    {
+      "epoch": 0.0025936669000507008,
+      "grad_norm": 0.0014620552537962794,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 94
+    },
+    {
+      "epoch": 0.00262125910111507,
+      "grad_norm": 0.0030886537861078978,
+      "learning_rate": 0.001,
+      "loss": 0.4325,
+      "step": 95
+    },
+    {
+      "epoch": 0.002648851302179439,
+      "grad_norm": 0.001787678455002606,
+      "learning_rate": 0.001,
+      "loss": 0.4261,
+      "step": 96
+    },
+    {
+      "epoch": 0.0026764435032438083,
+      "grad_norm": 0.0020450972951948643,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 97
+    },
+    {
+      "epoch": 0.0027040357043081773,
+      "grad_norm": 0.0015356248477473855,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 98
+    },
+    {
+      "epoch": 0.0027316279053725464,
+      "grad_norm": 0.0019487686222419143,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 99
+    },
+    {
+      "epoch": 0.0027592201064369154,
+      "grad_norm": 0.0016876134322956204,
+      "learning_rate": 0.001,
+      "loss": 0.4376,
+      "step": 100
+    },
+    {
+      "epoch": 0.002786812307501285,
+      "grad_norm": 0.0015564693603664637,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 101
+    },
+    {
+      "epoch": 0.002814404508565654,
+      "grad_norm": 0.0011805463582277298,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 102
+    },
+    {
+      "epoch": 0.002841996709630023,
+      "grad_norm": 0.0027237439062446356,
+      "learning_rate": 0.001,
+      "loss": 0.4386,
+      "step": 103
+    },
+    {
+      "epoch": 0.002869588910694392,
+      "grad_norm": 0.001259063370525837,
+      "learning_rate": 0.001,
+      "loss": 0.3692,
+      "step": 104
+    },
+    {
+      "epoch": 0.0028971811117587615,
+      "grad_norm": 0.0012861357536166906,
+      "learning_rate": 0.001,
+      "loss": 0.437,
+      "step": 105
+    },
+    {
+      "epoch": 0.0029247733128231305,
+      "grad_norm": 0.0017483624396845698,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 106
+    },
+    {
+      "epoch": 0.0029523655138874996,
+      "grad_norm": 0.0022901813499629498,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 107
+    },
+    {
+      "epoch": 0.002979957714951869,
+      "grad_norm": 0.00259222649037838,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 108
+    },
+    {
+      "epoch": 0.003007549916016238,
+      "grad_norm": 0.0057897912338376045,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 109
+    },
+    {
+      "epoch": 0.003035142117080607,
+      "grad_norm": 0.002251250436529517,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 110
+    },
+    {
+      "epoch": 0.003062734318144976,
+      "grad_norm": 0.0033183628693223,
+      "learning_rate": 0.001,
+      "loss": 0.4267,
+      "step": 111
+    },
+    {
+      "epoch": 0.0030903265192093457,
+      "grad_norm": 0.001699024927802384,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 112
+    },
+    {
+      "epoch": 0.0031179187202737147,
+      "grad_norm": 0.002592903096228838,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 113
+    },
+    {
+      "epoch": 0.0031455109213380837,
+      "grad_norm": 0.001526323496364057,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 114
+    },
+    {
+      "epoch": 0.003173103122402453,
+      "grad_norm": 0.0022348894271999598,
+      "learning_rate": 0.001,
+      "loss": 0.4369,
+      "step": 115
+    },
+    {
+      "epoch": 0.0032006953234668223,
+      "grad_norm": 0.0024093682877719402,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 116
+    },
+    {
+      "epoch": 0.0032282875245311913,
+      "grad_norm": 0.0054725524969398975,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 117
+    },
+    {
+      "epoch": 0.0032558797255955603,
+      "grad_norm": 0.0026599527336657047,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 118
+    },
+    {
+      "epoch": 0.0032834719266599294,
+      "grad_norm": 0.002410522662103176,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 119
+    },
+    {
+      "epoch": 0.003311064127724299,
+      "grad_norm": 0.0026565720327198505,
+      "learning_rate": 0.001,
+      "loss": 0.4274,
+      "step": 120
+    },
+    {
+      "epoch": 0.003338656328788668,
+      "grad_norm": 0.0017594440141692758,
+      "learning_rate": 0.001,
+      "loss": 0.3748,
+      "step": 121
+    },
+    {
+      "epoch": 0.003366248529853037,
+      "grad_norm": 0.001644355827011168,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 122
+    },
+    {
+      "epoch": 0.003393840730917406,
+      "grad_norm": 0.001211437862366438,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 123
+    },
+    {
+      "epoch": 0.0034214329319817755,
+      "grad_norm": 0.0012707557762041688,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 124
+    },
+    {
+      "epoch": 0.0034490251330461445,
+      "grad_norm": 0.00231559993699193,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 125
+    },
+    {
+      "epoch": 0.0034766173341105135,
+      "grad_norm": 0.003967110998928547,
+      "learning_rate": 0.001,
+      "loss": 0.4437,
+      "step": 126
+    },
+    {
+      "epoch": 0.003504209535174883,
+      "grad_norm": 0.0014507032465189695,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 127
+    },
+    {
+      "epoch": 0.003531801736239252,
+      "grad_norm": 0.0023966797161847353,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 128
+    },
+    {
+      "epoch": 0.003559393937303621,
+      "grad_norm": 0.0019388011423870921,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 129
+    },
+    {
+      "epoch": 0.00358698613836799,
+      "grad_norm": 0.0012276334455236793,
+      "learning_rate": 0.001,
+      "loss": 0.4624,
+      "step": 130
+    },
+    {
+      "epoch": 0.0036145783394323596,
+      "grad_norm": 0.0016817412106320262,
+      "learning_rate": 0.001,
+      "loss": 0.3641,
+      "step": 131
+    },
+    {
+      "epoch": 0.0036421705404967287,
+      "grad_norm": 0.0014624105533584952,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 132
+    },
+    {
+      "epoch": 0.0036697627415610977,
+      "grad_norm": 0.0023928394075483084,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 133
+    },
+    {
+      "epoch": 0.0036973549426254667,
+      "grad_norm": 0.00194596650544554,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 134
+    },
+    {
+      "epoch": 0.0037249471436898362,
+      "grad_norm": 0.0017044798005372286,
+      "learning_rate": 0.001,
+      "loss": 0.3717,
+      "step": 135
+    },
+    {
+      "epoch": 0.0037525393447542053,
+      "grad_norm": 0.0017636306583881378,
+      "learning_rate": 0.001,
+      "loss": 0.4256,
+      "step": 136
+    },
+    {
+      "epoch": 0.0037801315458185743,
+      "grad_norm": 0.0042044175788760185,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 137
+    },
+    {
+      "epoch": 0.0038077237468829433,
+      "grad_norm": 0.0023780064657330513,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 138
+    },
+    {
+      "epoch": 0.003835315947947313,
+      "grad_norm": 0.0018606351222842932,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 139
+    },
+    {
+      "epoch": 0.003862908149011682,
+      "grad_norm": 0.0020658932626247406,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 140
+    },
+    {
+      "epoch": 0.003890500350076051,
+      "grad_norm": 0.0021609310060739517,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 141
+    },
+    {
+      "epoch": 0.00391809255114042,
+      "grad_norm": 0.0029214313253760338,
+      "learning_rate": 0.001,
+      "loss": 0.4448,
+      "step": 142
+    },
+    {
+      "epoch": 0.003945684752204789,
+      "grad_norm": 0.0013527723494917154,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 143
+    },
+    {
+      "epoch": 0.0039732769532691585,
+      "grad_norm": 0.0014684811467304826,
+      "learning_rate": 0.001,
+      "loss": 0.4359,
+      "step": 144
+    },
+    {
+      "epoch": 0.0040008691543335275,
+      "grad_norm": 0.002133656293153763,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 145
+    },
+    {
+      "epoch": 0.0040284613553978965,
+      "grad_norm": 0.0017957837553694844,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 146
+    },
+    {
+      "epoch": 0.004056053556462266,
+      "grad_norm": 0.001586731756106019,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 147
+    },
+    {
+      "epoch": 0.0040836457575266355,
+      "grad_norm": 0.0015585459768772125,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 148
+    },
+    {
+      "epoch": 0.0041112379585910045,
+      "grad_norm": 0.001772695453837514,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 149
+    },
+    {
+      "epoch": 0.004138830159655374,
+      "grad_norm": 0.0032015417236834764,
+      "learning_rate": 0.001,
+      "loss": 0.4229,
+      "step": 150
+    },
+    {
+      "epoch": 0.004166422360719743,
+      "grad_norm": 0.0014684676425531507,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 151
+    },
+    {
+      "epoch": 0.004194014561784112,
+      "grad_norm": 0.0014085081638768315,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 152
+    },
+    {
+      "epoch": 0.004221606762848481,
+      "grad_norm": 0.00355419609695673,
+      "learning_rate": 0.001,
+      "loss": 0.3749,
+      "step": 153
+    },
+    {
+      "epoch": 0.00424919896391285,
+      "grad_norm": 0.0021562143228948116,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 154
+    },
+    {
+      "epoch": 0.004276791164977219,
+      "grad_norm": 0.001616101711988449,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 155
+    },
+    {
+      "epoch": 0.004304383366041589,
+      "grad_norm": 0.0020637568086385727,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 156
+    },
+    {
+      "epoch": 0.004331975567105958,
+      "grad_norm": 0.0013927265536040068,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 157
+    },
+    {
+      "epoch": 0.004359567768170327,
+      "grad_norm": 0.001993082230910659,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 158
+    },
+    {
+      "epoch": 0.004387159969234696,
+      "grad_norm": 0.001766142901033163,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 159
+    },
+    {
+      "epoch": 0.004414752170299065,
+      "grad_norm": 0.0017266202485188842,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 160
+    },
+    {
+      "epoch": 0.004442344371363434,
+      "grad_norm": 0.0017482074908912182,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 161
+    },
+    {
+      "epoch": 0.004469936572427803,
+      "grad_norm": 0.004118494223803282,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 162
+    },
+    {
+      "epoch": 0.004497528773492173,
+      "grad_norm": 0.00243256869725883,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 163
+    },
+    {
+      "epoch": 0.004525120974556542,
+      "grad_norm": 0.001603225595317781,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 164
+    },
+    {
+      "epoch": 0.004552713175620911,
+      "grad_norm": 0.0016920053167268634,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 165
+    },
+    {
+      "epoch": 0.00458030537668528,
+      "grad_norm": 0.0016063055954873562,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 166
+    },
+    {
+      "epoch": 0.004607897577749649,
+      "grad_norm": 0.0018608705140650272,
+      "learning_rate": 0.001,
+      "loss": 0.419,
+      "step": 167
+    },
+    {
+      "epoch": 0.004635489778814018,
+      "grad_norm": 0.001778002129867673,
+      "learning_rate": 0.001,
+      "loss": 0.4311,
+      "step": 168
+    },
+    {
+      "epoch": 0.004663081979878387,
+      "grad_norm": 0.0041995905339717865,
+      "learning_rate": 0.001,
+      "loss": 0.3816,
+      "step": 169
+    },
+    {
+      "epoch": 0.004690674180942756,
+      "grad_norm": 0.0013721170835196972,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 170
+    },
+    {
+      "epoch": 0.004718266382007126,
+      "grad_norm": 0.0018527969950810075,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 171
+    },
+    {
+      "epoch": 0.004745858583071495,
+      "grad_norm": 0.001698823063634336,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 172
+    },
+    {
+      "epoch": 0.004773450784135864,
+      "grad_norm": 0.0015160103794187307,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 173
+    },
+    {
+      "epoch": 0.004801042985200233,
+      "grad_norm": 0.0016305141616612673,
+      "learning_rate": 0.001,
+      "loss": 0.4338,
+      "step": 174
+    },
+    {
+      "epoch": 0.004828635186264602,
+      "grad_norm": 0.0016023332718759775,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 175
+    },
+    {
+      "epoch": 0.004856227387328971,
+      "grad_norm": 0.0021146016661077738,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 176
+    },
+    {
+      "epoch": 0.00488381958839334,
+      "grad_norm": 0.00299852411262691,
+      "learning_rate": 0.001,
+      "loss": 0.4033,
+      "step": 177
+    },
+    {
+      "epoch": 0.004911411789457709,
+      "grad_norm": 0.002649805974215269,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 178
+    },
+    {
+      "epoch": 0.004939003990522079,
+      "grad_norm": 0.002029626164585352,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 179
+    },
+    {
+      "epoch": 0.004966596191586448,
+      "grad_norm": 0.002307730261236429,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 180
+    },
+    {
+      "epoch": 0.004994188392650817,
+      "grad_norm": 0.0019044012296944857,
+      "learning_rate": 0.001,
+      "loss": 0.4298,
+      "step": 181
+    },
+    {
+      "epoch": 0.005021780593715186,
+      "grad_norm": 0.0012402728898450732,
+      "learning_rate": 0.001,
+      "loss": 0.4187,
+      "step": 182
+    },
+    {
+      "epoch": 0.005049372794779555,
+      "grad_norm": 0.0015751667087897658,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 183
+    },
+    {
+      "epoch": 0.0050769649958439245,
+      "grad_norm": 0.0018701856024563313,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 184
+    },
+    {
+      "epoch": 0.0051045571969082935,
+      "grad_norm": 0.002105995314195752,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 185
+    },
+    {
+      "epoch": 0.005132149397972663,
+      "grad_norm": 0.0030122329480946064,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 186
+    },
+    {
+      "epoch": 0.0051597415990370325,
+      "grad_norm": 0.0018410159973427653,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 187
+    },
+    {
+      "epoch": 0.0051873338001014015,
+      "grad_norm": 0.0017063120612874627,
+      "learning_rate": 0.001,
+      "loss": 0.4323,
+      "step": 188
+    },
+    {
+      "epoch": 0.0052149260011657705,
+      "grad_norm": 0.001945548108778894,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 189
+    },
+    {
+      "epoch": 0.00524251820223014,
+      "grad_norm": 0.002262406051158905,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 190
+    },
+    {
+      "epoch": 0.005270110403294509,
+      "grad_norm": 0.003793769981712103,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 191
+    },
+    {
+      "epoch": 0.005297702604358878,
+      "grad_norm": 0.0024173606652766466,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 192
+    },
+    {
+      "epoch": 0.005325294805423247,
+      "grad_norm": 0.0017464763950556517,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 193
+    },
+    {
+      "epoch": 0.005352887006487617,
+      "grad_norm": 0.0034284228459000587,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 194
+    },
+    {
+      "epoch": 0.005380479207551986,
+      "grad_norm": 0.001637522829696536,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 195
+    },
+    {
+      "epoch": 0.005408071408616355,
+      "grad_norm": 0.0019539205823093653,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 196
+    },
+    {
+      "epoch": 0.005435663609680724,
+      "grad_norm": 0.0023392250295728445,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 197
+    },
+    {
+      "epoch": 0.005463255810745093,
+      "grad_norm": 0.005975689273327589,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 198
+    },
+    {
+      "epoch": 0.005490848011809462,
+      "grad_norm": 0.005825843196362257,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 199
+    },
+    {
+      "epoch": 0.005518440212873831,
+      "grad_norm": 0.0023729391396045685,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 200
+    },
+    {
+      "epoch": 0.005546032413938201,
+      "grad_norm": 0.0020657021086663008,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 201
+    },
+    {
+      "epoch": 0.00557362461500257,
+      "grad_norm": 0.0022743509616702795,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 202
+    },
+    {
+      "epoch": 0.005601216816066939,
+      "grad_norm": 0.002227703807875514,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 203
+    },
+    {
+      "epoch": 0.005628809017131308,
+      "grad_norm": 0.0023818998597562313,
+      "learning_rate": 0.001,
+      "loss": 0.454,
+      "step": 204
+    },
+    {
+      "epoch": 0.005656401218195677,
+      "grad_norm": 0.002208840800449252,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 205
+    },
+    {
+      "epoch": 0.005683993419260046,
+      "grad_norm": 0.0025773572269827127,
+      "learning_rate": 0.001,
+      "loss": 0.4447,
+      "step": 206
+    },
+    {
+      "epoch": 0.005711585620324415,
+      "grad_norm": 0.002260175533592701,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 207
+    },
+    {
+      "epoch": 0.005739177821388784,
+      "grad_norm": 0.0023025628179311752,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 208
+    },
+    {
+      "epoch": 0.005766770022453154,
+      "grad_norm": 0.0032320781610906124,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 209
+    },
+    {
+      "epoch": 0.005794362223517523,
+      "grad_norm": 0.0018922177841886878,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 210
+    },
+    {
+      "epoch": 0.005821954424581892,
+      "grad_norm": 0.0019504919182509184,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 211
+    },
+    {
+      "epoch": 0.005849546625646261,
+      "grad_norm": 0.002194691449403763,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 212
+    },
+    {
+      "epoch": 0.00587713882671063,
+      "grad_norm": 0.013725458644330502,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 213
+    },
+    {
+      "epoch": 0.005904731027774999,
+      "grad_norm": 0.0019010631367564201,
+      "learning_rate": 0.001,
+      "loss": 0.4347,
+      "step": 214
+    },
+    {
+      "epoch": 0.005932323228839368,
+      "grad_norm": 0.005194473545998335,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 215
+    },
+    {
+      "epoch": 0.005959915429903738,
+      "grad_norm": 0.002996876835823059,
+      "learning_rate": 0.001,
+      "loss": 0.3763,
+      "step": 216
+    },
+    {
+      "epoch": 0.005987507630968107,
+      "grad_norm": 0.0020920925308018923,
+      "learning_rate": 0.001,
+      "loss": 0.4245,
+      "step": 217
+    },
+    {
+      "epoch": 0.006015099832032476,
+      "grad_norm": 0.0023856102488934994,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 218
+    },
+    {
+      "epoch": 0.006042692033096845,
+      "grad_norm": 0.0022957061883062124,
+      "learning_rate": 0.001,
+      "loss": 0.4424,
+      "step": 219
+    },
+    {
+      "epoch": 0.006070284234161214,
+      "grad_norm": 0.00439048558473587,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 220
+    },
+    {
+      "epoch": 0.006097876435225583,
+      "grad_norm": 0.0023554619401693344,
+      "learning_rate": 0.001,
+      "loss": 0.3743,
+      "step": 221
+    },
+    {
+      "epoch": 0.006125468636289952,
+      "grad_norm": 0.002596774371340871,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 222
+    },
+    {
+      "epoch": 0.006153060837354321,
+      "grad_norm": 0.002902804408222437,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 223
+    },
+    {
+      "epoch": 0.006180653038418691,
+      "grad_norm": 0.002238060114905238,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 224
+    },
+    {
+      "epoch": 0.00620824523948306,
+      "grad_norm": 0.0024735773913562298,
+      "learning_rate": 0.001,
+      "loss": 0.4283,
+      "step": 225
+    },
+    {
+      "epoch": 0.006235837440547429,
+      "grad_norm": 0.0043862126767635345,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 226
+    },
+    {
+      "epoch": 0.0062634296416117985,
+      "grad_norm": 0.002900010673329234,
+      "learning_rate": 0.001,
+      "loss": 0.4133,
+      "step": 227
+    },
+    {
+      "epoch": 0.0062910218426761675,
+      "grad_norm": 0.0025010716635733843,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 228
+    },
+    {
+      "epoch": 0.0063186140437405365,
+      "grad_norm": 0.004444291349500418,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 229
+    },
+    {
+      "epoch": 0.006346206244804906,
+      "grad_norm": 0.0021149932872503996,
+      "learning_rate": 0.001,
+      "loss": 0.4356,
+      "step": 230
+    },
+    {
+      "epoch": 0.006373798445869275,
+      "grad_norm": 0.001915531582199037,
+      "learning_rate": 0.001,
+      "loss": 0.4471,
+      "step": 231
+    },
+    {
+      "epoch": 0.0064013906469336445,
+      "grad_norm": 0.001954560400918126,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 232
+    },
+    {
+      "epoch": 0.006428982847998014,
+      "grad_norm": 0.002395547227934003,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 233
+    },
+    {
+      "epoch": 0.006456575049062383,
+      "grad_norm": 0.002621949650347233,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 234
+    },
+    {
+      "epoch": 0.006484167250126752,
+      "grad_norm": 0.0037250854074954987,
+      "learning_rate": 0.001,
+      "loss": 0.4227,
+      "step": 235
+    },
+    {
+      "epoch": 0.006511759451191121,
+      "grad_norm": 0.002147798193618655,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 236
+    },
+    {
+      "epoch": 0.00653935165225549,
+      "grad_norm": 0.002425707643851638,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 237
+    },
+    {
+      "epoch": 0.006566943853319859,
+      "grad_norm": 0.002085171639919281,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 238
+    },
+    {
+      "epoch": 0.006594536054384229,
+      "grad_norm": 0.0023823371157050133,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 239
+    },
+    {
+      "epoch": 0.006622128255448598,
+      "grad_norm": 0.0020582638680934906,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 240
+    },
+    {
+      "epoch": 0.006649720456512967,
+      "grad_norm": 0.002760798903182149,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 241
+    },
+    {
+      "epoch": 0.006677312657577336,
+      "grad_norm": 0.0030749242287129164,
+      "learning_rate": 0.001,
+      "loss": 0.4379,
+      "step": 242
+    },
+    {
+      "epoch": 0.006704904858641705,
+      "grad_norm": 0.0023086306173354387,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 243
+    },
+    {
+      "epoch": 0.006732497059706074,
+      "grad_norm": 0.0025757497642189264,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 244
+    },
+    {
+      "epoch": 0.006760089260770443,
+      "grad_norm": 0.002158039715141058,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 245
+    },
+    {
+      "epoch": 0.006787681461834812,
+      "grad_norm": 0.0018623712239786983,
+      "learning_rate": 0.001,
+      "loss": 0.3682,
+      "step": 246
+    },
+    {
+      "epoch": 0.006815273662899182,
+      "grad_norm": 0.0022342281881719828,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 247
+    },
+    {
+      "epoch": 0.006842865863963551,
+      "grad_norm": 0.0026764923240989447,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 248
+    },
+    {
+      "epoch": 0.00687045806502792,
+      "grad_norm": 0.0038766246289014816,
+      "learning_rate": 0.001,
+      "loss": 0.4145,
+      "step": 249
+    },
+    {
+      "epoch": 0.006898050266092289,
+      "grad_norm": 0.002432230394333601,
+      "learning_rate": 0.001,
+      "loss": 0.4382,
+      "step": 250
+    },
+    {
+      "epoch": 0.006925642467156658,
+      "grad_norm": 0.003260681638494134,
+      "learning_rate": 0.001,
+      "loss": 0.4376,
+      "step": 251
+    },
+    {
+      "epoch": 0.006953234668221027,
+      "grad_norm": 0.0023729840759187937,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 252
+    },
+    {
+      "epoch": 0.006980826869285396,
+      "grad_norm": 0.0032021389342844486,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 253
+    },
+    {
+      "epoch": 0.007008419070349766,
+      "grad_norm": 0.003267744556069374,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 254
+    },
+    {
+      "epoch": 0.007036011271414135,
+      "grad_norm": 0.0024608762469142675,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 255
+    },
+    {
+      "epoch": 0.007063603472478504,
+      "grad_norm": 0.002270517172291875,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 256
+    },
+    {
+      "epoch": 0.007091195673542873,
+      "grad_norm": 0.002375125652179122,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 257
+    },
+    {
+      "epoch": 0.007118787874607242,
+      "grad_norm": 0.0021832643542438745,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 258
+    },
+    {
+      "epoch": 0.007146380075671611,
+      "grad_norm": 0.0018151308177039027,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 259
+    },
+    {
+      "epoch": 0.00717397227673598,
+      "grad_norm": 0.0018910926301032305,
+      "learning_rate": 0.001,
+      "loss": 0.4327,
+      "step": 260
+    },
+    {
+      "epoch": 0.007201564477800349,
+      "grad_norm": 0.0024862713180482388,
+      "learning_rate": 0.001,
+      "loss": 0.4328,
+      "step": 261
+    },
+    {
+      "epoch": 0.007229156678864719,
+      "grad_norm": 0.0027056324761360884,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 262
+    },
+    {
+      "epoch": 0.007256748879929088,
+      "grad_norm": 0.0018339842790737748,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 263
+    },
+    {
+      "epoch": 0.007284341080993457,
+      "grad_norm": 0.002902393927797675,
+      "learning_rate": 0.001,
+      "loss": 0.4415,
+      "step": 264
+    },
+    {
+      "epoch": 0.007311933282057826,
+      "grad_norm": 0.0022214376367628574,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 265
+    },
+    {
+      "epoch": 0.007339525483122195,
+      "grad_norm": 0.002073576208204031,
+      "learning_rate": 0.001,
+      "loss": 0.4365,
+      "step": 266
+    },
+    {
+      "epoch": 0.0073671176841865645,
+      "grad_norm": 0.002394103677943349,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 267
+    },
+    {
+      "epoch": 0.0073947098852509335,
+      "grad_norm": 0.002571861259639263,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 268
+    },
+    {
+      "epoch": 0.007422302086315303,
+      "grad_norm": 0.0016914806328713894,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 269
+    },
+    {
+      "epoch": 0.0074498942873796725,
+      "grad_norm": 0.003131776349619031,
+      "learning_rate": 0.001,
+      "loss": 0.3807,
+      "step": 270
+    },
+    {
+      "epoch": 0.0074774864884440415,
+      "grad_norm": 0.0046376376412808895,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 271
+    },
+    {
+      "epoch": 0.0075050786895084105,
+      "grad_norm": 0.0030513983219861984,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 272
+    },
+    {
+      "epoch": 0.00753267089057278,
+      "grad_norm": 0.0018306419951841235,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 273
+    },
+    {
+      "epoch": 0.007560263091637149,
+      "grad_norm": 0.002655989723280072,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 274
+    },
+    {
+      "epoch": 0.007587855292701518,
+      "grad_norm": 0.002580752596259117,
+      "learning_rate": 0.001,
+      "loss": 0.4259,
+      "step": 275
+    },
+    {
+      "epoch": 0.007615447493765887,
+      "grad_norm": 0.002781835151836276,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 276
+    },
+    {
+      "epoch": 0.007643039694830257,
+      "grad_norm": 0.0023255913984030485,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 277
+    },
+    {
+      "epoch": 0.007670631895894626,
+      "grad_norm": 0.0026517838705331087,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 278
+    },
+    {
+      "epoch": 0.007698224096958995,
+      "grad_norm": 0.009500747546553612,
+      "learning_rate": 0.001,
+      "loss": 0.4261,
+      "step": 279
+    },
+    {
+      "epoch": 0.007725816298023364,
+      "grad_norm": 0.002862557303160429,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 280
+    },
+    {
+      "epoch": 0.007753408499087733,
+      "grad_norm": 0.0030510432552546263,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 281
+    },
+    {
+      "epoch": 0.007781000700152102,
+      "grad_norm": 0.00323986797593534,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 282
+    },
+    {
+      "epoch": 0.007808592901216471,
+      "grad_norm": 0.0030146867502480745,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 283
+    },
+    {
+      "epoch": 0.00783618510228084,
+      "grad_norm": 0.002789125544950366,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 284
+    },
+    {
+      "epoch": 0.007863777303345209,
+      "grad_norm": 0.0027734541799873114,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 285
+    },
+    {
+      "epoch": 0.007891369504409579,
+      "grad_norm": 0.0021806685253977776,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 286
+    },
+    {
+      "epoch": 0.007918961705473947,
+      "grad_norm": 0.003913281951099634,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 287
+    },
+    {
+      "epoch": 0.007946553906538317,
+      "grad_norm": 0.0021719373762607574,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 288
+    },
+    {
+      "epoch": 0.007974146107602687,
+      "grad_norm": 0.003087391145527363,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 289
+    },
+    {
+      "epoch": 0.008001738308667055,
+      "grad_norm": 0.0025539237540215254,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 290
+    },
+    {
+      "epoch": 0.008029330509731425,
+      "grad_norm": 0.002219514222815633,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 291
+    },
+    {
+      "epoch": 0.008056922710795793,
+      "grad_norm": 0.0028160493820905685,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 292
+    },
+    {
+      "epoch": 0.008084514911860163,
+      "grad_norm": 0.003081710310652852,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 293
+    },
+    {
+      "epoch": 0.008112107112924531,
+      "grad_norm": 0.0024016399402171373,
+      "learning_rate": 0.001,
+      "loss": 0.3715,
+      "step": 294
+    },
+    {
+      "epoch": 0.008139699313988901,
+      "grad_norm": 0.0020687165670096874,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 295
+    },
+    {
+      "epoch": 0.008167291515053271,
+      "grad_norm": 0.002624873537570238,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 296
+    },
+    {
+      "epoch": 0.00819488371611764,
+      "grad_norm": 0.0027543094474822283,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 297
+    },
+    {
+      "epoch": 0.008222475917182009,
+      "grad_norm": 0.005131872370839119,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 298
+    },
+    {
+      "epoch": 0.008250068118246377,
+      "grad_norm": 0.0030383355915546417,
+      "learning_rate": 0.001,
+      "loss": 0.421,
+      "step": 299
+    },
+    {
+      "epoch": 0.008277660319310747,
+      "grad_norm": 0.00250818463973701,
+      "learning_rate": 0.001,
+      "loss": 0.4311,
+      "step": 300
+    },
+    {
+      "epoch": 0.008305252520375115,
+      "grad_norm": 0.006147111766040325,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 301
+    },
+    {
+      "epoch": 0.008332844721439485,
+      "grad_norm": 0.003149918746203184,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 302
+    },
+    {
+      "epoch": 0.008360436922503853,
+      "grad_norm": 0.002058635698631406,
+      "learning_rate": 0.001,
+      "loss": 0.4586,
+      "step": 303
+    },
+    {
+      "epoch": 0.008388029123568223,
+      "grad_norm": 0.002103931736201048,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 304
+    },
+    {
+      "epoch": 0.008415621324632593,
+      "grad_norm": 0.0036677306052297354,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 305
+    },
+    {
+      "epoch": 0.008443213525696961,
+      "grad_norm": 0.002338196150958538,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 306
+    },
+    {
+      "epoch": 0.008470805726761331,
+      "grad_norm": 0.002481523435562849,
+      "learning_rate": 0.001,
+      "loss": 0.4227,
+      "step": 307
+    },
+    {
+      "epoch": 0.0084983979278257,
+      "grad_norm": 0.0023166737519204617,
+      "learning_rate": 0.001,
+      "loss": 0.3723,
+      "step": 308
+    },
+    {
+      "epoch": 0.00852599012889007,
+      "grad_norm": 0.00202556187286973,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 309
+    },
+    {
+      "epoch": 0.008553582329954438,
+      "grad_norm": 0.0018110661767423153,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 310
+    },
+    {
+      "epoch": 0.008581174531018807,
+      "grad_norm": 0.003058563219383359,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 311
+    },
+    {
+      "epoch": 0.008608766732083177,
+      "grad_norm": 0.004681632854044437,
+      "learning_rate": 0.001,
+      "loss": 0.4428,
+      "step": 312
+    },
+    {
+      "epoch": 0.008636358933147546,
+      "grad_norm": 0.0023308703675866127,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 313
+    },
+    {
+      "epoch": 0.008663951134211915,
+      "grad_norm": 0.0029331271070986986,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 314
+    },
+    {
+      "epoch": 0.008691543335276284,
+      "grad_norm": 0.0021851372439414263,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 315
+    },
+    {
+      "epoch": 0.008719135536340654,
+      "grad_norm": 0.0027176933363080025,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 316
+    },
+    {
+      "epoch": 0.008746727737405022,
+      "grad_norm": 0.0023573623038828373,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 317
+    },
+    {
+      "epoch": 0.008774319938469392,
+      "grad_norm": 0.0023507485166192055,
+      "learning_rate": 0.001,
+      "loss": 0.429,
+      "step": 318
+    },
+    {
+      "epoch": 0.008801912139533762,
+      "grad_norm": 0.0026601781137287617,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 319
+    },
+    {
+      "epoch": 0.00882950434059813,
+      "grad_norm": 0.002715731505304575,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 320
+    },
+    {
+      "epoch": 0.0088570965416625,
+      "grad_norm": 0.004710033070296049,
+      "learning_rate": 0.001,
+      "loss": 0.3585,
+      "step": 321
+    },
+    {
+      "epoch": 0.008884688742726868,
+      "grad_norm": 0.0027322748210281134,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 322
+    },
+    {
+      "epoch": 0.008912280943791238,
+      "grad_norm": 0.0024419347755610943,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 323
+    },
+    {
+      "epoch": 0.008939873144855606,
+      "grad_norm": 0.004387510009109974,
+      "learning_rate": 0.001,
+      "loss": 0.3747,
+      "step": 324
+    },
+    {
+      "epoch": 0.008967465345919976,
+      "grad_norm": 0.0033073730301111937,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 325
+    },
+    {
+      "epoch": 0.008995057546984346,
+      "grad_norm": 0.002799189416691661,
+      "learning_rate": 0.001,
+      "loss": 0.3687,
+      "step": 326
+    },
+    {
+      "epoch": 0.009022649748048714,
+      "grad_norm": 0.0031710139010101557,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 327
+    },
+    {
+      "epoch": 0.009050241949113084,
+      "grad_norm": 0.0027909427881240845,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 328
+    },
+    {
+      "epoch": 0.009077834150177452,
+      "grad_norm": 0.0035845122765749693,
+      "learning_rate": 0.001,
+      "loss": 0.4203,
+      "step": 329
+    },
+    {
+      "epoch": 0.009105426351241822,
+      "grad_norm": 0.0034582631196826696,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 330
+    },
+    {
+      "epoch": 0.00913301855230619,
+      "grad_norm": 0.005869538523256779,
+      "learning_rate": 0.001,
+      "loss": 0.3392,
+      "step": 331
+    },
+    {
+      "epoch": 0.00916061075337056,
+      "grad_norm": 0.0037916686851531267,
+      "learning_rate": 0.001,
+      "loss": 0.4509,
+      "step": 332
+    },
+    {
+      "epoch": 0.009188202954434928,
+      "grad_norm": 0.004066781606525183,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 333
+    },
+    {
+      "epoch": 0.009215795155499298,
+      "grad_norm": 0.002667912980541587,
+      "learning_rate": 0.001,
+      "loss": 0.4371,
+      "step": 334
+    },
+    {
+      "epoch": 0.009243387356563668,
+      "grad_norm": 0.0032026427797973156,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 335
+    },
+    {
+      "epoch": 0.009270979557628036,
+      "grad_norm": 0.002195654669776559,
+      "learning_rate": 0.001,
+      "loss": 0.374,
+      "step": 336
+    },
+    {
+      "epoch": 0.009298571758692406,
+      "grad_norm": 0.002170421415939927,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 337
+    },
+    {
+      "epoch": 0.009326163959756774,
+      "grad_norm": 0.002630366710945964,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 338
+    },
+    {
+      "epoch": 0.009353756160821144,
+      "grad_norm": 0.0026457132771611214,
+      "learning_rate": 0.001,
+      "loss": 0.436,
+      "step": 339
+    },
+    {
+      "epoch": 0.009381348361885512,
+      "grad_norm": 0.002360823331400752,
+      "learning_rate": 0.001,
+      "loss": 0.4335,
+      "step": 340
+    },
+    {
+      "epoch": 0.009408940562949882,
+      "grad_norm": 0.0026195445097982883,
+      "learning_rate": 0.001,
+      "loss": 0.4233,
+      "step": 341
+    },
+    {
+      "epoch": 0.009436532764014252,
+      "grad_norm": 0.0018392925849184394,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 342
+    },
+    {
+      "epoch": 0.00946412496507862,
+      "grad_norm": 0.0029209102503955364,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 343
+    },
+    {
+      "epoch": 0.00949171716614299,
+      "grad_norm": 0.00245369179174304,
+      "learning_rate": 0.001,
+      "loss": 0.4349,
+      "step": 344
+    },
+    {
+      "epoch": 0.009519309367207358,
+      "grad_norm": 0.0021382428240031004,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 345
+    },
+    {
+      "epoch": 0.009546901568271728,
+      "grad_norm": 0.0018214443698525429,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 346
+    },
+    {
+      "epoch": 0.009574493769336096,
+      "grad_norm": 0.0025145094841718674,
+      "learning_rate": 0.001,
+      "loss": 0.3583,
+      "step": 347
+    },
+    {
+      "epoch": 0.009602085970400466,
+      "grad_norm": 0.002913502510637045,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 348
+    },
+    {
+      "epoch": 0.009629678171464836,
+      "grad_norm": 0.002811063313856721,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 349
+    },
+    {
+      "epoch": 0.009657270372529204,
+      "grad_norm": 0.0031305616721510887,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 350
+    },
+    {
+      "epoch": 0.009684862573593574,
+      "grad_norm": 0.002753301290795207,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 351
+    },
+    {
+      "epoch": 0.009712454774657943,
+      "grad_norm": 0.002502492628991604,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 352
+    },
+    {
+      "epoch": 0.009740046975722312,
+      "grad_norm": 0.0029816054739058018,
+      "learning_rate": 0.001,
+      "loss": 0.4103,
+      "step": 353
+    },
+    {
+      "epoch": 0.00976763917678668,
+      "grad_norm": 0.00210220436565578,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 354
+    },
+    {
+      "epoch": 0.00979523137785105,
+      "grad_norm": 0.002407314023002982,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 355
+    },
+    {
+      "epoch": 0.009822823578915419,
+      "grad_norm": 0.0025853144470602274,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 356
+    },
+    {
+      "epoch": 0.009850415779979789,
+      "grad_norm": 0.0020144209265708923,
+      "learning_rate": 0.001,
+      "loss": 0.4465,
+      "step": 357
+    },
+    {
+      "epoch": 0.009878007981044159,
+      "grad_norm": 0.0026473626494407654,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 358
+    },
+    {
+      "epoch": 0.009905600182108527,
+      "grad_norm": 0.005975659471005201,
+      "learning_rate": 0.001,
+      "loss": 0.4328,
+      "step": 359
+    },
+    {
+      "epoch": 0.009933192383172897,
+      "grad_norm": 0.0024808787275105715,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 360
+    },
+    {
+      "epoch": 0.009960784584237265,
+      "grad_norm": 0.011580473743379116,
+      "learning_rate": 0.001,
+      "loss": 0.4419,
+      "step": 361
+    },
+    {
+      "epoch": 0.009988376785301635,
+      "grad_norm": 0.0025010586250573397,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 362
+    },
+    {
+      "epoch": 0.010015968986366003,
+      "grad_norm": 0.0029951941687613726,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 363
+    },
+    {
+      "epoch": 0.010043561187430373,
+      "grad_norm": 0.0038613236974924803,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 364
+    },
+    {
+      "epoch": 0.010071153388494743,
+      "grad_norm": 0.00325029413215816,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 365
+    },
+    {
+      "epoch": 0.01009874558955911,
+      "grad_norm": 0.002298851264640689,
+      "learning_rate": 0.001,
+      "loss": 0.4364,
+      "step": 366
+    },
+    {
+      "epoch": 0.01012633779062348,
+      "grad_norm": 0.003252133959904313,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 367
+    },
+    {
+      "epoch": 0.010153929991687849,
+      "grad_norm": 0.0026516057550907135,
+      "learning_rate": 0.001,
+      "loss": 0.4368,
+      "step": 368
+    },
+    {
+      "epoch": 0.010181522192752219,
+      "grad_norm": 0.0033587999641895294,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 369
+    },
+    {
+      "epoch": 0.010209114393816587,
+      "grad_norm": 0.0030985455960035324,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 370
+    },
+    {
+      "epoch": 0.010236706594880957,
+      "grad_norm": 0.0028507187962532043,
+      "learning_rate": 0.001,
+      "loss": 0.4237,
+      "step": 371
+    },
+    {
+      "epoch": 0.010264298795945327,
+      "grad_norm": 0.0024174090940505266,
+      "learning_rate": 0.001,
+      "loss": 0.4607,
+      "step": 372
+    },
+    {
+      "epoch": 0.010291890997009695,
+      "grad_norm": 0.003416180843487382,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 373
+    },
+    {
+      "epoch": 0.010319483198074065,
+      "grad_norm": 0.0031029239762574434,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 374
+    },
+    {
+      "epoch": 0.010347075399138433,
+      "grad_norm": 0.0023850388824939728,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 375
+    },
+    {
+      "epoch": 0.010374667600202803,
+      "grad_norm": 0.0026809065602719784,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 376
+    },
+    {
+      "epoch": 0.010402259801267171,
+      "grad_norm": 0.004038350190967321,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 377
+    },
+    {
+      "epoch": 0.010429852002331541,
+      "grad_norm": 0.0030194921419024467,
+      "learning_rate": 0.001,
+      "loss": 0.4525,
+      "step": 378
+    },
+    {
+      "epoch": 0.010457444203395911,
+      "grad_norm": 0.0023160662967711687,
+      "learning_rate": 0.001,
+      "loss": 0.4256,
+      "step": 379
+    },
+    {
+      "epoch": 0.01048503640446028,
+      "grad_norm": 0.0031149794813245535,
+      "learning_rate": 0.001,
+      "loss": 0.4334,
+      "step": 380
+    },
+    {
+      "epoch": 0.010512628605524649,
+      "grad_norm": 0.0025531111750751734,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 381
+    },
+    {
+      "epoch": 0.010540220806589017,
+      "grad_norm": 0.003499183803796768,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 382
+    },
+    {
+      "epoch": 0.010567813007653387,
+      "grad_norm": 0.004215524531900883,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 383
+    },
+    {
+      "epoch": 0.010595405208717755,
+      "grad_norm": 0.0024398749228566885,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 384
+    },
+    {
+      "epoch": 0.010622997409782125,
+      "grad_norm": 0.003436741651967168,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 385
+    },
+    {
+      "epoch": 0.010650589610846493,
+      "grad_norm": 0.0023827480617910624,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 386
+    },
+    {
+      "epoch": 0.010678181811910863,
+      "grad_norm": 0.0032302262261509895,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 387
+    },
+    {
+      "epoch": 0.010705774012975233,
+      "grad_norm": 0.004642815329134464,
+      "learning_rate": 0.001,
+      "loss": 0.341,
+      "step": 388
+    },
+    {
+      "epoch": 0.010733366214039601,
+      "grad_norm": 0.002242898801341653,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 389
+    },
+    {
+      "epoch": 0.010760958415103971,
+      "grad_norm": 0.002073424868285656,
+      "learning_rate": 0.001,
+      "loss": 0.4344,
+      "step": 390
+    },
+    {
+      "epoch": 0.01078855061616834,
+      "grad_norm": 0.0029485910199582577,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 391
+    },
+    {
+      "epoch": 0.01081614281723271,
+      "grad_norm": 0.003694765968248248,
+      "learning_rate": 0.001,
+      "loss": 0.4351,
+      "step": 392
+    },
+    {
+      "epoch": 0.010843735018297078,
+      "grad_norm": 0.004921768791973591,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 393
+    },
+    {
+      "epoch": 0.010871327219361447,
+      "grad_norm": 0.0033451106864959,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 394
+    },
+    {
+      "epoch": 0.010898919420425817,
+      "grad_norm": 0.003096395405009389,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 395
+    },
+    {
+      "epoch": 0.010926511621490186,
+      "grad_norm": 0.002606458030641079,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 396
+    },
+    {
+      "epoch": 0.010954103822554555,
+      "grad_norm": 0.002552368212491274,
+      "learning_rate": 0.001,
+      "loss": 0.436,
+      "step": 397
+    },
+    {
+      "epoch": 0.010981696023618924,
+      "grad_norm": 0.002896772464737296,
+      "learning_rate": 0.001,
+      "loss": 0.3605,
+      "step": 398
+    },
+    {
+      "epoch": 0.011009288224683294,
+      "grad_norm": 0.004260985646396875,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 399
+    },
+    {
+      "epoch": 0.011036880425747662,
+      "grad_norm": 0.0023302636109292507,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 400
+    },
+    {
+      "epoch": 0.011064472626812032,
+      "grad_norm": 0.0028871933463960886,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 401
+    },
+    {
+      "epoch": 0.011092064827876402,
+      "grad_norm": 0.003007282270118594,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 402
+    },
+    {
+      "epoch": 0.01111965702894077,
+      "grad_norm": 0.0023298682644963264,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 403
+    },
+    {
+      "epoch": 0.01114724923000514,
+      "grad_norm": 0.0032247831113636494,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 404
+    },
+    {
+      "epoch": 0.011174841431069508,
+      "grad_norm": 0.002755606546998024,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 405
+    },
+    {
+      "epoch": 0.011202433632133878,
+      "grad_norm": 0.0030489088967442513,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 406
+    },
+    {
+      "epoch": 0.011230025833198246,
+      "grad_norm": 0.0037001632153987885,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 407
+    },
+    {
+      "epoch": 0.011257618034262616,
+      "grad_norm": 0.0033608553931117058,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 408
+    },
+    {
+      "epoch": 0.011285210235326984,
+      "grad_norm": 0.00226064445450902,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 409
+    },
+    {
+      "epoch": 0.011312802436391354,
+      "grad_norm": 0.0037739353720098734,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 410
+    },
+    {
+      "epoch": 0.011340394637455724,
+      "grad_norm": 0.0032893354073166847,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 411
+    },
+    {
+      "epoch": 0.011367986838520092,
+      "grad_norm": 0.0036774203181266785,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 412
+    },
+    {
+      "epoch": 0.011395579039584462,
+      "grad_norm": 0.0031027563381940126,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 413
+    },
+    {
+      "epoch": 0.01142317124064883,
+      "grad_norm": 0.006735049653798342,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 414
+    },
+    {
+      "epoch": 0.0114507634417132,
+      "grad_norm": 0.0036674696020781994,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 415
+    },
+    {
+      "epoch": 0.011478355642777568,
+      "grad_norm": 0.0060126762837171555,
+      "learning_rate": 0.001,
+      "loss": 0.4291,
+      "step": 416
+    },
+    {
+      "epoch": 0.011505947843841938,
+      "grad_norm": 0.005611831322312355,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 417
+    },
+    {
+      "epoch": 0.011533540044906308,
+      "grad_norm": 0.004347233567386866,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 418
+    },
+    {
+      "epoch": 0.011561132245970676,
+      "grad_norm": 0.0035771261900663376,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 419
+    },
+    {
+      "epoch": 0.011588724447035046,
+      "grad_norm": 0.002888308372348547,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 420
+    },
+    {
+      "epoch": 0.011616316648099414,
+      "grad_norm": 0.006516223307698965,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 421
+    },
+    {
+      "epoch": 0.011643908849163784,
+      "grad_norm": 0.004090446978807449,
+      "learning_rate": 0.001,
+      "loss": 0.3573,
+      "step": 422
+    },
+    {
+      "epoch": 0.011671501050228152,
+      "grad_norm": 0.002728004939854145,
+      "learning_rate": 0.001,
+      "loss": 0.4364,
+      "step": 423
+    },
+    {
+      "epoch": 0.011699093251292522,
+      "grad_norm": 0.002871421165764332,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 424
+    },
+    {
+      "epoch": 0.011726685452356892,
+      "grad_norm": 0.0021404619328677654,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 425
+    },
+    {
+      "epoch": 0.01175427765342126,
+      "grad_norm": 0.0027238999027758837,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 426
+    },
+    {
+      "epoch": 0.01178186985448563,
+      "grad_norm": 0.0025113308802247047,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 427
+    },
+    {
+      "epoch": 0.011809462055549998,
+      "grad_norm": 0.0029517943039536476,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 428
+    },
+    {
+      "epoch": 0.011837054256614368,
+      "grad_norm": 0.0023259243462234735,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 429
+    },
+    {
+      "epoch": 0.011864646457678736,
+      "grad_norm": 0.002625027671456337,
+      "learning_rate": 0.001,
+      "loss": 0.3656,
+      "step": 430
+    },
+    {
+      "epoch": 0.011892238658743106,
+      "grad_norm": 0.0032775115687400103,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 431
+    },
+    {
+      "epoch": 0.011919830859807476,
+      "grad_norm": 0.002229101490229368,
+      "learning_rate": 0.001,
+      "loss": 0.4495,
+      "step": 432
+    },
+    {
+      "epoch": 0.011947423060871844,
+      "grad_norm": 0.0024553320836275816,
+      "learning_rate": 0.001,
+      "loss": 0.4261,
+      "step": 433
+    },
+    {
+      "epoch": 0.011975015261936214,
+      "grad_norm": 0.0025780866853892803,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 434
+    },
+    {
+      "epoch": 0.012002607463000583,
+      "grad_norm": 0.003502671839669347,
+      "learning_rate": 0.001,
+      "loss": 0.3563,
+      "step": 435
+    },
+    {
+      "epoch": 0.012030199664064952,
+      "grad_norm": 0.004440659191459417,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 436
+    },
+    {
+      "epoch": 0.01205779186512932,
+      "grad_norm": 0.0051866755820810795,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 437
+    },
+    {
+      "epoch": 0.01208538406619369,
+      "grad_norm": 0.002548003103584051,
+      "learning_rate": 0.001,
+      "loss": 0.4387,
+      "step": 438
+    },
+    {
+      "epoch": 0.012112976267258059,
+      "grad_norm": 0.0030665360391139984,
+      "learning_rate": 0.001,
+      "loss": 0.4336,
+      "step": 439
+    },
+    {
+      "epoch": 0.012140568468322429,
+      "grad_norm": 0.0032852613367140293,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 440
+    },
+    {
+      "epoch": 0.012168160669386799,
+      "grad_norm": 0.0030793712940067053,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 441
+    },
+    {
+      "epoch": 0.012195752870451167,
+      "grad_norm": 0.0028317072428762913,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 442
+    },
+    {
+      "epoch": 0.012223345071515537,
+      "grad_norm": 0.0029394179582595825,
+      "learning_rate": 0.001,
+      "loss": 0.373,
+      "step": 443
+    },
+    {
+      "epoch": 0.012250937272579905,
+      "grad_norm": 0.0032694858964532614,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 444
+    },
+    {
+      "epoch": 0.012278529473644275,
+      "grad_norm": 0.002599178347736597,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 445
+    },
+    {
+      "epoch": 0.012306121674708643,
+      "grad_norm": 0.0053001102060079575,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 446
+    },
+    {
+      "epoch": 0.012333713875773013,
+      "grad_norm": 0.0027281581424176693,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 447
+    },
+    {
+      "epoch": 0.012361306076837383,
+      "grad_norm": 0.0040309545584023,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 448
+    },
+    {
+      "epoch": 0.01238889827790175,
+      "grad_norm": 0.0024766335263848305,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 449
+    },
+    {
+      "epoch": 0.01241649047896612,
+      "grad_norm": 0.0033247419632971287,
+      "learning_rate": 0.001,
+      "loss": 0.4369,
+      "step": 450
+    },
+    {
+      "epoch": 0.012444082680030489,
+      "grad_norm": 0.0030413721688091755,
+      "learning_rate": 0.001,
+      "loss": 0.4193,
+      "step": 451
+    },
+    {
+      "epoch": 0.012471674881094859,
+      "grad_norm": 0.0068315728567540646,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 452
+    },
+    {
+      "epoch": 0.012499267082159227,
+      "grad_norm": 0.0032153630163520575,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 453
+    },
+    {
+      "epoch": 0.012526859283223597,
+      "grad_norm": 0.003297319170087576,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 454
+    },
+    {
+      "epoch": 0.012554451484287967,
+      "grad_norm": 0.003063608892261982,
+      "learning_rate": 0.001,
+      "loss": 0.44,
+      "step": 455
+    },
+    {
+      "epoch": 0.012582043685352335,
+      "grad_norm": 0.0030015911906957626,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 456
+    },
+    {
+      "epoch": 0.012609635886416705,
+      "grad_norm": 0.0032717676367610693,
+      "learning_rate": 0.001,
+      "loss": 0.3561,
+      "step": 457
+    },
+    {
+      "epoch": 0.012637228087481073,
+      "grad_norm": 0.0031338001135736704,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 458
+    },
+    {
+      "epoch": 0.012664820288545443,
+      "grad_norm": 0.0055600800551474094,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 459
+    },
+    {
+      "epoch": 0.012692412489609811,
+      "grad_norm": 0.009609074331820011,
+      "learning_rate": 0.001,
+      "loss": 0.3647,
+      "step": 460
+    },
+    {
+      "epoch": 0.012720004690674181,
+      "grad_norm": 0.0031838284339755774,
+      "learning_rate": 0.001,
+      "loss": 0.4308,
+      "step": 461
+    },
+    {
+      "epoch": 0.01274759689173855,
+      "grad_norm": 0.002876470098271966,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 462
+    },
+    {
+      "epoch": 0.01277518909280292,
+      "grad_norm": 0.0027010440826416016,
+      "learning_rate": 0.001,
+      "loss": 0.4392,
+      "step": 463
+    },
+    {
+      "epoch": 0.012802781293867289,
+      "grad_norm": 0.0024778309743851423,
+      "learning_rate": 0.001,
+      "loss": 0.446,
+      "step": 464
+    },
+    {
+      "epoch": 0.012830373494931657,
+      "grad_norm": 0.0021515442058444023,
+      "learning_rate": 0.001,
+      "loss": 0.4114,
+      "step": 465
+    },
+    {
+      "epoch": 0.012857965695996027,
+      "grad_norm": 0.0023079351522028446,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 466
+    },
+    {
+      "epoch": 0.012885557897060395,
+      "grad_norm": 0.002527826000005007,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 467
+    },
+    {
+      "epoch": 0.012913150098124765,
+      "grad_norm": 0.003819882869720459,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 468
+    },
+    {
+      "epoch": 0.012940742299189133,
+      "grad_norm": 0.002980564022436738,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 469
+    },
+    {
+      "epoch": 0.012968334500253503,
+      "grad_norm": 0.004125640727579594,
+      "learning_rate": 0.001,
+      "loss": 0.4114,
+      "step": 470
+    },
+    {
+      "epoch": 0.012995926701317873,
+      "grad_norm": 0.0025943296495825052,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 471
+    },
+    {
+      "epoch": 0.013023518902382241,
+      "grad_norm": 0.002629917813464999,
+      "learning_rate": 0.001,
+      "loss": 0.366,
+      "step": 472
+    },
+    {
+      "epoch": 0.013051111103446611,
+      "grad_norm": 0.0021362872794270515,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 473
+    },
+    {
+      "epoch": 0.01307870330451098,
+      "grad_norm": 0.002968277782201767,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 474
+    },
+    {
+      "epoch": 0.01310629550557535,
+      "grad_norm": 0.002851092955097556,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 475
+    },
+    {
+      "epoch": 0.013133887706639718,
+      "grad_norm": 0.0023420127108693123,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 476
+    },
+    {
+      "epoch": 0.013161479907704087,
+      "grad_norm": 0.003359900787472725,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 477
+    },
+    {
+      "epoch": 0.013189072108768457,
+      "grad_norm": 0.0038361374754458666,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 478
+    },
+    {
+      "epoch": 0.013216664309832826,
+      "grad_norm": 0.003093348117545247,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 479
+    },
+    {
+      "epoch": 0.013244256510897195,
+      "grad_norm": 0.003098709974437952,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 480
+    },
+    {
+      "epoch": 0.013271848711961564,
+      "grad_norm": 0.002880271291360259,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 481
+    },
+    {
+      "epoch": 0.013299440913025934,
+      "grad_norm": 0.005491374060511589,
+      "learning_rate": 0.001,
+      "loss": 0.3652,
+      "step": 482
+    },
+    {
+      "epoch": 0.013327033114090302,
+      "grad_norm": 0.0029838404152542353,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 483
+    },
+    {
+      "epoch": 0.013354625315154672,
+      "grad_norm": 0.00296381744556129,
+      "learning_rate": 0.001,
+      "loss": 0.4335,
+      "step": 484
+    },
+    {
+      "epoch": 0.013382217516219042,
+      "grad_norm": 0.0037699625827372074,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 485
+    },
+    {
+      "epoch": 0.01340980971728341,
+      "grad_norm": 0.0022699085529893637,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 486
+    },
+    {
+      "epoch": 0.01343740191834778,
+      "grad_norm": 0.003976910375058651,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 487
+    },
+    {
+      "epoch": 0.013464994119412148,
+      "grad_norm": 0.003803370287641883,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 488
+    },
+    {
+      "epoch": 0.013492586320476518,
+      "grad_norm": 0.0036107334308326244,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 489
+    },
+    {
+      "epoch": 0.013520178521540886,
+      "grad_norm": 0.0034353930968791246,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 490
+    },
+    {
+      "epoch": 0.013547770722605256,
+      "grad_norm": 0.006415998097509146,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 491
+    },
+    {
+      "epoch": 0.013575362923669624,
+      "grad_norm": 0.0031678853556513786,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 492
+    },
+    {
+      "epoch": 0.013602955124733994,
+      "grad_norm": 0.003964356612414122,
+      "learning_rate": 0.001,
+      "loss": 0.4311,
+      "step": 493
+    },
+    {
+      "epoch": 0.013630547325798364,
+      "grad_norm": 0.004771376959979534,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 494
+    },
+    {
+      "epoch": 0.013658139526862732,
+      "grad_norm": 0.0035825970117002726,
+      "learning_rate": 0.001,
+      "loss": 0.3672,
+      "step": 495
+    },
+    {
+      "epoch": 0.013685731727927102,
+      "grad_norm": 0.0036343352403491735,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 496
+    },
+    {
+      "epoch": 0.01371332392899147,
+      "grad_norm": 0.002659760881215334,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 497
+    },
+    {
+      "epoch": 0.01374091613005584,
+      "grad_norm": 0.003316852729767561,
+      "learning_rate": 0.001,
+      "loss": 0.4527,
+      "step": 498
+    },
+    {
+      "epoch": 0.013768508331120208,
+      "grad_norm": 0.0034902123734354973,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 499
+    },
+    {
+      "epoch": 0.013796100532184578,
+      "grad_norm": 0.002378157339990139,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 500
+    },
+    {
+      "epoch": 0.013796100532184578,
+      "eval_runtime": 25.0496,
+      "eval_samples_per_second": 1.277,
+      "eval_steps_per_second": 0.16,
+      "step": 500
+    },
+    {
+      "epoch": 0.013823692733248948,
+      "grad_norm": 0.005439688451588154,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 501
+    },
+    {
+      "epoch": 0.013851284934313316,
+      "grad_norm": 0.0026094792410731316,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 502
+    },
+    {
+      "epoch": 0.013878877135377686,
+      "grad_norm": 0.002377827186137438,
+      "learning_rate": 0.001,
+      "loss": 0.4233,
+      "step": 503
+    },
+    {
+      "epoch": 0.013906469336442054,
+      "grad_norm": 0.004557560198009014,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 504
+    },
+    {
+      "epoch": 0.013934061537506424,
+      "grad_norm": 0.0030535697005689144,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 505
+    },
+    {
+      "epoch": 0.013961653738570792,
+      "grad_norm": 0.0032748053781688213,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 506
+    },
+    {
+      "epoch": 0.013989245939635162,
+      "grad_norm": 0.003947910387068987,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 507
+    },
+    {
+      "epoch": 0.014016838140699532,
+      "grad_norm": 0.0033470303751528263,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 508
+    },
+    {
+      "epoch": 0.0140444303417639,
+      "grad_norm": 0.011195505037903786,
+      "learning_rate": 0.001,
+      "loss": 0.3807,
+      "step": 509
+    },
+    {
+      "epoch": 0.01407202254282827,
+      "grad_norm": 0.0028728062752634287,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 510
+    },
+    {
+      "epoch": 0.014099614743892638,
+      "grad_norm": 0.0035205583553761244,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 511
+    },
+    {
+      "epoch": 0.014127206944957008,
+      "grad_norm": 0.003692334285005927,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 512
+    },
+    {
+      "epoch": 0.014154799146021376,
+      "grad_norm": 0.004000712651759386,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 513
+    },
+    {
+      "epoch": 0.014182391347085746,
+      "grad_norm": 0.0022262849379330873,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 514
+    },
+    {
+      "epoch": 0.014209983548150115,
+      "grad_norm": 0.005194266326725483,
+      "learning_rate": 0.001,
+      "loss": 0.3673,
+      "step": 515
+    },
+    {
+      "epoch": 0.014237575749214484,
+      "grad_norm": 0.0022841247264295816,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 516
+    },
+    {
+      "epoch": 0.014265167950278854,
+      "grad_norm": 0.003351402934640646,
+      "learning_rate": 0.001,
+      "loss": 0.4278,
+      "step": 517
+    },
+    {
+      "epoch": 0.014292760151343223,
+      "grad_norm": 0.0031243113335222006,
+      "learning_rate": 0.001,
+      "loss": 0.3712,
+      "step": 518
+    },
+    {
+      "epoch": 0.014320352352407592,
+      "grad_norm": 0.003822315251454711,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 519
+    },
+    {
+      "epoch": 0.01434794455347196,
+      "grad_norm": 0.0023426790721714497,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 520
+    },
+    {
+      "epoch": 0.01437553675453633,
+      "grad_norm": 0.0027026657480746508,
+      "learning_rate": 0.001,
+      "loss": 0.373,
+      "step": 521
+    },
+    {
+      "epoch": 0.014403128955600699,
+      "grad_norm": 0.0023081700783222914,
+      "learning_rate": 0.001,
+      "loss": 0.4221,
+      "step": 522
+    },
+    {
+      "epoch": 0.014430721156665069,
+      "grad_norm": 0.003079401096329093,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 523
+    },
+    {
+      "epoch": 0.014458313357729439,
+      "grad_norm": 0.0027522866148501635,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 524
+    },
+    {
+      "epoch": 0.014485905558793807,
+      "grad_norm": 0.002710319822654128,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 525
+    },
+    {
+      "epoch": 0.014513497759858177,
+      "grad_norm": 0.002838405082002282,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 526
+    },
+    {
+      "epoch": 0.014541089960922545,
+      "grad_norm": 0.0028795620892196894,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 527
+    },
+    {
+      "epoch": 0.014568682161986915,
+      "grad_norm": 0.002826494164764881,
+      "learning_rate": 0.001,
+      "loss": 0.4381,
+      "step": 528
+    },
+    {
+      "epoch": 0.014596274363051283,
+      "grad_norm": 0.002902933629229665,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 529
+    },
+    {
+      "epoch": 0.014623866564115653,
+      "grad_norm": 0.0034344778396189213,
+      "learning_rate": 0.001,
+      "loss": 0.4149,
+      "step": 530
+    },
+    {
+      "epoch": 0.014651458765180023,
+      "grad_norm": 0.002652675611898303,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 531
+    },
+    {
+      "epoch": 0.01467905096624439,
+      "grad_norm": 0.00368179427459836,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 532
+    },
+    {
+      "epoch": 0.01470664316730876,
+      "grad_norm": 0.0028401080053299665,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 533
+    },
+    {
+      "epoch": 0.014734235368373129,
+      "grad_norm": 0.0047442615032196045,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 534
+    },
+    {
+      "epoch": 0.014761827569437499,
+      "grad_norm": 0.0031780744902789593,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 535
+    },
+    {
+      "epoch": 0.014789419770501867,
+      "grad_norm": 0.004496684763580561,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 536
+    },
+    {
+      "epoch": 0.014817011971566237,
+      "grad_norm": 0.0026886628475040197,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 537
+    },
+    {
+      "epoch": 0.014844604172630607,
+      "grad_norm": 0.004109974484890699,
+      "learning_rate": 0.001,
+      "loss": 0.4547,
+      "step": 538
+    },
+    {
+      "epoch": 0.014872196373694975,
+      "grad_norm": 0.004985136911273003,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 539
+    },
+    {
+      "epoch": 0.014899788574759345,
+      "grad_norm": 0.00309336488135159,
+      "learning_rate": 0.001,
+      "loss": 0.4374,
+      "step": 540
+    },
+    {
+      "epoch": 0.014927380775823713,
+      "grad_norm": 0.0031439203303307295,
+      "learning_rate": 0.001,
+      "loss": 0.4415,
+      "step": 541
+    },
+    {
+      "epoch": 0.014954972976888083,
+      "grad_norm": 0.0027853951323777437,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 542
+    },
+    {
+      "epoch": 0.014982565177952451,
+      "grad_norm": 0.0026657688431441784,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 543
+    },
+    {
+      "epoch": 0.015010157379016821,
+      "grad_norm": 0.01247773040086031,
+      "learning_rate": 0.001,
+      "loss": 0.3682,
+      "step": 544
+    },
+    {
+      "epoch": 0.01503774958008119,
+      "grad_norm": 0.002995160175487399,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 545
+    },
+    {
+      "epoch": 0.01506534178114556,
+      "grad_norm": 0.002868333365768194,
+      "learning_rate": 0.001,
+      "loss": 0.435,
+      "step": 546
+    },
+    {
+      "epoch": 0.015092933982209929,
+      "grad_norm": 0.004190264735370874,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 547
+    },
+    {
+      "epoch": 0.015120526183274297,
+      "grad_norm": 0.003989442717283964,
+      "learning_rate": 0.001,
+      "loss": 0.4276,
+      "step": 548
+    },
+    {
+      "epoch": 0.015148118384338667,
+      "grad_norm": 0.0031920846085995436,
+      "learning_rate": 0.001,
+      "loss": 0.428,
+      "step": 549
+    },
+    {
+      "epoch": 0.015175710585403035,
+      "grad_norm": 0.0037179081700742245,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 550
+    },
+    {
+      "epoch": 0.015203302786467405,
+      "grad_norm": 0.003381606424227357,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 551
+    },
+    {
+      "epoch": 0.015230894987531773,
+      "grad_norm": 0.0029906374402344227,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 552
+    },
+    {
+      "epoch": 0.015258487188596143,
+      "grad_norm": 0.0036886725574731827,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 553
+    },
+    {
+      "epoch": 0.015286079389660513,
+      "grad_norm": 0.0037765917368233204,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 554
+    },
+    {
+      "epoch": 0.015313671590724881,
+      "grad_norm": 0.00531379971653223,
+      "learning_rate": 0.001,
+      "loss": 0.4103,
+      "step": 555
+    },
+    {
+      "epoch": 0.015341263791789251,
+      "grad_norm": 0.003965814132243395,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 556
+    },
+    {
+      "epoch": 0.01536885599285362,
+      "grad_norm": 0.0031218293588608503,
+      "learning_rate": 0.001,
+      "loss": 0.4285,
+      "step": 557
+    },
+    {
+      "epoch": 0.01539644819391799,
+      "grad_norm": 0.0027389719616621733,
+      "learning_rate": 0.001,
+      "loss": 0.356,
+      "step": 558
+    },
+    {
+      "epoch": 0.015424040394982358,
+      "grad_norm": 0.003185281064361334,
+      "learning_rate": 0.001,
+      "loss": 0.4471,
+      "step": 559
+    },
+    {
+      "epoch": 0.015451632596046727,
+      "grad_norm": 0.003925340250134468,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 560
+    },
+    {
+      "epoch": 0.015479224797111097,
+      "grad_norm": 0.0033257382456213236,
+      "learning_rate": 0.001,
+      "loss": 0.3707,
+      "step": 561
+    },
+    {
+      "epoch": 0.015506816998175466,
+      "grad_norm": 0.0033314244356006384,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 562
+    },
+    {
+      "epoch": 0.015534409199239835,
+      "grad_norm": 0.0034566351678222418,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 563
+    },
+    {
+      "epoch": 0.015562001400304204,
+      "grad_norm": 0.0038912983145564795,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 564
+    },
+    {
+      "epoch": 0.015589593601368574,
+      "grad_norm": 0.004165092017501593,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 565
+    },
+    {
+      "epoch": 0.015617185802432942,
+      "grad_norm": 0.004660595208406448,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 566
+    },
+    {
+      "epoch": 0.01564477800349731,
+      "grad_norm": 0.0033354307524859905,
+      "learning_rate": 0.001,
+      "loss": 0.372,
+      "step": 567
+    },
+    {
+      "epoch": 0.01567237020456168,
+      "grad_norm": 0.002619788981974125,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 568
+    },
+    {
+      "epoch": 0.01569996240562605,
+      "grad_norm": 0.004775674548000097,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 569
+    },
+    {
+      "epoch": 0.015727554606690418,
+      "grad_norm": 0.0030274325981736183,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 570
+    },
+    {
+      "epoch": 0.015755146807754788,
+      "grad_norm": 0.0031505031511187553,
+      "learning_rate": 0.001,
+      "loss": 0.4031,
+      "step": 571
+    },
+    {
+      "epoch": 0.015782739008819158,
+      "grad_norm": 0.005169673822820187,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 572
+    },
+    {
+      "epoch": 0.015810331209883528,
+      "grad_norm": 0.002902763895690441,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 573
+    },
+    {
+      "epoch": 0.015837923410947894,
+      "grad_norm": 0.0037871890235692263,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 574
+    },
+    {
+      "epoch": 0.015865515612012264,
+      "grad_norm": 0.004014831036329269,
+      "learning_rate": 0.001,
+      "loss": 0.4278,
+      "step": 575
+    },
+    {
+      "epoch": 0.015893107813076634,
+      "grad_norm": 0.004346000496298075,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 576
+    },
+    {
+      "epoch": 0.015920700014141004,
+      "grad_norm": 0.0033372112084180117,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 577
+    },
+    {
+      "epoch": 0.015948292215205374,
+      "grad_norm": 0.0031993126031011343,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 578
+    },
+    {
+      "epoch": 0.01597588441626974,
+      "grad_norm": 0.0026338782627135515,
+      "learning_rate": 0.001,
+      "loss": 0.422,
+      "step": 579
+    },
+    {
+      "epoch": 0.01600347661733411,
+      "grad_norm": 0.003543695667758584,
+      "learning_rate": 0.001,
+      "loss": 0.385,
+      "step": 580
+    },
+    {
+      "epoch": 0.01603106881839848,
+      "grad_norm": 0.003536064876243472,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 581
+    },
+    {
+      "epoch": 0.01605866101946285,
+      "grad_norm": 0.0031016895081847906,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 582
+    },
+    {
+      "epoch": 0.016086253220527216,
+      "grad_norm": 0.004082076251506805,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 583
+    },
+    {
+      "epoch": 0.016113845421591586,
+      "grad_norm": 0.003274913877248764,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 584
+    },
+    {
+      "epoch": 0.016141437622655956,
+      "grad_norm": 0.003890890395268798,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 585
+    },
+    {
+      "epoch": 0.016169029823720326,
+      "grad_norm": 0.0035965372808277607,
+      "learning_rate": 0.001,
+      "loss": 0.3856,
+      "step": 586
+    },
+    {
+      "epoch": 0.016196622024784696,
+      "grad_norm": 0.02042904868721962,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 587
+    },
+    {
+      "epoch": 0.016224214225849062,
+      "grad_norm": 0.00438309321179986,
+      "learning_rate": 0.001,
+      "loss": 0.363,
+      "step": 588
+    },
+    {
+      "epoch": 0.016251806426913432,
+      "grad_norm": 0.004080008715391159,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 589
+    },
+    {
+      "epoch": 0.016279398627977802,
+      "grad_norm": 0.004915320780128241,
+      "learning_rate": 0.001,
+      "loss": 0.3786,
+      "step": 590
+    },
+    {
+      "epoch": 0.016306990829042172,
+      "grad_norm": 0.0037770781200379133,
+      "learning_rate": 0.001,
+      "loss": 0.451,
+      "step": 591
+    },
+    {
+      "epoch": 0.016334583030106542,
+      "grad_norm": 0.0039979214780032635,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 592
+    },
+    {
+      "epoch": 0.01636217523117091,
+      "grad_norm": 0.0035175576340407133,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 593
+    },
+    {
+      "epoch": 0.01638976743223528,
+      "grad_norm": 0.003597306553274393,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 594
+    },
+    {
+      "epoch": 0.016417359633299648,
+      "grad_norm": 0.002302913460880518,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 595
+    },
+    {
+      "epoch": 0.016444951834364018,
+      "grad_norm": 0.0026150753255933523,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 596
+    },
+    {
+      "epoch": 0.016472544035428385,
+      "grad_norm": 0.002459397306665778,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 597
+    },
+    {
+      "epoch": 0.016500136236492755,
+      "grad_norm": 0.00271104765124619,
+      "learning_rate": 0.001,
+      "loss": 0.417,
+      "step": 598
+    },
+    {
+      "epoch": 0.016527728437557124,
+      "grad_norm": 0.003122537862509489,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 599
+    },
+    {
+      "epoch": 0.016555320638621494,
+      "grad_norm": 0.0028217623475939035,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 600
+    },
+    {
+      "epoch": 0.016582912839685864,
+      "grad_norm": 0.0029432664159685373,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 601
+    },
+    {
+      "epoch": 0.01661050504075023,
+      "grad_norm": 0.0030998094007372856,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 602
+    },
+    {
+      "epoch": 0.0166380972418146,
+      "grad_norm": 0.005838985554873943,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 603
+    },
+    {
+      "epoch": 0.01666568944287897,
+      "grad_norm": 0.003850255161523819,
+      "learning_rate": 0.001,
+      "loss": 0.3664,
+      "step": 604
+    },
+    {
+      "epoch": 0.01669328164394334,
+      "grad_norm": 0.0026384994853287935,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 605
+    },
+    {
+      "epoch": 0.016720873845007707,
+      "grad_norm": 0.003034224035218358,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 606
+    },
+    {
+      "epoch": 0.016748466046072077,
+      "grad_norm": 0.003741856198757887,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 607
+    },
+    {
+      "epoch": 0.016776058247136447,
+      "grad_norm": 0.0029668582137674093,
+      "learning_rate": 0.001,
+      "loss": 0.4492,
+      "step": 608
+    },
+    {
+      "epoch": 0.016803650448200817,
+      "grad_norm": 0.006931662559509277,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 609
+    },
+    {
+      "epoch": 0.016831242649265186,
+      "grad_norm": 0.005429959390312433,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 610
+    },
+    {
+      "epoch": 0.016858834850329553,
+      "grad_norm": 0.008053823374211788,
+      "learning_rate": 0.001,
+      "loss": 0.3765,
+      "step": 611
+    },
+    {
+      "epoch": 0.016886427051393923,
+      "grad_norm": 0.0056150988675653934,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 612
+    },
+    {
+      "epoch": 0.016914019252458293,
+      "grad_norm": 0.005078485235571861,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 613
+    },
+    {
+      "epoch": 0.016941611453522663,
+      "grad_norm": 0.0043738181702792645,
+      "learning_rate": 0.001,
+      "loss": 0.4368,
+      "step": 614
+    },
+    {
+      "epoch": 0.016969203654587033,
+      "grad_norm": 0.004265911877155304,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 615
+    },
+    {
+      "epoch": 0.0169967958556514,
+      "grad_norm": 0.004496406763792038,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 616
+    },
+    {
+      "epoch": 0.01702438805671577,
+      "grad_norm": 0.004225629381835461,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 617
+    },
+    {
+      "epoch": 0.01705198025778014,
+      "grad_norm": 0.004265735857188702,
+      "learning_rate": 0.001,
+      "loss": 0.3624,
+      "step": 618
+    },
+    {
+      "epoch": 0.01707957245884451,
+      "grad_norm": 0.0045761000365018845,
+      "learning_rate": 0.001,
+      "loss": 0.4298,
+      "step": 619
+    },
+    {
+      "epoch": 0.017107164659908875,
+      "grad_norm": 0.0046028876677155495,
+      "learning_rate": 0.001,
+      "loss": 0.4366,
+      "step": 620
+    },
+    {
+      "epoch": 0.017134756860973245,
+      "grad_norm": 0.003382893279194832,
+      "learning_rate": 0.001,
+      "loss": 0.385,
+      "step": 621
+    },
+    {
+      "epoch": 0.017162349062037615,
+      "grad_norm": 0.003314296016469598,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 622
+    },
+    {
+      "epoch": 0.017189941263101985,
+      "grad_norm": 0.0035887316334992647,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 623
+    },
+    {
+      "epoch": 0.017217533464166355,
+      "grad_norm": 0.003954799845814705,
+      "learning_rate": 0.001,
+      "loss": 0.4236,
+      "step": 624
+    },
+    {
+      "epoch": 0.01724512566523072,
+      "grad_norm": 0.003979894332587719,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 625
+    },
+    {
+      "epoch": 0.01727271786629509,
+      "grad_norm": 0.0029897873755544424,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 626
+    },
+    {
+      "epoch": 0.01730031006735946,
+      "grad_norm": 0.0038952503819018602,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 627
+    },
+    {
+      "epoch": 0.01732790226842383,
+      "grad_norm": 0.004283791407942772,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 628
+    },
+    {
+      "epoch": 0.0173554944694882,
+      "grad_norm": 0.004268042277544737,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 629
+    },
+    {
+      "epoch": 0.017383086670552567,
+      "grad_norm": 0.0034829305950552225,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 630
+    },
+    {
+      "epoch": 0.017410678871616937,
+      "grad_norm": 0.004891110584139824,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 631
+    },
+    {
+      "epoch": 0.017438271072681307,
+      "grad_norm": 0.009508724324405193,
+      "learning_rate": 0.001,
+      "loss": 0.4033,
+      "step": 632
+    },
+    {
+      "epoch": 0.017465863273745677,
+      "grad_norm": 0.003979192115366459,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 633
+    },
+    {
+      "epoch": 0.017493455474810043,
+      "grad_norm": 0.006190054584294558,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 634
+    },
+    {
+      "epoch": 0.017521047675874413,
+      "grad_norm": 0.003986046649515629,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 635
+    },
+    {
+      "epoch": 0.017548639876938783,
+      "grad_norm": 0.002939463360235095,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 636
+    },
+    {
+      "epoch": 0.017576232078003153,
+      "grad_norm": 0.005479493178427219,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 637
+    },
+    {
+      "epoch": 0.017603824279067523,
+      "grad_norm": 0.00527595542371273,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 638
+    },
+    {
+      "epoch": 0.01763141648013189,
+      "grad_norm": 0.004213378299027681,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 639
+    },
+    {
+      "epoch": 0.01765900868119626,
+      "grad_norm": 0.004745858255773783,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 640
+    },
+    {
+      "epoch": 0.01768660088226063,
+      "grad_norm": 0.0038344142958521843,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 641
+    },
+    {
+      "epoch": 0.017714193083325,
+      "grad_norm": 0.004831280559301376,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 642
+    },
+    {
+      "epoch": 0.017741785284389366,
+      "grad_norm": 0.0035769357345998287,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 643
+    },
+    {
+      "epoch": 0.017769377485453736,
+      "grad_norm": 0.00572391739115119,
+      "learning_rate": 0.001,
+      "loss": 0.4168,
+      "step": 644
+    },
+    {
+      "epoch": 0.017796969686518106,
+      "grad_norm": 0.005403329152613878,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 645
+    },
+    {
+      "epoch": 0.017824561887582475,
+      "grad_norm": 0.003341693663969636,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 646
+    },
+    {
+      "epoch": 0.017852154088646845,
+      "grad_norm": 0.00487833097577095,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 647
+    },
+    {
+      "epoch": 0.017879746289711212,
+      "grad_norm": 0.004141534212976694,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 648
+    },
+    {
+      "epoch": 0.01790733849077558,
+      "grad_norm": 0.004081532824784517,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 649
+    },
+    {
+      "epoch": 0.01793493069183995,
+      "grad_norm": 0.004535014741122723,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 650
+    },
+    {
+      "epoch": 0.01796252289290432,
+      "grad_norm": 0.003384925192221999,
+      "learning_rate": 0.001,
+      "loss": 0.3795,
+      "step": 651
+    },
+    {
+      "epoch": 0.01799011509396869,
+      "grad_norm": 0.003582439851015806,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 652
+    },
+    {
+      "epoch": 0.018017707295033058,
+      "grad_norm": 0.0033751516602933407,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 653
+    },
+    {
+      "epoch": 0.018045299496097428,
+      "grad_norm": 0.004208603408187628,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 654
+    },
+    {
+      "epoch": 0.018072891697161798,
+      "grad_norm": 0.005743230227380991,
+      "learning_rate": 0.001,
+      "loss": 0.3685,
+      "step": 655
+    },
+    {
+      "epoch": 0.018100483898226168,
+      "grad_norm": 0.00536683714017272,
+      "learning_rate": 0.001,
+      "loss": 0.3488,
+      "step": 656
+    },
+    {
+      "epoch": 0.018128076099290534,
+      "grad_norm": 0.003961362410336733,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 657
+    },
+    {
+      "epoch": 0.018155668300354904,
+      "grad_norm": 0.004491451662033796,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 658
+    },
+    {
+      "epoch": 0.018183260501419274,
+      "grad_norm": 0.005195736885070801,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 659
+    },
+    {
+      "epoch": 0.018210852702483644,
+      "grad_norm": 0.003914379980415106,
+      "learning_rate": 0.001,
+      "loss": 0.3541,
+      "step": 660
+    },
+    {
+      "epoch": 0.018238444903548014,
+      "grad_norm": 0.00485312519595027,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 661
+    },
+    {
+      "epoch": 0.01826603710461238,
+      "grad_norm": 0.00337631581351161,
+      "learning_rate": 0.001,
+      "loss": 0.4469,
+      "step": 662
+    },
+    {
+      "epoch": 0.01829362930567675,
+      "grad_norm": 0.00616971543058753,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 663
+    },
+    {
+      "epoch": 0.01832122150674112,
+      "grad_norm": 0.0032724293414503336,
+      "learning_rate": 0.001,
+      "loss": 0.3717,
+      "step": 664
+    },
+    {
+      "epoch": 0.01834881370780549,
+      "grad_norm": 0.003196743782609701,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 665
+    },
+    {
+      "epoch": 0.018376405908869856,
+      "grad_norm": 0.00492246774956584,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 666
+    },
+    {
+      "epoch": 0.018403998109934226,
+      "grad_norm": 0.0029899398796260357,
+      "learning_rate": 0.001,
+      "loss": 0.4323,
+      "step": 667
+    },
+    {
+      "epoch": 0.018431590310998596,
+      "grad_norm": 0.00527157774195075,
+      "learning_rate": 0.001,
+      "loss": 0.4103,
+      "step": 668
+    },
+    {
+      "epoch": 0.018459182512062966,
+      "grad_norm": 0.002288073068484664,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 669
+    },
+    {
+      "epoch": 0.018486774713127336,
+      "grad_norm": 0.0031164512038230896,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 670
+    },
+    {
+      "epoch": 0.018514366914191702,
+      "grad_norm": 0.0031951169949024916,
+      "learning_rate": 0.001,
+      "loss": 0.3753,
+      "step": 671
+    },
+    {
+      "epoch": 0.018541959115256072,
+      "grad_norm": 0.0035726516507565975,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 672
+    },
+    {
+      "epoch": 0.018569551316320442,
+      "grad_norm": 0.0036449262406677008,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 673
+    },
+    {
+      "epoch": 0.018597143517384812,
+      "grad_norm": 0.003955410327762365,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 674
+    },
+    {
+      "epoch": 0.018624735718449182,
+      "grad_norm": 0.0030794877093285322,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 675
+    },
+    {
+      "epoch": 0.01865232791951355,
+      "grad_norm": 0.00874530989676714,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 676
+    },
+    {
+      "epoch": 0.01867992012057792,
+      "grad_norm": 0.0042561995796859264,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 677
+    },
+    {
+      "epoch": 0.018707512321642288,
+      "grad_norm": 0.00404794467613101,
+      "learning_rate": 0.001,
+      "loss": 0.4028,
+      "step": 678
+    },
+    {
+      "epoch": 0.018735104522706658,
+      "grad_norm": 0.004145196173340082,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 679
+    },
+    {
+      "epoch": 0.018762696723771025,
+      "grad_norm": 0.00520210200920701,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 680
+    },
+    {
+      "epoch": 0.018790288924835395,
+      "grad_norm": 0.004079956095665693,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 681
+    },
+    {
+      "epoch": 0.018817881125899764,
+      "grad_norm": 0.0040541659109294415,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 682
+    },
+    {
+      "epoch": 0.018845473326964134,
+      "grad_norm": 0.004334924276918173,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 683
+    },
+    {
+      "epoch": 0.018873065528028504,
+      "grad_norm": 0.0037169347051531076,
+      "learning_rate": 0.001,
+      "loss": 0.4317,
+      "step": 684
+    },
+    {
+      "epoch": 0.01890065772909287,
+      "grad_norm": 0.005052134394645691,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 685
+    },
+    {
+      "epoch": 0.01892824993015724,
+      "grad_norm": 0.003371543250977993,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 686
+    },
+    {
+      "epoch": 0.01895584213122161,
+      "grad_norm": 0.002939490368589759,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 687
+    },
+    {
+      "epoch": 0.01898343433228598,
+      "grad_norm": 0.003637490328401327,
+      "learning_rate": 0.001,
+      "loss": 0.3654,
+      "step": 688
+    },
+    {
+      "epoch": 0.019011026533350347,
+      "grad_norm": 0.003198450431227684,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 689
+    },
+    {
+      "epoch": 0.019038618734414717,
+      "grad_norm": 0.004216643515974283,
+      "learning_rate": 0.001,
+      "loss": 0.4033,
+      "step": 690
+    },
+    {
+      "epoch": 0.019066210935479087,
+      "grad_norm": 0.004692459478974342,
+      "learning_rate": 0.001,
+      "loss": 0.3592,
+      "step": 691
+    },
+    {
+      "epoch": 0.019093803136543457,
+      "grad_norm": 0.0043529337272048,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 692
+    },
+    {
+      "epoch": 0.019121395337607826,
+      "grad_norm": 0.004598288331180811,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 693
+    },
+    {
+      "epoch": 0.019148987538672193,
+      "grad_norm": 0.0035506533458828926,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 694
+    },
+    {
+      "epoch": 0.019176579739736563,
+      "grad_norm": 0.0035839497577399015,
+      "learning_rate": 0.001,
+      "loss": 0.4209,
+      "step": 695
+    },
+    {
+      "epoch": 0.019204171940800933,
+      "grad_norm": 0.005143923219293356,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 696
+    },
+    {
+      "epoch": 0.019231764141865303,
+      "grad_norm": 0.003638223512098193,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 697
+    },
+    {
+      "epoch": 0.019259356342929673,
+      "grad_norm": 0.003875188762322068,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 698
+    },
+    {
+      "epoch": 0.01928694854399404,
+      "grad_norm": 0.004542836919426918,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 699
+    },
+    {
+      "epoch": 0.01931454074505841,
+      "grad_norm": 0.00382284470833838,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 700
+    },
+    {
+      "epoch": 0.01934213294612278,
+      "grad_norm": 0.004041844513267279,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 701
+    },
+    {
+      "epoch": 0.01936972514718715,
+      "grad_norm": 0.0031541790813207626,
+      "learning_rate": 0.001,
+      "loss": 0.4363,
+      "step": 702
+    },
+    {
+      "epoch": 0.019397317348251515,
+      "grad_norm": 0.00377883343026042,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 703
+    },
+    {
+      "epoch": 0.019424909549315885,
+      "grad_norm": 0.0043754116632044315,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 704
+    },
+    {
+      "epoch": 0.019452501750380255,
+      "grad_norm": 0.0037639569491147995,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 705
+    },
+    {
+      "epoch": 0.019480093951444625,
+      "grad_norm": 0.003134689759463072,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 706
+    },
+    {
+      "epoch": 0.019507686152508995,
+      "grad_norm": 0.0035348469391465187,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 707
+    },
+    {
+      "epoch": 0.01953527835357336,
+      "grad_norm": 0.0043303752318024635,
+      "learning_rate": 0.001,
+      "loss": 0.4274,
+      "step": 708
+    },
+    {
+      "epoch": 0.01956287055463773,
+      "grad_norm": 0.004089695867151022,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 709
+    },
+    {
+      "epoch": 0.0195904627557021,
+      "grad_norm": 0.004323967732489109,
+      "learning_rate": 0.001,
+      "loss": 0.4269,
+      "step": 710
+    },
+    {
+      "epoch": 0.01961805495676647,
+      "grad_norm": 0.003221880178898573,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 711
+    },
+    {
+      "epoch": 0.019645647157830837,
+      "grad_norm": 0.004038861952722073,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 712
+    },
+    {
+      "epoch": 0.019673239358895207,
+      "grad_norm": 0.00399352191016078,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 713
+    },
+    {
+      "epoch": 0.019700831559959577,
+      "grad_norm": 0.003673392115160823,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 714
+    },
+    {
+      "epoch": 0.019728423761023947,
+      "grad_norm": 0.004011491779237986,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 715
+    },
+    {
+      "epoch": 0.019756015962088317,
+      "grad_norm": 0.005186786409467459,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 716
+    },
+    {
+      "epoch": 0.019783608163152683,
+      "grad_norm": 0.005963923409581184,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 717
+    },
+    {
+      "epoch": 0.019811200364217053,
+      "grad_norm": 0.004075867123901844,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 718
+    },
+    {
+      "epoch": 0.019838792565281423,
+      "grad_norm": 0.0034671409521251917,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 719
+    },
+    {
+      "epoch": 0.019866384766345793,
+      "grad_norm": 0.004385409411042929,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 720
+    },
+    {
+      "epoch": 0.019893976967410163,
+      "grad_norm": 0.0036052153445780277,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 721
+    },
+    {
+      "epoch": 0.01992156916847453,
+      "grad_norm": 0.0038659870624542236,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 722
+    },
+    {
+      "epoch": 0.0199491613695389,
+      "grad_norm": 0.006984284613281488,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 723
+    },
+    {
+      "epoch": 0.01997675357060327,
+      "grad_norm": 0.00536124873906374,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 724
+    },
+    {
+      "epoch": 0.02000434577166764,
+      "grad_norm": 0.005374076310545206,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 725
+    },
+    {
+      "epoch": 0.020031937972732006,
+      "grad_norm": 0.004354424774646759,
+      "learning_rate": 0.001,
+      "loss": 0.4103,
+      "step": 726
+    },
+    {
+      "epoch": 0.020059530173796376,
+      "grad_norm": 0.004792310297489166,
+      "learning_rate": 0.001,
+      "loss": 0.4311,
+      "step": 727
+    },
+    {
+      "epoch": 0.020087122374860746,
+      "grad_norm": 0.004100026097148657,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 728
+    },
+    {
+      "epoch": 0.020114714575925115,
+      "grad_norm": 0.00429933238774538,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 729
+    },
+    {
+      "epoch": 0.020142306776989485,
+      "grad_norm": 0.003623842727392912,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 730
+    },
+    {
+      "epoch": 0.020169898978053852,
+      "grad_norm": 0.003992657642811537,
+      "learning_rate": 0.001,
+      "loss": 0.4407,
+      "step": 731
+    },
+    {
+      "epoch": 0.02019749117911822,
+      "grad_norm": 0.004881622269749641,
+      "learning_rate": 0.001,
+      "loss": 0.3723,
+      "step": 732
+    },
+    {
+      "epoch": 0.02022508338018259,
+      "grad_norm": 0.004593881778419018,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 733
+    },
+    {
+      "epoch": 0.02025267558124696,
+      "grad_norm": 0.00592020945623517,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 734
+    },
+    {
+      "epoch": 0.02028026778231133,
+      "grad_norm": 0.004086300730705261,
+      "learning_rate": 0.001,
+      "loss": 0.365,
+      "step": 735
+    },
+    {
+      "epoch": 0.020307859983375698,
+      "grad_norm": 0.008120741695165634,
+      "learning_rate": 0.001,
+      "loss": 0.4389,
+      "step": 736
+    },
+    {
+      "epoch": 0.020335452184440068,
+      "grad_norm": 0.003741480875760317,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 737
+    },
+    {
+      "epoch": 0.020363044385504438,
+      "grad_norm": 0.00764830969274044,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 738
+    },
+    {
+      "epoch": 0.020390636586568808,
+      "grad_norm": 0.0035037719644606113,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 739
+    },
+    {
+      "epoch": 0.020418228787633174,
+      "grad_norm": 0.00437674205750227,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 740
+    },
+    {
+      "epoch": 0.020445820988697544,
+      "grad_norm": 0.004506263881921768,
+      "learning_rate": 0.001,
+      "loss": 0.4221,
+      "step": 741
+    },
+    {
+      "epoch": 0.020473413189761914,
+      "grad_norm": 0.004545763600617647,
+      "learning_rate": 0.001,
+      "loss": 0.3621,
+      "step": 742
+    },
+    {
+      "epoch": 0.020501005390826284,
+      "grad_norm": 0.0050647263415157795,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 743
+    },
+    {
+      "epoch": 0.020528597591890654,
+      "grad_norm": 0.006431067828088999,
+      "learning_rate": 0.001,
+      "loss": 0.3693,
+      "step": 744
+    },
+    {
+      "epoch": 0.02055618979295502,
+      "grad_norm": 0.00419000955298543,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 745
+    },
+    {
+      "epoch": 0.02058378199401939,
+      "grad_norm": 0.005267234519124031,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 746
+    },
+    {
+      "epoch": 0.02061137419508376,
+      "grad_norm": 0.0034249969758093357,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 747
+    },
+    {
+      "epoch": 0.02063896639614813,
+      "grad_norm": 0.00436822697520256,
+      "learning_rate": 0.001,
+      "loss": 0.3655,
+      "step": 748
+    },
+    {
+      "epoch": 0.020666558597212496,
+      "grad_norm": 0.002724075224250555,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 749
+    },
+    {
+      "epoch": 0.020694150798276866,
+      "grad_norm": 0.0030281811486929655,
+      "learning_rate": 0.001,
+      "loss": 0.3682,
+      "step": 750
+    },
+    {
+      "epoch": 0.020721742999341236,
+      "grad_norm": 0.004672218579798937,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 751
+    },
+    {
+      "epoch": 0.020749335200405606,
+      "grad_norm": 0.004708775784820318,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 752
+    },
+    {
+      "epoch": 0.020776927401469976,
+      "grad_norm": 0.009867183864116669,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 753
+    },
+    {
+      "epoch": 0.020804519602534342,
+      "grad_norm": 0.003236403688788414,
+      "learning_rate": 0.001,
+      "loss": 0.3786,
+      "step": 754
+    },
+    {
+      "epoch": 0.020832111803598712,
+      "grad_norm": 0.003183891996741295,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 755
+    },
+    {
+      "epoch": 0.020859704004663082,
+      "grad_norm": 0.0026451335288584232,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 756
+    },
+    {
+      "epoch": 0.020887296205727452,
+      "grad_norm": 0.0030694296583533287,
+      "learning_rate": 0.001,
+      "loss": 0.4227,
+      "step": 757
+    },
+    {
+      "epoch": 0.020914888406791822,
+      "grad_norm": 0.004503907170146704,
+      "learning_rate": 0.001,
+      "loss": 0.365,
+      "step": 758
+    },
+    {
+      "epoch": 0.02094248060785619,
+      "grad_norm": 0.0033429849427193403,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 759
+    },
+    {
+      "epoch": 0.02097007280892056,
+      "grad_norm": 0.005442556459456682,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 760
+    },
+    {
+      "epoch": 0.020997665009984928,
+      "grad_norm": 0.0036932167131453753,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 761
+    },
+    {
+      "epoch": 0.021025257211049298,
+      "grad_norm": 0.004506903700530529,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 762
+    },
+    {
+      "epoch": 0.021052849412113665,
+      "grad_norm": 0.003803919767960906,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 763
+    },
+    {
+      "epoch": 0.021080441613178034,
+      "grad_norm": 0.005300893913954496,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 764
+    },
+    {
+      "epoch": 0.021108033814242404,
+      "grad_norm": 0.005235253367573023,
+      "learning_rate": 0.001,
+      "loss": 0.4339,
+      "step": 765
+    },
+    {
+      "epoch": 0.021135626015306774,
+      "grad_norm": 0.00439833290874958,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 766
+    },
+    {
+      "epoch": 0.021163218216371144,
+      "grad_norm": 0.003949869889765978,
+      "learning_rate": 0.001,
+      "loss": 0.3549,
+      "step": 767
+    },
+    {
+      "epoch": 0.02119081041743551,
+      "grad_norm": 0.0055135395377874374,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 768
+    },
+    {
+      "epoch": 0.02121840261849988,
+      "grad_norm": 0.004492396488785744,
+      "learning_rate": 0.001,
+      "loss": 0.3639,
+      "step": 769
+    },
+    {
+      "epoch": 0.02124599481956425,
+      "grad_norm": 0.0040711937472224236,
+      "learning_rate": 0.001,
+      "loss": 0.3678,
+      "step": 770
+    },
+    {
+      "epoch": 0.02127358702062862,
+      "grad_norm": 0.0038428730331361294,
+      "learning_rate": 0.001,
+      "loss": 0.437,
+      "step": 771
+    },
+    {
+      "epoch": 0.021301179221692987,
+      "grad_norm": 0.003672838443890214,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 772
+    },
+    {
+      "epoch": 0.021328771422757357,
+      "grad_norm": 0.003955157473683357,
+      "learning_rate": 0.001,
+      "loss": 0.4375,
+      "step": 773
+    },
+    {
+      "epoch": 0.021356363623821727,
+      "grad_norm": 0.005334306508302689,
+      "learning_rate": 0.001,
+      "loss": 0.4313,
+      "step": 774
+    },
+    {
+      "epoch": 0.021383955824886097,
+      "grad_norm": 0.004772811662405729,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 775
+    },
+    {
+      "epoch": 0.021411548025950466,
+      "grad_norm": 0.005606191698461771,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 776
+    },
+    {
+      "epoch": 0.021439140227014833,
+      "grad_norm": 0.004244519397616386,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 777
+    },
+    {
+      "epoch": 0.021466732428079203,
+      "grad_norm": 0.0028992686420679092,
+      "learning_rate": 0.001,
+      "loss": 0.4309,
+      "step": 778
+    },
+    {
+      "epoch": 0.021494324629143573,
+      "grad_norm": 0.0031759096309542656,
+      "learning_rate": 0.001,
+      "loss": 0.437,
+      "step": 779
+    },
+    {
+      "epoch": 0.021521916830207943,
+      "grad_norm": 0.007329195272177458,
+      "learning_rate": 0.001,
+      "loss": 0.3619,
+      "step": 780
+    },
+    {
+      "epoch": 0.021549509031272313,
+      "grad_norm": 0.006688808090984821,
+      "learning_rate": 0.001,
+      "loss": 0.4491,
+      "step": 781
+    },
+    {
+      "epoch": 0.02157710123233668,
+      "grad_norm": 0.00896657258272171,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 782
+    },
+    {
+      "epoch": 0.02160469343340105,
+      "grad_norm": 0.005213500466197729,
+      "learning_rate": 0.001,
+      "loss": 0.4435,
+      "step": 783
+    },
+    {
+      "epoch": 0.02163228563446542,
+      "grad_norm": 0.0036038621328771114,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 784
+    },
+    {
+      "epoch": 0.02165987783552979,
+      "grad_norm": 0.003994919825345278,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 785
+    },
+    {
+      "epoch": 0.021687470036594155,
+      "grad_norm": 0.004139062017202377,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 786
+    },
+    {
+      "epoch": 0.021715062237658525,
+      "grad_norm": 0.008915436454117298,
+      "learning_rate": 0.001,
+      "loss": 0.421,
+      "step": 787
+    },
+    {
+      "epoch": 0.021742654438722895,
+      "grad_norm": 0.003705317387357354,
+      "learning_rate": 0.001,
+      "loss": 0.3576,
+      "step": 788
+    },
+    {
+      "epoch": 0.021770246639787265,
+      "grad_norm": 0.004881418775767088,
+      "learning_rate": 0.001,
+      "loss": 0.4031,
+      "step": 789
+    },
+    {
+      "epoch": 0.021797838840851635,
+      "grad_norm": 0.00573571864515543,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 790
+    },
+    {
+      "epoch": 0.021825431041916,
+      "grad_norm": 0.002695683157071471,
+      "learning_rate": 0.001,
+      "loss": 0.4335,
+      "step": 791
+    },
+    {
+      "epoch": 0.02185302324298037,
+      "grad_norm": 0.003691880265250802,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 792
+    },
+    {
+      "epoch": 0.02188061544404474,
+      "grad_norm": 0.003528386354446411,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 793
+    },
+    {
+      "epoch": 0.02190820764510911,
+      "grad_norm": 0.0033985383342951536,
+      "learning_rate": 0.001,
+      "loss": 0.4327,
+      "step": 794
+    },
+    {
+      "epoch": 0.021935799846173477,
+      "grad_norm": 0.003107238095253706,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 795
+    },
+    {
+      "epoch": 0.021963392047237847,
+      "grad_norm": 0.0034598596394062042,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 796
+    },
+    {
+      "epoch": 0.021990984248302217,
+      "grad_norm": 0.004219945520162582,
+      "learning_rate": 0.001,
+      "loss": 0.4331,
+      "step": 797
+    },
+    {
+      "epoch": 0.022018576449366587,
+      "grad_norm": 0.003744914662092924,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 798
+    },
+    {
+      "epoch": 0.022046168650430957,
+      "grad_norm": 0.004566433373838663,
+      "learning_rate": 0.001,
+      "loss": 0.4304,
+      "step": 799
+    },
+    {
+      "epoch": 0.022073760851495323,
+      "grad_norm": 0.003758675418794155,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 800
+    },
+    {
+      "epoch": 0.022101353052559693,
+      "grad_norm": 0.0030292419251054525,
+      "learning_rate": 0.001,
+      "loss": 0.4417,
+      "step": 801
+    },
+    {
+      "epoch": 0.022128945253624063,
+      "grad_norm": 0.004074092488735914,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 802
+    },
+    {
+      "epoch": 0.022156537454688433,
+      "grad_norm": 0.00471070921048522,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 803
+    },
+    {
+      "epoch": 0.022184129655752803,
+      "grad_norm": 0.007867252454161644,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 804
+    },
+    {
+      "epoch": 0.02221172185681717,
+      "grad_norm": 0.003963668830692768,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 805
+    },
+    {
+      "epoch": 0.02223931405788154,
+      "grad_norm": 0.005508980248123407,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 806
+    },
+    {
+      "epoch": 0.02226690625894591,
+      "grad_norm": 0.003475068835541606,
+      "learning_rate": 0.001,
+      "loss": 0.434,
+      "step": 807
+    },
+    {
+      "epoch": 0.02229449846001028,
+      "grad_norm": 0.0032727334182709455,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 808
+    },
+    {
+      "epoch": 0.022322090661074646,
+      "grad_norm": 0.0029324067290872335,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 809
+    },
+    {
+      "epoch": 0.022349682862139016,
+      "grad_norm": 0.005346233956515789,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 810
+    },
+    {
+      "epoch": 0.022377275063203386,
+      "grad_norm": 0.004121492151170969,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 811
+    },
+    {
+      "epoch": 0.022404867264267755,
+      "grad_norm": 0.004381257575005293,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 812
+    },
+    {
+      "epoch": 0.022432459465332125,
+      "grad_norm": 0.005230156239122152,
+      "learning_rate": 0.001,
+      "loss": 0.4351,
+      "step": 813
+    },
+    {
+      "epoch": 0.022460051666396492,
+      "grad_norm": 0.004103715531527996,
+      "learning_rate": 0.001,
+      "loss": 0.3795,
+      "step": 814
+    },
+    {
+      "epoch": 0.02248764386746086,
+      "grad_norm": 0.007542972918599844,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 815
+    },
+    {
+      "epoch": 0.02251523606852523,
+      "grad_norm": 0.003434807062149048,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 816
+    },
+    {
+      "epoch": 0.0225428282695896,
+      "grad_norm": 0.003715425031259656,
+      "learning_rate": 0.001,
+      "loss": 0.3636,
+      "step": 817
+    },
+    {
+      "epoch": 0.022570420470653968,
+      "grad_norm": 0.005767806898802519,
+      "learning_rate": 0.001,
+      "loss": 0.433,
+      "step": 818
+    },
+    {
+      "epoch": 0.022598012671718338,
+      "grad_norm": 0.007371674291789532,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 819
+    },
+    {
+      "epoch": 0.022625604872782708,
+      "grad_norm": 0.0067810118198394775,
+      "learning_rate": 0.001,
+      "loss": 0.4248,
+      "step": 820
+    },
+    {
+      "epoch": 0.022653197073847078,
+      "grad_norm": 0.004905116744339466,
+      "learning_rate": 0.001,
+      "loss": 0.3493,
+      "step": 821
+    },
+    {
+      "epoch": 0.022680789274911448,
+      "grad_norm": 0.0027144188061356544,
+      "learning_rate": 0.001,
+      "loss": 0.424,
+      "step": 822
+    },
+    {
+      "epoch": 0.022708381475975814,
+      "grad_norm": 0.013933762907981873,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 823
+    },
+    {
+      "epoch": 0.022735973677040184,
+      "grad_norm": 0.002978452481329441,
+      "learning_rate": 0.001,
+      "loss": 0.4361,
+      "step": 824
+    },
+    {
+      "epoch": 0.022763565878104554,
+      "grad_norm": 0.0060105458833277225,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 825
+    },
+    {
+      "epoch": 0.022791158079168924,
+      "grad_norm": 0.004003115464001894,
+      "learning_rate": 0.001,
+      "loss": 0.4344,
+      "step": 826
+    },
+    {
+      "epoch": 0.022818750280233294,
+      "grad_norm": 0.008088911883533001,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 827
+    },
+    {
+      "epoch": 0.02284634248129766,
+      "grad_norm": 0.005061788484454155,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 828
+    },
+    {
+      "epoch": 0.02287393468236203,
+      "grad_norm": 0.00426626717671752,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 829
+    },
+    {
+      "epoch": 0.0229015268834264,
+      "grad_norm": 0.0029426752589643,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 830
+    },
+    {
+      "epoch": 0.02292911908449077,
+      "grad_norm": 0.005651662591844797,
+      "learning_rate": 0.001,
+      "loss": 0.4277,
+      "step": 831
+    },
+    {
+      "epoch": 0.022956711285555136,
+      "grad_norm": 0.0026404529344290495,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 832
+    },
+    {
+      "epoch": 0.022984303486619506,
+      "grad_norm": 0.0029586381278932095,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 833
+    },
+    {
+      "epoch": 0.023011895687683876,
+      "grad_norm": 0.0026806823443621397,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 834
+    },
+    {
+      "epoch": 0.023039487888748246,
+      "grad_norm": 0.004135414958000183,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 835
+    },
+    {
+      "epoch": 0.023067080089812616,
+      "grad_norm": 0.0023586770985275507,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 836
+    },
+    {
+      "epoch": 0.023094672290876982,
+      "grad_norm": 0.004498126916587353,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 837
+    },
+    {
+      "epoch": 0.023122264491941352,
+      "grad_norm": 0.003083957824856043,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 838
+    },
+    {
+      "epoch": 0.023149856693005722,
+      "grad_norm": 0.002510181860998273,
+      "learning_rate": 0.001,
+      "loss": 0.3618,
+      "step": 839
+    },
+    {
+      "epoch": 0.023177448894070092,
+      "grad_norm": 0.002793237566947937,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 840
+    },
+    {
+      "epoch": 0.023205041095134462,
+      "grad_norm": 0.002880933927372098,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 841
+    },
+    {
+      "epoch": 0.02323263329619883,
+      "grad_norm": 0.004375714808702469,
+      "learning_rate": 0.001,
+      "loss": 0.4275,
+      "step": 842
+    },
+    {
+      "epoch": 0.0232602254972632,
+      "grad_norm": 0.0033162750769406557,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 843
+    },
+    {
+      "epoch": 0.023287817698327568,
+      "grad_norm": 0.004250579979270697,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 844
+    },
+    {
+      "epoch": 0.023315409899391938,
+      "grad_norm": 0.0036853316705673933,
+      "learning_rate": 0.001,
+      "loss": 0.3553,
+      "step": 845
+    },
+    {
+      "epoch": 0.023343002100456305,
+      "grad_norm": 0.005420148838311434,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 846
+    },
+    {
+      "epoch": 0.023370594301520674,
+      "grad_norm": 0.002736524445936084,
+      "learning_rate": 0.001,
+      "loss": 0.3655,
+      "step": 847
+    },
+    {
+      "epoch": 0.023398186502585044,
+      "grad_norm": 0.0048865810967981815,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 848
+    },
+    {
+      "epoch": 0.023425778703649414,
+      "grad_norm": 0.0033940684515982866,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 849
+    },
+    {
+      "epoch": 0.023453370904713784,
+      "grad_norm": 0.00402069790288806,
+      "learning_rate": 0.001,
+      "loss": 0.3791,
+      "step": 850
+    },
+    {
+      "epoch": 0.02348096310577815,
+      "grad_norm": 0.004255734849721193,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 851
+    },
+    {
+      "epoch": 0.02350855530684252,
+      "grad_norm": 0.0029521863907575607,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 852
+    },
+    {
+      "epoch": 0.02353614750790689,
+      "grad_norm": 0.007674784865230322,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 853
+    },
+    {
+      "epoch": 0.02356373970897126,
+      "grad_norm": 0.003159287618473172,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 854
+    },
+    {
+      "epoch": 0.023591331910035627,
+      "grad_norm": 0.00475959200412035,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 855
+    },
+    {
+      "epoch": 0.023618924111099997,
+      "grad_norm": 0.003215026343241334,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 856
+    },
+    {
+      "epoch": 0.023646516312164367,
+      "grad_norm": 0.0038892014417797327,
+      "learning_rate": 0.001,
+      "loss": 0.4367,
+      "step": 857
+    },
+    {
+      "epoch": 0.023674108513228737,
+      "grad_norm": 0.003636479377746582,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 858
+    },
+    {
+      "epoch": 0.023701700714293106,
+      "grad_norm": 0.0037935448344796896,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 859
+    },
+    {
+      "epoch": 0.023729292915357473,
+      "grad_norm": 0.004558536224067211,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 860
+    },
+    {
+      "epoch": 0.023756885116421843,
+      "grad_norm": 0.003610727610066533,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 861
+    },
+    {
+      "epoch": 0.023784477317486213,
+      "grad_norm": 0.0035163855645805597,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 862
+    },
+    {
+      "epoch": 0.023812069518550583,
+      "grad_norm": 0.003148287069052458,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 863
+    },
+    {
+      "epoch": 0.023839661719614953,
+      "grad_norm": 0.0030976871494203806,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 864
+    },
+    {
+      "epoch": 0.02386725392067932,
+      "grad_norm": 0.003255886258557439,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 865
+    },
+    {
+      "epoch": 0.02389484612174369,
+      "grad_norm": 0.003951660823076963,
+      "learning_rate": 0.001,
+      "loss": 0.4325,
+      "step": 866
+    },
+    {
+      "epoch": 0.02392243832280806,
+      "grad_norm": 0.0038296151906251907,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 867
+    },
+    {
+      "epoch": 0.02395003052387243,
+      "grad_norm": 0.003523425431922078,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 868
+    },
+    {
+      "epoch": 0.023977622724936795,
+      "grad_norm": 0.003752518445253372,
+      "learning_rate": 0.001,
+      "loss": 0.4031,
+      "step": 869
+    },
+    {
+      "epoch": 0.024005214926001165,
+      "grad_norm": 0.005763133056461811,
+      "learning_rate": 0.001,
+      "loss": 0.4345,
+      "step": 870
+    },
+    {
+      "epoch": 0.024032807127065535,
+      "grad_norm": 0.009727811440825462,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 871
+    },
+    {
+      "epoch": 0.024060399328129905,
+      "grad_norm": 0.006550495512783527,
+      "learning_rate": 0.001,
+      "loss": 0.434,
+      "step": 872
+    },
+    {
+      "epoch": 0.024087991529194275,
+      "grad_norm": 0.003153527621179819,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 873
+    },
+    {
+      "epoch": 0.02411558373025864,
+      "grad_norm": 0.004990814719349146,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 874
+    },
+    {
+      "epoch": 0.02414317593132301,
+      "grad_norm": 0.0033743572421371937,
+      "learning_rate": 0.001,
+      "loss": 0.4336,
+      "step": 875
+    },
+    {
+      "epoch": 0.02417076813238738,
+      "grad_norm": 0.004232621286064386,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 876
+    },
+    {
+      "epoch": 0.02419836033345175,
+      "grad_norm": 0.0045753102749586105,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 877
+    },
+    {
+      "epoch": 0.024225952534516117,
+      "grad_norm": 0.005658434238284826,
+      "learning_rate": 0.001,
+      "loss": 0.4274,
+      "step": 878
+    },
+    {
+      "epoch": 0.024253544735580487,
+      "grad_norm": 0.003705628216266632,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 879
+    },
+    {
+      "epoch": 0.024281136936644857,
+      "grad_norm": 0.004948351066559553,
+      "learning_rate": 0.001,
+      "loss": 0.4735,
+      "step": 880
+    },
+    {
+      "epoch": 0.024308729137709227,
+      "grad_norm": 0.003841443918645382,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 881
+    },
+    {
+      "epoch": 0.024336321338773597,
+      "grad_norm": 0.006151879671961069,
+      "learning_rate": 0.001,
+      "loss": 0.3591,
+      "step": 882
+    },
+    {
+      "epoch": 0.024363913539837963,
+      "grad_norm": 0.004619830287992954,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 883
+    },
+    {
+      "epoch": 0.024391505740902333,
+      "grad_norm": 0.004188140854239464,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 884
+    },
+    {
+      "epoch": 0.024419097941966703,
+      "grad_norm": 0.004255149979144335,
+      "learning_rate": 0.001,
+      "loss": 0.3664,
+      "step": 885
+    },
+    {
+      "epoch": 0.024446690143031073,
+      "grad_norm": 0.00479822838678956,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 886
+    },
+    {
+      "epoch": 0.024474282344095443,
+      "grad_norm": 0.004328257869929075,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 887
+    },
+    {
+      "epoch": 0.02450187454515981,
+      "grad_norm": 0.0029929610900580883,
+      "learning_rate": 0.001,
+      "loss": 0.4315,
+      "step": 888
+    },
+    {
+      "epoch": 0.02452946674622418,
+      "grad_norm": 0.003600528696551919,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 889
+    },
+    {
+      "epoch": 0.02455705894728855,
+      "grad_norm": 0.0042820703238248825,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 890
+    },
+    {
+      "epoch": 0.02458465114835292,
+      "grad_norm": 0.004282411653548479,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 891
+    },
+    {
+      "epoch": 0.024612243349417286,
+      "grad_norm": 0.0038429568521678448,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 892
+    },
+    {
+      "epoch": 0.024639835550481656,
+      "grad_norm": 0.0035555907525122166,
+      "learning_rate": 0.001,
+      "loss": 0.4325,
+      "step": 893
+    },
+    {
+      "epoch": 0.024667427751546026,
+      "grad_norm": 0.0035739324521273375,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 894
+    },
+    {
+      "epoch": 0.024695019952610395,
+      "grad_norm": 0.004270533565431833,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 895
+    },
+    {
+      "epoch": 0.024722612153674765,
+      "grad_norm": 0.003811136120930314,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 896
+    },
+    {
+      "epoch": 0.024750204354739132,
+      "grad_norm": 0.004315483383834362,
+      "learning_rate": 0.001,
+      "loss": 0.3805,
+      "step": 897
+    },
+    {
+      "epoch": 0.0247777965558035,
+      "grad_norm": 0.003949417732656002,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 898
+    },
+    {
+      "epoch": 0.02480538875686787,
+      "grad_norm": 0.004281886387616396,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 899
+    },
+    {
+      "epoch": 0.02483298095793224,
+      "grad_norm": 0.004681427031755447,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 900
+    },
+    {
+      "epoch": 0.024860573158996608,
+      "grad_norm": 0.008973667398095131,
+      "learning_rate": 0.001,
+      "loss": 0.4357,
+      "step": 901
+    },
+    {
+      "epoch": 0.024888165360060978,
+      "grad_norm": 0.004130321089178324,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 902
+    },
+    {
+      "epoch": 0.024915757561125348,
+      "grad_norm": 0.005490110721439123,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 903
+    },
+    {
+      "epoch": 0.024943349762189718,
+      "grad_norm": 0.007906914688646793,
+      "learning_rate": 0.001,
+      "loss": 0.3685,
+      "step": 904
+    },
+    {
+      "epoch": 0.024970941963254088,
+      "grad_norm": 0.022035721689462662,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 905
+    },
+    {
+      "epoch": 0.024998534164318454,
+      "grad_norm": 0.0064789773896336555,
+      "learning_rate": 0.001,
+      "loss": 0.3644,
+      "step": 906
+    },
+    {
+      "epoch": 0.025026126365382824,
+      "grad_norm": 0.003105413168668747,
+      "learning_rate": 0.001,
+      "loss": 0.3742,
+      "step": 907
+    },
+    {
+      "epoch": 0.025053718566447194,
+      "grad_norm": 0.0031007244251668453,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 908
+    },
+    {
+      "epoch": 0.025081310767511564,
+      "grad_norm": 0.003298420924693346,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 909
+    },
+    {
+      "epoch": 0.025108902968575934,
+      "grad_norm": 0.0028220806270837784,
+      "learning_rate": 0.001,
+      "loss": 0.428,
+      "step": 910
+    },
+    {
+      "epoch": 0.0251364951696403,
+      "grad_norm": 0.0031455964781343937,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 911
+    },
+    {
+      "epoch": 0.02516408737070467,
+      "grad_norm": 0.0031188016291707754,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 912
+    },
+    {
+      "epoch": 0.02519167957176904,
+      "grad_norm": 0.0035306380596011877,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 913
+    },
+    {
+      "epoch": 0.02521927177283341,
+      "grad_norm": 0.010045260190963745,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 914
+    },
+    {
+      "epoch": 0.025246863973897776,
+      "grad_norm": 0.002441234653815627,
+      "learning_rate": 0.001,
+      "loss": 0.3652,
+      "step": 915
+    },
+    {
+      "epoch": 0.025274456174962146,
+      "grad_norm": 0.0037013038527220488,
+      "learning_rate": 0.001,
+      "loss": 0.3761,
+      "step": 916
+    },
+    {
+      "epoch": 0.025302048376026516,
+      "grad_norm": 0.004198992624878883,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 917
+    },
+    {
+      "epoch": 0.025329640577090886,
+      "grad_norm": 0.0041293492540717125,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 918
+    },
+    {
+      "epoch": 0.025357232778155256,
+      "grad_norm": 0.0031841343734413385,
+      "learning_rate": 0.001,
+      "loss": 0.4145,
+      "step": 919
+    },
+    {
+      "epoch": 0.025384824979219622,
+      "grad_norm": 0.003239045385271311,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 920
+    },
+    {
+      "epoch": 0.025412417180283992,
+      "grad_norm": 0.004458332899957895,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 921
+    },
+    {
+      "epoch": 0.025440009381348362,
+      "grad_norm": 0.00542887207120657,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 922
+    },
+    {
+      "epoch": 0.025467601582412732,
+      "grad_norm": 0.0039135608822107315,
+      "learning_rate": 0.001,
+      "loss": 0.4327,
+      "step": 923
+    },
+    {
+      "epoch": 0.0254951937834771,
+      "grad_norm": 0.003864714875817299,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 924
+    },
+    {
+      "epoch": 0.02552278598454147,
+      "grad_norm": 0.003554902272298932,
+      "learning_rate": 0.001,
+      "loss": 0.3696,
+      "step": 925
+    },
+    {
+      "epoch": 0.02555037818560584,
+      "grad_norm": 0.003268434898927808,
+      "learning_rate": 0.001,
+      "loss": 0.437,
+      "step": 926
+    },
+    {
+      "epoch": 0.025577970386670208,
+      "grad_norm": 0.003844626247882843,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 927
+    },
+    {
+      "epoch": 0.025605562587734578,
+      "grad_norm": 0.0027172230184078217,
+      "learning_rate": 0.001,
+      "loss": 0.4315,
+      "step": 928
+    },
+    {
+      "epoch": 0.025633154788798945,
+      "grad_norm": 0.0031102465000003576,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 929
+    },
+    {
+      "epoch": 0.025660746989863314,
+      "grad_norm": 0.00434950040653348,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 930
+    },
+    {
+      "epoch": 0.025688339190927684,
+      "grad_norm": 0.0034932976122945547,
+      "learning_rate": 0.001,
+      "loss": 0.4578,
+      "step": 931
+    },
+    {
+      "epoch": 0.025715931391992054,
+      "grad_norm": 0.00359813729301095,
+      "learning_rate": 0.001,
+      "loss": 0.4268,
+      "step": 932
+    },
+    {
+      "epoch": 0.025743523593056424,
+      "grad_norm": 0.004753883462399244,
+      "learning_rate": 0.001,
+      "loss": 0.3659,
+      "step": 933
+    },
+    {
+      "epoch": 0.02577111579412079,
+      "grad_norm": 0.0034384452737867832,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 934
+    },
+    {
+      "epoch": 0.02579870799518516,
+      "grad_norm": 0.0031536207534372807,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 935
+    },
+    {
+      "epoch": 0.02582630019624953,
+      "grad_norm": 0.0027636385057121515,
+      "learning_rate": 0.001,
+      "loss": 0.4256,
+      "step": 936
+    },
+    {
+      "epoch": 0.0258538923973139,
+      "grad_norm": 0.002858042251318693,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 937
+    },
+    {
+      "epoch": 0.025881484598378267,
+      "grad_norm": 0.0022810858208686113,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 938
+    },
+    {
+      "epoch": 0.025909076799442637,
+      "grad_norm": 0.004132518544793129,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 939
+    },
+    {
+      "epoch": 0.025936669000507007,
+      "grad_norm": 0.00525138434022665,
+      "learning_rate": 0.001,
+      "loss": 0.4237,
+      "step": 940
+    },
+    {
+      "epoch": 0.025964261201571377,
+      "grad_norm": 0.003772285534068942,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 941
+    },
+    {
+      "epoch": 0.025991853402635746,
+      "grad_norm": 0.0035187045577913523,
+      "learning_rate": 0.001,
+      "loss": 0.3605,
+      "step": 942
+    },
+    {
+      "epoch": 0.026019445603700113,
+      "grad_norm": 0.00876245740801096,
+      "learning_rate": 0.001,
+      "loss": 0.3699,
+      "step": 943
+    },
+    {
+      "epoch": 0.026047037804764483,
+      "grad_norm": 0.002588030882179737,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 944
+    },
+    {
+      "epoch": 0.026074630005828853,
+      "grad_norm": 0.003442909335717559,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 945
+    },
+    {
+      "epoch": 0.026102222206893223,
+      "grad_norm": 0.003080985974520445,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 946
+    },
+    {
+      "epoch": 0.02612981440795759,
+      "grad_norm": 0.0026055227499455214,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 947
+    },
+    {
+      "epoch": 0.02615740660902196,
+      "grad_norm": 0.00331856869161129,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 948
+    },
+    {
+      "epoch": 0.02618499881008633,
+      "grad_norm": 0.003704390488564968,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 949
+    },
+    {
+      "epoch": 0.0262125910111507,
+      "grad_norm": 0.0028117795009166002,
+      "learning_rate": 0.001,
+      "loss": 0.4347,
+      "step": 950
+    },
+    {
+      "epoch": 0.02624018321221507,
+      "grad_norm": 0.005540241952985525,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 951
+    },
+    {
+      "epoch": 0.026267775413279435,
+      "grad_norm": 0.0033867203164845705,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 952
+    },
+    {
+      "epoch": 0.026295367614343805,
+      "grad_norm": 0.005016230046749115,
+      "learning_rate": 0.001,
+      "loss": 0.4041,
+      "step": 953
+    },
+    {
+      "epoch": 0.026322959815408175,
+      "grad_norm": 0.003560574259608984,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 954
+    },
+    {
+      "epoch": 0.026350552016472545,
+      "grad_norm": 0.006354731973260641,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 955
+    },
+    {
+      "epoch": 0.026378144217536915,
+      "grad_norm": 0.0036503588780760765,
+      "learning_rate": 0.001,
+      "loss": 0.4549,
+      "step": 956
+    },
+    {
+      "epoch": 0.02640573641860128,
+      "grad_norm": 0.0061042881570756435,
+      "learning_rate": 0.001,
+      "loss": 0.419,
+      "step": 957
+    },
+    {
+      "epoch": 0.02643332861966565,
+      "grad_norm": 0.0033794385381042957,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 958
+    },
+    {
+      "epoch": 0.02646092082073002,
+      "grad_norm": 0.00359528511762619,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 959
+    },
+    {
+      "epoch": 0.02648851302179439,
+      "grad_norm": 0.0028623088728636503,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 960
+    },
+    {
+      "epoch": 0.026516105222858757,
+      "grad_norm": 0.0029379641637206078,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 961
+    },
+    {
+      "epoch": 0.026543697423923127,
+      "grad_norm": 0.0034645479172468185,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 962
+    },
+    {
+      "epoch": 0.026571289624987497,
+      "grad_norm": 0.0062490347772836685,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 963
+    },
+    {
+      "epoch": 0.026598881826051867,
+      "grad_norm": 0.010679258033633232,
+      "learning_rate": 0.001,
+      "loss": 0.362,
+      "step": 964
+    },
+    {
+      "epoch": 0.026626474027116237,
+      "grad_norm": 0.003056140150874853,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 965
+    },
+    {
+      "epoch": 0.026654066228180603,
+      "grad_norm": 0.0043676625937223434,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 966
+    },
+    {
+      "epoch": 0.026681658429244973,
+      "grad_norm": 0.004542070906609297,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 967
+    },
+    {
+      "epoch": 0.026709250630309343,
+      "grad_norm": 0.003998064436018467,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 968
+    },
+    {
+      "epoch": 0.026736842831373713,
+      "grad_norm": 0.00380288646556437,
+      "learning_rate": 0.001,
+      "loss": 0.4274,
+      "step": 969
+    },
+    {
+      "epoch": 0.026764435032438083,
+      "grad_norm": 0.005197952967137098,
+      "learning_rate": 0.001,
+      "loss": 0.37,
+      "step": 970
+    },
+    {
+      "epoch": 0.02679202723350245,
+      "grad_norm": 0.0045735882595181465,
+      "learning_rate": 0.001,
+      "loss": 0.3623,
+      "step": 971
+    },
+    {
+      "epoch": 0.02681961943456682,
+      "grad_norm": 0.013644758611917496,
+      "learning_rate": 0.001,
+      "loss": 0.3709,
+      "step": 972
+    },
+    {
+      "epoch": 0.02684721163563119,
+      "grad_norm": 0.0032162275165319443,
+      "learning_rate": 0.001,
+      "loss": 0.4419,
+      "step": 973
+    },
+    {
+      "epoch": 0.02687480383669556,
+      "grad_norm": 0.003487576497718692,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 974
+    },
+    {
+      "epoch": 0.026902396037759926,
+      "grad_norm": 0.0028352616354823112,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 975
+    },
+    {
+      "epoch": 0.026929988238824296,
+      "grad_norm": 0.0032060358207672834,
+      "learning_rate": 0.001,
+      "loss": 0.4277,
+      "step": 976
+    },
+    {
+      "epoch": 0.026957580439888666,
+      "grad_norm": 0.004065185319632292,
+      "learning_rate": 0.001,
+      "loss": 0.4403,
+      "step": 977
+    },
+    {
+      "epoch": 0.026985172640953035,
+      "grad_norm": 0.0032863083761185408,
+      "learning_rate": 0.001,
+      "loss": 0.4257,
+      "step": 978
+    },
+    {
+      "epoch": 0.027012764842017405,
+      "grad_norm": 0.0028903197962790728,
+      "learning_rate": 0.001,
+      "loss": 0.4371,
+      "step": 979
+    },
+    {
+      "epoch": 0.027040357043081772,
+      "grad_norm": 0.0034798365086317062,
+      "learning_rate": 0.001,
+      "loss": 0.4378,
+      "step": 980
+    },
+    {
+      "epoch": 0.02706794924414614,
+      "grad_norm": 0.0034869504161179066,
+      "learning_rate": 0.001,
+      "loss": 0.451,
+      "step": 981
+    },
+    {
+      "epoch": 0.02709554144521051,
+      "grad_norm": 0.003647441975772381,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 982
+    },
+    {
+      "epoch": 0.02712313364627488,
+      "grad_norm": 0.003412257879972458,
+      "learning_rate": 0.001,
+      "loss": 0.427,
+      "step": 983
+    },
+    {
+      "epoch": 0.027150725847339248,
+      "grad_norm": 0.0046013942919671535,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 984
+    },
+    {
+      "epoch": 0.027178318048403618,
+      "grad_norm": 0.00263599562458694,
+      "learning_rate": 0.001,
+      "loss": 0.438,
+      "step": 985
+    },
+    {
+      "epoch": 0.027205910249467988,
+      "grad_norm": 0.00280582788400352,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 986
+    },
+    {
+      "epoch": 0.027233502450532358,
+      "grad_norm": 0.004769660532474518,
+      "learning_rate": 0.001,
+      "loss": 0.3607,
+      "step": 987
+    },
+    {
+      "epoch": 0.027261094651596728,
+      "grad_norm": 0.004607087001204491,
+      "learning_rate": 0.001,
+      "loss": 0.4218,
+      "step": 988
+    },
+    {
+      "epoch": 0.027288686852661094,
+      "grad_norm": 0.003465674351900816,
+      "learning_rate": 0.001,
+      "loss": 0.4187,
+      "step": 989
+    },
+    {
+      "epoch": 0.027316279053725464,
+      "grad_norm": 0.003429507138207555,
+      "learning_rate": 0.001,
+      "loss": 0.3691,
+      "step": 990
+    },
+    {
+      "epoch": 0.027343871254789834,
+      "grad_norm": 0.004496248438954353,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 991
+    },
+    {
+      "epoch": 0.027371463455854204,
+      "grad_norm": 0.0036759956274181604,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 992
+    },
+    {
+      "epoch": 0.027399055656918574,
+      "grad_norm": 0.003747584531083703,
+      "learning_rate": 0.001,
+      "loss": 0.43,
+      "step": 993
+    },
+    {
+      "epoch": 0.02742664785798294,
+      "grad_norm": 0.0032122142147272825,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 994
+    },
+    {
+      "epoch": 0.02745424005904731,
+      "grad_norm": 0.0035362818744033575,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 995
+    },
+    {
+      "epoch": 0.02748183226011168,
+      "grad_norm": 0.002750742482021451,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 996
+    },
+    {
+      "epoch": 0.02750942446117605,
+      "grad_norm": 0.004634341225028038,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 997
+    },
+    {
+      "epoch": 0.027537016662240416,
+      "grad_norm": 0.0034089027903974056,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 998
+    },
+    {
+      "epoch": 0.027564608863304786,
+      "grad_norm": 0.008999533019959927,
+      "learning_rate": 0.001,
+      "loss": 0.3786,
+      "step": 999
+    },
+    {
+      "epoch": 0.027592201064369156,
+      "grad_norm": 0.007176951505243778,
+      "learning_rate": 0.001,
+      "loss": 0.3667,
+      "step": 1000
+    },
+    {
+      "epoch": 0.027592201064369156,
+      "eval_runtime": 23.9716,
+      "eval_samples_per_second": 1.335,
+      "eval_steps_per_second": 0.167,
+      "step": 1000
+    },
+    {
+      "epoch": 0.027619793265433526,
+      "grad_norm": 0.0036988353822380304,
+      "learning_rate": 0.001,
+      "loss": 0.3749,
+      "step": 1001
+    },
+    {
+      "epoch": 0.027647385466497896,
+      "grad_norm": 0.0030345297418534756,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 1002
+    },
+    {
+      "epoch": 0.027674977667562262,
+      "grad_norm": 0.003801414743065834,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 1003
+    },
+    {
+      "epoch": 0.027702569868626632,
+      "grad_norm": 0.002585576381534338,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 1004
+    },
+    {
+      "epoch": 0.027730162069691002,
+      "grad_norm": 0.002826697425916791,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 1005
+    },
+    {
+      "epoch": 0.027757754270755372,
+      "grad_norm": 0.0023764553479850292,
+      "learning_rate": 0.001,
+      "loss": 0.4434,
+      "step": 1006
+    },
+    {
+      "epoch": 0.02778534647181974,
+      "grad_norm": 0.002361831720918417,
+      "learning_rate": 0.001,
+      "loss": 0.4271,
+      "step": 1007
+    },
+    {
+      "epoch": 0.02781293867288411,
+      "grad_norm": 0.0033377348445355892,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 1008
+    },
+    {
+      "epoch": 0.02784053087394848,
+      "grad_norm": 0.004634097684174776,
+      "learning_rate": 0.001,
+      "loss": 0.421,
+      "step": 1009
+    },
+    {
+      "epoch": 0.027868123075012848,
+      "grad_norm": 0.0033739793580025434,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 1010
+    },
+    {
+      "epoch": 0.027895715276077218,
+      "grad_norm": 0.003304282436147332,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 1011
+    },
+    {
+      "epoch": 0.027923307477141585,
+      "grad_norm": 0.007290132809430361,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 1012
+    },
+    {
+      "epoch": 0.027950899678205954,
+      "grad_norm": 0.008183951489627361,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 1013
+    },
+    {
+      "epoch": 0.027978491879270324,
+      "grad_norm": 0.012678315863013268,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 1014
+    },
+    {
+      "epoch": 0.028006084080334694,
+      "grad_norm": 0.00686604343354702,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 1015
+    },
+    {
+      "epoch": 0.028033676281399064,
+      "grad_norm": 0.004761406686156988,
+      "learning_rate": 0.001,
+      "loss": 0.4203,
+      "step": 1016
+    },
+    {
+      "epoch": 0.02806126848246343,
+      "grad_norm": 0.004743502475321293,
+      "learning_rate": 0.001,
+      "loss": 0.3745,
+      "step": 1017
+    },
+    {
+      "epoch": 0.0280888606835278,
+      "grad_norm": 0.003992531448602676,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 1018
+    },
+    {
+      "epoch": 0.02811645288459217,
+      "grad_norm": 0.0027658091858029366,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 1019
+    },
+    {
+      "epoch": 0.02814404508565654,
+      "grad_norm": 0.002945561660453677,
+      "learning_rate": 0.001,
+      "loss": 0.3572,
+      "step": 1020
+    },
+    {
+      "epoch": 0.028171637286720907,
+      "grad_norm": 0.004641372710466385,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 1021
+    },
+    {
+      "epoch": 0.028199229487785277,
+      "grad_norm": 0.0027934517711400986,
+      "learning_rate": 0.001,
+      "loss": 0.4321,
+      "step": 1022
+    },
+    {
+      "epoch": 0.028226821688849647,
+      "grad_norm": 0.0028974004089832306,
+      "learning_rate": 0.001,
+      "loss": 0.417,
+      "step": 1023
+    },
+    {
+      "epoch": 0.028254413889914017,
+      "grad_norm": 0.0028739396948367357,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 1024
+    },
+    {
+      "epoch": 0.028282006090978386,
+      "grad_norm": 0.0029768026433885098,
+      "learning_rate": 0.001,
+      "loss": 0.4339,
+      "step": 1025
+    },
+    {
+      "epoch": 0.028309598292042753,
+      "grad_norm": 0.0032322900369763374,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 1026
+    },
+    {
+      "epoch": 0.028337190493107123,
+      "grad_norm": 0.0037773947697132826,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 1027
+    },
+    {
+      "epoch": 0.028364782694171493,
+      "grad_norm": 0.0037146552931517363,
+      "learning_rate": 0.001,
+      "loss": 0.3743,
+      "step": 1028
+    },
+    {
+      "epoch": 0.028392374895235863,
+      "grad_norm": 0.003289048792794347,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 1029
+    },
+    {
+      "epoch": 0.02841996709630023,
+      "grad_norm": 0.003675120184198022,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 1030
+    },
+    {
+      "epoch": 0.0284475592973646,
+      "grad_norm": 0.002748252125456929,
+      "learning_rate": 0.001,
+      "loss": 0.457,
+      "step": 1031
+    },
+    {
+      "epoch": 0.02847515149842897,
+      "grad_norm": 0.0026871277950704098,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 1032
+    },
+    {
+      "epoch": 0.02850274369949334,
+      "grad_norm": 0.0031060713808983564,
+      "learning_rate": 0.001,
+      "loss": 0.4268,
+      "step": 1033
+    },
+    {
+      "epoch": 0.02853033590055771,
+      "grad_norm": 0.0029715097043663263,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 1034
+    },
+    {
+      "epoch": 0.028557928101622075,
+      "grad_norm": 0.004726918879896402,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 1035
+    },
+    {
+      "epoch": 0.028585520302686445,
+      "grad_norm": 0.0036529232747852802,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 1036
+    },
+    {
+      "epoch": 0.028613112503750815,
+      "grad_norm": 0.003458012593910098,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 1037
+    },
+    {
+      "epoch": 0.028640704704815185,
+      "grad_norm": 0.0026079469826072454,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 1038
+    },
+    {
+      "epoch": 0.028668296905879555,
+      "grad_norm": 0.004490693099796772,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 1039
+    },
+    {
+      "epoch": 0.02869588910694392,
+      "grad_norm": 0.0030477193649858236,
+      "learning_rate": 0.001,
+      "loss": 0.3832,
+      "step": 1040
+    },
+    {
+      "epoch": 0.02872348130800829,
+      "grad_norm": 0.0033396866638213396,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 1041
+    },
+    {
+      "epoch": 0.02875107350907266,
+      "grad_norm": 0.0030391959007829428,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 1042
+    },
+    {
+      "epoch": 0.02877866571013703,
+      "grad_norm": 0.0035509227309376,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 1043
+    },
+    {
+      "epoch": 0.028806257911201397,
+      "grad_norm": 0.0039217835292220116,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 1044
+    },
+    {
+      "epoch": 0.028833850112265767,
+      "grad_norm": 0.007786846719682217,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 1045
+    },
+    {
+      "epoch": 0.028861442313330137,
+      "grad_norm": 0.0030108222272247076,
+      "learning_rate": 0.001,
+      "loss": 0.4365,
+      "step": 1046
+    },
+    {
+      "epoch": 0.028889034514394507,
+      "grad_norm": 0.0058325594291090965,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 1047
+    },
+    {
+      "epoch": 0.028916626715458877,
+      "grad_norm": 0.0032810927368700504,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 1048
+    },
+    {
+      "epoch": 0.028944218916523243,
+      "grad_norm": 0.0035062895622104406,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 1049
+    },
+    {
+      "epoch": 0.028971811117587613,
+      "grad_norm": 0.003582380712032318,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 1050
+    },
+    {
+      "epoch": 0.028999403318651983,
+      "grad_norm": 0.003514527576044202,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 1051
+    },
+    {
+      "epoch": 0.029026995519716353,
+      "grad_norm": 0.003413026686757803,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 1052
+    },
+    {
+      "epoch": 0.02905458772078072,
+      "grad_norm": 0.0038671360816806555,
+      "learning_rate": 0.001,
+      "loss": 0.4344,
+      "step": 1053
+    },
+    {
+      "epoch": 0.02908217992184509,
+      "grad_norm": 0.004694768693298101,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 1054
+    },
+    {
+      "epoch": 0.02910977212290946,
+      "grad_norm": 0.004935418255627155,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 1055
+    },
+    {
+      "epoch": 0.02913736432397383,
+      "grad_norm": 0.0034861708991229534,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 1056
+    },
+    {
+      "epoch": 0.0291649565250382,
+      "grad_norm": 0.00353567604906857,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 1057
+    },
+    {
+      "epoch": 0.029192548726102566,
+      "grad_norm": 0.0043948497623205185,
+      "learning_rate": 0.001,
+      "loss": 0.4041,
+      "step": 1058
+    },
+    {
+      "epoch": 0.029220140927166936,
+      "grad_norm": 0.00446993438526988,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 1059
+    },
+    {
+      "epoch": 0.029247733128231305,
+      "grad_norm": 0.003901879768818617,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 1060
+    },
+    {
+      "epoch": 0.029275325329295675,
+      "grad_norm": 0.004764191340655088,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 1061
+    },
+    {
+      "epoch": 0.029302917530360045,
+      "grad_norm": 0.0031485301442444324,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 1062
+    },
+    {
+      "epoch": 0.029330509731424412,
+      "grad_norm": 0.004180070944130421,
+      "learning_rate": 0.001,
+      "loss": 0.3713,
+      "step": 1063
+    },
+    {
+      "epoch": 0.02935810193248878,
+      "grad_norm": 0.005659409333020449,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 1064
+    },
+    {
+      "epoch": 0.02938569413355315,
+      "grad_norm": 0.0038438751362264156,
+      "learning_rate": 0.001,
+      "loss": 0.4278,
+      "step": 1065
+    },
+    {
+      "epoch": 0.02941328633461752,
+      "grad_norm": 0.00400773249566555,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 1066
+    },
+    {
+      "epoch": 0.029440878535681888,
+      "grad_norm": 0.005502818617969751,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 1067
+    },
+    {
+      "epoch": 0.029468470736746258,
+      "grad_norm": 0.003952248953282833,
+      "learning_rate": 0.001,
+      "loss": 0.4407,
+      "step": 1068
+    },
+    {
+      "epoch": 0.029496062937810628,
+      "grad_norm": 0.011100285686552525,
+      "learning_rate": 0.001,
+      "loss": 0.3699,
+      "step": 1069
+    },
+    {
+      "epoch": 0.029523655138874998,
+      "grad_norm": 0.00686876242980361,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 1070
+    },
+    {
+      "epoch": 0.029551247339939368,
+      "grad_norm": 0.007713994476944208,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 1071
+    },
+    {
+      "epoch": 0.029578839541003734,
+      "grad_norm": 0.009867096319794655,
+      "learning_rate": 0.001,
+      "loss": 0.3383,
+      "step": 1072
+    },
+    {
+      "epoch": 0.029606431742068104,
+      "grad_norm": 0.004925290122628212,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 1073
+    },
+    {
+      "epoch": 0.029634023943132474,
+      "grad_norm": 0.004095774609595537,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 1074
+    },
+    {
+      "epoch": 0.029661616144196844,
+      "grad_norm": 0.0056510199792683125,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 1075
+    },
+    {
+      "epoch": 0.029689208345261214,
+      "grad_norm": 0.003869240405037999,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 1076
+    },
+    {
+      "epoch": 0.02971680054632558,
+      "grad_norm": 0.0034619278740137815,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 1077
+    },
+    {
+      "epoch": 0.02974439274738995,
+      "grad_norm": 0.003166050184518099,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 1078
+    },
+    {
+      "epoch": 0.02977198494845432,
+      "grad_norm": 0.004651426337659359,
+      "learning_rate": 0.001,
+      "loss": 0.4369,
+      "step": 1079
+    },
+    {
+      "epoch": 0.02979957714951869,
+      "grad_norm": 0.00306075275875628,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 1080
+    },
+    {
+      "epoch": 0.029827169350583056,
+      "grad_norm": 0.0034789969213306904,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 1081
+    },
+    {
+      "epoch": 0.029854761551647426,
+      "grad_norm": 0.004120729863643646,
+      "learning_rate": 0.001,
+      "loss": 0.3697,
+      "step": 1082
+    },
+    {
+      "epoch": 0.029882353752711796,
+      "grad_norm": 0.005171756725758314,
+      "learning_rate": 0.001,
+      "loss": 0.4336,
+      "step": 1083
+    },
+    {
+      "epoch": 0.029909945953776166,
+      "grad_norm": 0.00312454323284328,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 1084
+    },
+    {
+      "epoch": 0.029937538154840536,
+      "grad_norm": 0.003719372907653451,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 1085
+    },
+    {
+      "epoch": 0.029965130355904902,
+      "grad_norm": 0.003698839107528329,
+      "learning_rate": 0.001,
+      "loss": 0.3745,
+      "step": 1086
+    },
+    {
+      "epoch": 0.029992722556969272,
+      "grad_norm": 0.0035393829457461834,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 1087
+    },
+    {
+      "epoch": 0.030020314758033642,
+      "grad_norm": 0.004070476163178682,
+      "learning_rate": 0.001,
+      "loss": 0.3524,
+      "step": 1088
+    },
+    {
+      "epoch": 0.030047906959098012,
+      "grad_norm": 0.004007712937891483,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 1089
+    },
+    {
+      "epoch": 0.03007549916016238,
+      "grad_norm": 0.0035324685741215944,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 1090
+    },
+    {
+      "epoch": 0.03010309136122675,
+      "grad_norm": 0.007462177891284227,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 1091
+    },
+    {
+      "epoch": 0.03013068356229112,
+      "grad_norm": 0.003764525754377246,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 1092
+    },
+    {
+      "epoch": 0.030158275763355488,
+      "grad_norm": 0.0034473277628421783,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 1093
+    },
+    {
+      "epoch": 0.030185867964419858,
+      "grad_norm": 0.0025921028573065996,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 1094
+    },
+    {
+      "epoch": 0.030213460165484225,
+      "grad_norm": 0.004729332402348518,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 1095
+    },
+    {
+      "epoch": 0.030241052366548594,
+      "grad_norm": 0.0037650500889867544,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 1096
+    },
+    {
+      "epoch": 0.030268644567612964,
+      "grad_norm": 0.0038791224360466003,
+      "learning_rate": 0.001,
+      "loss": 0.4233,
+      "step": 1097
+    },
+    {
+      "epoch": 0.030296236768677334,
+      "grad_norm": 0.004454473964869976,
+      "learning_rate": 0.001,
+      "loss": 0.3714,
+      "step": 1098
+    },
+    {
+      "epoch": 0.030323828969741704,
+      "grad_norm": 0.0029884730465710163,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 1099
+    },
+    {
+      "epoch": 0.03035142117080607,
+      "grad_norm": 0.0049886396154761314,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 1100
+    },
+    {
+      "epoch": 0.03037901337187044,
+      "grad_norm": 0.0025764929596334696,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 1101
+    },
+    {
+      "epoch": 0.03040660557293481,
+      "grad_norm": 0.0029421220533549786,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 1102
+    },
+    {
+      "epoch": 0.03043419777399918,
+      "grad_norm": 0.014296631328761578,
+      "learning_rate": 0.001,
+      "loss": 0.3668,
+      "step": 1103
+    },
+    {
+      "epoch": 0.030461789975063547,
+      "grad_norm": 0.0035509790759533644,
+      "learning_rate": 0.001,
+      "loss": 0.3593,
+      "step": 1104
+    },
+    {
+      "epoch": 0.030489382176127917,
+      "grad_norm": 0.0033370009623467922,
+      "learning_rate": 0.001,
+      "loss": 0.4534,
+      "step": 1105
+    },
+    {
+      "epoch": 0.030516974377192287,
+      "grad_norm": 0.004134595859795809,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 1106
+    },
+    {
+      "epoch": 0.030544566578256657,
+      "grad_norm": 0.005164284259080887,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 1107
+    },
+    {
+      "epoch": 0.030572158779321026,
+      "grad_norm": 0.004688777029514313,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 1108
+    },
+    {
+      "epoch": 0.030599750980385393,
+      "grad_norm": 0.0034766956232488155,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 1109
+    },
+    {
+      "epoch": 0.030627343181449763,
+      "grad_norm": 0.0031806135084480047,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 1110
+    },
+    {
+      "epoch": 0.030654935382514133,
+      "grad_norm": 0.0031535644084215164,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 1111
+    },
+    {
+      "epoch": 0.030682527583578503,
+      "grad_norm": 0.004275255836546421,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 1112
+    },
+    {
+      "epoch": 0.03071011978464287,
+      "grad_norm": 0.002956011099740863,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 1113
+    },
+    {
+      "epoch": 0.03073771198570724,
+      "grad_norm": 0.002727919491007924,
+      "learning_rate": 0.001,
+      "loss": 0.4248,
+      "step": 1114
+    },
+    {
+      "epoch": 0.03076530418677161,
+      "grad_norm": 0.0037477565929293633,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 1115
+    },
+    {
+      "epoch": 0.03079289638783598,
+      "grad_norm": 0.002299915300682187,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 1116
+    },
+    {
+      "epoch": 0.03082048858890035,
+      "grad_norm": 0.0031318028923124075,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 1117
+    },
+    {
+      "epoch": 0.030848080789964715,
+      "grad_norm": 0.004523344803601503,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 1118
+    },
+    {
+      "epoch": 0.030875672991029085,
+      "grad_norm": 0.0020371493883430958,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 1119
+    },
+    {
+      "epoch": 0.030903265192093455,
+      "grad_norm": 0.0022572767920792103,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 1120
+    },
+    {
+      "epoch": 0.030930857393157825,
+      "grad_norm": 0.0032437213230878115,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 1121
+    },
+    {
+      "epoch": 0.030958449594222195,
+      "grad_norm": 0.004357442259788513,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 1122
+    },
+    {
+      "epoch": 0.03098604179528656,
+      "grad_norm": 0.0027816223446279764,
+      "learning_rate": 0.001,
+      "loss": 0.3732,
+      "step": 1123
+    },
+    {
+      "epoch": 0.03101363399635093,
+      "grad_norm": 0.002367631997913122,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 1124
+    },
+    {
+      "epoch": 0.0310412261974153,
+      "grad_norm": 0.007127678487449884,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 1125
+    },
+    {
+      "epoch": 0.03106881839847967,
+      "grad_norm": 0.003470206633210182,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 1126
+    },
+    {
+      "epoch": 0.031096410599544037,
+      "grad_norm": 0.003238279139623046,
+      "learning_rate": 0.001,
+      "loss": 0.4325,
+      "step": 1127
+    },
+    {
+      "epoch": 0.031124002800608407,
+      "grad_norm": 0.0037913359701633453,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 1128
+    },
+    {
+      "epoch": 0.031151595001672777,
+      "grad_norm": 0.006213251501321793,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 1129
+    },
+    {
+      "epoch": 0.031179187202737147,
+      "grad_norm": 0.004787992220371962,
+      "learning_rate": 0.001,
+      "loss": 0.3691,
+      "step": 1130
+    },
+    {
+      "epoch": 0.031206779403801517,
+      "grad_norm": 0.0029551470652222633,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 1131
+    },
+    {
+      "epoch": 0.031234371604865883,
+      "grad_norm": 0.0033430701587349176,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 1132
+    },
+    {
+      "epoch": 0.03126196380593026,
+      "grad_norm": 0.0027807389851659536,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 1133
+    },
+    {
+      "epoch": 0.03128955600699462,
+      "grad_norm": 0.003867300460115075,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 1134
+    },
+    {
+      "epoch": 0.03131714820805899,
+      "grad_norm": 0.0034717791713774204,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 1135
+    },
+    {
+      "epoch": 0.03134474040912336,
+      "grad_norm": 0.0047624786384403706,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 1136
+    },
+    {
+      "epoch": 0.03137233261018773,
+      "grad_norm": 0.0029204641468822956,
+      "learning_rate": 0.001,
+      "loss": 0.4426,
+      "step": 1137
+    },
+    {
+      "epoch": 0.0313999248112521,
+      "grad_norm": 0.0029500045347958803,
+      "learning_rate": 0.001,
+      "loss": 0.4429,
+      "step": 1138
+    },
+    {
+      "epoch": 0.03142751701231647,
+      "grad_norm": 0.0030622980557382107,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 1139
+    },
+    {
+      "epoch": 0.031455109213380836,
+      "grad_norm": 0.006081267725676298,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 1140
+    },
+    {
+      "epoch": 0.03148270141444521,
+      "grad_norm": 0.0026580330450087786,
+      "learning_rate": 0.001,
+      "loss": 0.4361,
+      "step": 1141
+    },
+    {
+      "epoch": 0.031510293615509576,
+      "grad_norm": 0.0036745185498148203,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 1142
+    },
+    {
+      "epoch": 0.03153788581657394,
+      "grad_norm": 0.0035352655686438084,
+      "learning_rate": 0.001,
+      "loss": 0.3758,
+      "step": 1143
+    },
+    {
+      "epoch": 0.031565478017638315,
+      "grad_norm": 0.005509037058800459,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 1144
+    },
+    {
+      "epoch": 0.03159307021870268,
+      "grad_norm": 0.0026996792294085026,
+      "learning_rate": 0.001,
+      "loss": 0.4276,
+      "step": 1145
+    },
+    {
+      "epoch": 0.031620662419767055,
+      "grad_norm": 0.0030703977681696415,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 1146
+    },
+    {
+      "epoch": 0.03164825462083142,
+      "grad_norm": 0.004798520356416702,
+      "learning_rate": 0.001,
+      "loss": 0.3623,
+      "step": 1147
+    },
+    {
+      "epoch": 0.03167584682189579,
+      "grad_norm": 0.0030252067372202873,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 1148
+    },
+    {
+      "epoch": 0.03170343902296016,
+      "grad_norm": 0.0031654182821512222,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 1149
+    },
+    {
+      "epoch": 0.03173103122402453,
+      "grad_norm": 0.005452923942357302,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 1150
+    },
+    {
+      "epoch": 0.0317586234250889,
+      "grad_norm": 0.004767664708197117,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 1151
+    },
+    {
+      "epoch": 0.03178621562615327,
+      "grad_norm": 0.0034988594707101583,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 1152
+    },
+    {
+      "epoch": 0.031813807827217634,
+      "grad_norm": 0.0034391777589917183,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 1153
+    },
+    {
+      "epoch": 0.03184140002828201,
+      "grad_norm": 0.003413598518818617,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 1154
+    },
+    {
+      "epoch": 0.031868992229346374,
+      "grad_norm": 0.00447838706895709,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 1155
+    },
+    {
+      "epoch": 0.03189658443041075,
+      "grad_norm": 0.002987177576869726,
+      "learning_rate": 0.001,
+      "loss": 0.385,
+      "step": 1156
+    },
+    {
+      "epoch": 0.031924176631475114,
+      "grad_norm": 0.002857605693861842,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 1157
+    },
+    {
+      "epoch": 0.03195176883253948,
+      "grad_norm": 0.0050466121174395084,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 1158
+    },
+    {
+      "epoch": 0.031979361033603854,
+      "grad_norm": 0.003727944800630212,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 1159
+    },
+    {
+      "epoch": 0.03200695323466822,
+      "grad_norm": 0.003613363951444626,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 1160
+    },
+    {
+      "epoch": 0.03203454543573259,
+      "grad_norm": 0.003519849618896842,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 1161
+    },
+    {
+      "epoch": 0.03206213763679696,
+      "grad_norm": 0.005940241273492575,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 1162
+    },
+    {
+      "epoch": 0.032089729837861326,
+      "grad_norm": 0.0036697194445878267,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 1163
+    },
+    {
+      "epoch": 0.0321173220389257,
+      "grad_norm": 0.0032535314094275236,
+      "learning_rate": 0.001,
+      "loss": 0.3744,
+      "step": 1164
+    },
+    {
+      "epoch": 0.032144914239990066,
+      "grad_norm": 0.0032929456792771816,
+      "learning_rate": 0.001,
+      "loss": 0.435,
+      "step": 1165
+    },
+    {
+      "epoch": 0.03217250644105443,
+      "grad_norm": 0.0031699042301625013,
+      "learning_rate": 0.001,
+      "loss": 0.3564,
+      "step": 1166
+    },
+    {
+      "epoch": 0.032200098642118806,
+      "grad_norm": 0.004080107901245356,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 1167
+    },
+    {
+      "epoch": 0.03222769084318317,
+      "grad_norm": 0.0031074616126716137,
+      "learning_rate": 0.001,
+      "loss": 0.4407,
+      "step": 1168
+    },
+    {
+      "epoch": 0.032255283044247546,
+      "grad_norm": 0.004778381437063217,
+      "learning_rate": 0.001,
+      "loss": 0.4701,
+      "step": 1169
+    },
+    {
+      "epoch": 0.03228287524531191,
+      "grad_norm": 0.004103434272110462,
+      "learning_rate": 0.001,
+      "loss": 0.427,
+      "step": 1170
+    },
+    {
+      "epoch": 0.03231046744637628,
+      "grad_norm": 0.0030324216932058334,
+      "learning_rate": 0.001,
+      "loss": 0.4087,
+      "step": 1171
+    },
+    {
+      "epoch": 0.03233805964744065,
+      "grad_norm": 0.0036339950747787952,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 1172
+    },
+    {
+      "epoch": 0.03236565184850502,
+      "grad_norm": 0.008613920770585537,
+      "learning_rate": 0.001,
+      "loss": 0.4442,
+      "step": 1173
+    },
+    {
+      "epoch": 0.03239324404956939,
+      "grad_norm": 0.014312573708593845,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 1174
+    },
+    {
+      "epoch": 0.03242083625063376,
+      "grad_norm": 0.00535425404086709,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 1175
+    },
+    {
+      "epoch": 0.032448428451698125,
+      "grad_norm": 0.007150169927626848,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 1176
+    },
+    {
+      "epoch": 0.0324760206527625,
+      "grad_norm": 0.0030312181916087866,
+      "learning_rate": 0.001,
+      "loss": 0.4539,
+      "step": 1177
+    },
+    {
+      "epoch": 0.032503612853826865,
+      "grad_norm": 0.003885834477841854,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 1178
+    },
+    {
+      "epoch": 0.03253120505489124,
+      "grad_norm": 0.0037573217414319515,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 1179
+    },
+    {
+      "epoch": 0.032558797255955604,
+      "grad_norm": 0.015517042018473148,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 1180
+    },
+    {
+      "epoch": 0.03258638945701997,
+      "grad_norm": 0.004123013466596603,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 1181
+    },
+    {
+      "epoch": 0.032613981658084344,
+      "grad_norm": 0.00358961196616292,
+      "learning_rate": 0.001,
+      "loss": 0.4251,
+      "step": 1182
+    },
+    {
+      "epoch": 0.03264157385914871,
+      "grad_norm": 0.002588262315839529,
+      "learning_rate": 0.001,
+      "loss": 0.4649,
+      "step": 1183
+    },
+    {
+      "epoch": 0.032669166060213084,
+      "grad_norm": 0.0033034030348062515,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 1184
+    },
+    {
+      "epoch": 0.03269675826127745,
+      "grad_norm": 0.0026907999999821186,
+      "learning_rate": 0.001,
+      "loss": 0.435,
+      "step": 1185
+    },
+    {
+      "epoch": 0.03272435046234182,
+      "grad_norm": 0.0029784864746034145,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 1186
+    },
+    {
+      "epoch": 0.03275194266340619,
+      "grad_norm": 0.004101334605365992,
+      "learning_rate": 0.001,
+      "loss": 0.4322,
+      "step": 1187
+    },
+    {
+      "epoch": 0.03277953486447056,
+      "grad_norm": 0.008756603114306927,
+      "learning_rate": 0.001,
+      "loss": 0.3735,
+      "step": 1188
+    },
+    {
+      "epoch": 0.03280712706553492,
+      "grad_norm": 0.005191429052501917,
+      "learning_rate": 0.001,
+      "loss": 0.4222,
+      "step": 1189
+    },
+    {
+      "epoch": 0.032834719266599297,
+      "grad_norm": 0.00770029379054904,
+      "learning_rate": 0.001,
+      "loss": 0.3631,
+      "step": 1190
+    },
+    {
+      "epoch": 0.03286231146766366,
+      "grad_norm": 0.005141628440469503,
+      "learning_rate": 0.001,
+      "loss": 0.4402,
+      "step": 1191
+    },
+    {
+      "epoch": 0.032889903668728036,
+      "grad_norm": 0.004422733094543219,
+      "learning_rate": 0.001,
+      "loss": 0.3201,
+      "step": 1192
+    },
+    {
+      "epoch": 0.0329174958697924,
+      "grad_norm": 0.0060617136768996716,
+      "learning_rate": 0.001,
+      "loss": 0.4299,
+      "step": 1193
+    },
+    {
+      "epoch": 0.03294508807085677,
+      "grad_norm": 0.007132702972739935,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 1194
+    },
+    {
+      "epoch": 0.03297268027192114,
+      "grad_norm": 0.006239313166588545,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 1195
+    },
+    {
+      "epoch": 0.03300027247298551,
+      "grad_norm": 0.00840328261256218,
+      "learning_rate": 0.001,
+      "loss": 0.4236,
+      "step": 1196
+    },
+    {
+      "epoch": 0.03302786467404988,
+      "grad_norm": 0.004313162062317133,
+      "learning_rate": 0.001,
+      "loss": 0.3821,
+      "step": 1197
+    },
+    {
+      "epoch": 0.03305545687511425,
+      "grad_norm": 0.004403871949762106,
+      "learning_rate": 0.001,
+      "loss": 0.4372,
+      "step": 1198
+    },
+    {
+      "epoch": 0.033083049076178615,
+      "grad_norm": 0.005073420237749815,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 1199
+    },
+    {
+      "epoch": 0.03311064127724299,
+      "grad_norm": 0.00479225255548954,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 1200
+    },
+    {
+      "epoch": 0.033138233478307355,
+      "grad_norm": 0.003564857877790928,
+      "learning_rate": 0.001,
+      "loss": 0.4359,
+      "step": 1201
+    },
+    {
+      "epoch": 0.03316582567937173,
+      "grad_norm": 0.004907668102532625,
+      "learning_rate": 0.001,
+      "loss": 0.3734,
+      "step": 1202
+    },
+    {
+      "epoch": 0.033193417880436095,
+      "grad_norm": 0.003451892174780369,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 1203
+    },
+    {
+      "epoch": 0.03322101008150046,
+      "grad_norm": 0.0029975506477057934,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 1204
+    },
+    {
+      "epoch": 0.033248602282564835,
+      "grad_norm": 0.005138040985912085,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 1205
+    },
+    {
+      "epoch": 0.0332761944836292,
+      "grad_norm": 0.004696134477853775,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 1206
+    },
+    {
+      "epoch": 0.033303786684693575,
+      "grad_norm": 0.005408620461821556,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 1207
+    },
+    {
+      "epoch": 0.03333137888575794,
+      "grad_norm": 0.004990086425095797,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 1208
+    },
+    {
+      "epoch": 0.03335897108682231,
+      "grad_norm": 0.003546757623553276,
+      "learning_rate": 0.001,
+      "loss": 0.4335,
+      "step": 1209
+    },
+    {
+      "epoch": 0.03338656328788668,
+      "grad_norm": 0.003440044354647398,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 1210
+    },
+    {
+      "epoch": 0.03341415548895105,
+      "grad_norm": 0.005536787211894989,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 1211
+    },
+    {
+      "epoch": 0.033441747690015414,
+      "grad_norm": 0.005742164328694344,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 1212
+    },
+    {
+      "epoch": 0.03346933989107979,
+      "grad_norm": 0.009792082943022251,
+      "learning_rate": 0.001,
+      "loss": 0.3656,
+      "step": 1213
+    },
+    {
+      "epoch": 0.033496932092144154,
+      "grad_norm": 0.004209910985082388,
+      "learning_rate": 0.001,
+      "loss": 0.3601,
+      "step": 1214
+    },
+    {
+      "epoch": 0.03352452429320853,
+      "grad_norm": 0.0035643631126731634,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 1215
+    },
+    {
+      "epoch": 0.03355211649427289,
+      "grad_norm": 0.0032242017332464457,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 1216
+    },
+    {
+      "epoch": 0.03357970869533726,
+      "grad_norm": 0.004148339852690697,
+      "learning_rate": 0.001,
+      "loss": 0.4498,
+      "step": 1217
+    },
+    {
+      "epoch": 0.03360730089640163,
+      "grad_norm": 0.004815380088984966,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 1218
+    },
+    {
+      "epoch": 0.033634893097466,
+      "grad_norm": 0.0036340258084237576,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 1219
+    },
+    {
+      "epoch": 0.03366248529853037,
+      "grad_norm": 0.0031413130927830935,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 1220
+    },
+    {
+      "epoch": 0.03369007749959474,
+      "grad_norm": 0.005375964101403952,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 1221
+    },
+    {
+      "epoch": 0.033717669700659106,
+      "grad_norm": 0.0039574578404426575,
+      "learning_rate": 0.001,
+      "loss": 0.3736,
+      "step": 1222
+    },
+    {
+      "epoch": 0.03374526190172348,
+      "grad_norm": 0.0040151095017790794,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 1223
+    },
+    {
+      "epoch": 0.033772854102787846,
+      "grad_norm": 0.005083186086267233,
+      "learning_rate": 0.001,
+      "loss": 0.411,
+      "step": 1224
+    },
+    {
+      "epoch": 0.03380044630385222,
+      "grad_norm": 0.00605596462264657,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 1225
+    },
+    {
+      "epoch": 0.033828038504916585,
+      "grad_norm": 0.0039448305033147335,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 1226
+    },
+    {
+      "epoch": 0.03385563070598095,
+      "grad_norm": 0.012280398979783058,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 1227
+    },
+    {
+      "epoch": 0.033883222907045325,
+      "grad_norm": 0.049376230686903,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 1228
+    },
+    {
+      "epoch": 0.03391081510810969,
+      "grad_norm": 0.0036013920325785875,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 1229
+    },
+    {
+      "epoch": 0.033938407309174065,
+      "grad_norm": 0.0094405896961689,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 1230
+    },
+    {
+      "epoch": 0.03396599951023843,
+      "grad_norm": 0.007406734395772219,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 1231
+    },
+    {
+      "epoch": 0.0339935917113028,
+      "grad_norm": 0.004268816206604242,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 1232
+    },
+    {
+      "epoch": 0.03402118391236717,
+      "grad_norm": 0.003923129290342331,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 1233
+    },
+    {
+      "epoch": 0.03404877611343154,
+      "grad_norm": 0.003082707989960909,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 1234
+    },
+    {
+      "epoch": 0.03407636831449591,
+      "grad_norm": 0.004590165335685015,
+      "learning_rate": 0.001,
+      "loss": 0.3789,
+      "step": 1235
+    },
+    {
+      "epoch": 0.03410396051556028,
+      "grad_norm": 0.003626961726695299,
+      "learning_rate": 0.001,
+      "loss": 0.4374,
+      "step": 1236
+    },
+    {
+      "epoch": 0.034131552716624644,
+      "grad_norm": 0.003703797934576869,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 1237
+    },
+    {
+      "epoch": 0.03415914491768902,
+      "grad_norm": 0.005130970384925604,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 1238
+    },
+    {
+      "epoch": 0.034186737118753384,
+      "grad_norm": 0.0035557232331484556,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 1239
+    },
+    {
+      "epoch": 0.03421432931981775,
+      "grad_norm": 0.0043634334579110146,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 1240
+    },
+    {
+      "epoch": 0.034241921520882124,
+      "grad_norm": 0.006564748473465443,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 1241
+    },
+    {
+      "epoch": 0.03426951372194649,
+      "grad_norm": 0.0034478590823709965,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 1242
+    },
+    {
+      "epoch": 0.034297105923010864,
+      "grad_norm": 0.003941735252737999,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 1243
+    },
+    {
+      "epoch": 0.03432469812407523,
+      "grad_norm": 0.004107107874006033,
+      "learning_rate": 0.001,
+      "loss": 0.3597,
+      "step": 1244
+    },
+    {
+      "epoch": 0.034352290325139596,
+      "grad_norm": 0.0032025808468461037,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 1245
+    },
+    {
+      "epoch": 0.03437988252620397,
+      "grad_norm": 0.0033102971501648426,
+      "learning_rate": 0.001,
+      "loss": 0.4277,
+      "step": 1246
+    },
+    {
+      "epoch": 0.034407474727268336,
+      "grad_norm": 0.00529972231015563,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 1247
+    },
+    {
+      "epoch": 0.03443506692833271,
+      "grad_norm": 0.004502330906689167,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 1248
+    },
+    {
+      "epoch": 0.034462659129397076,
+      "grad_norm": 0.0027463252190500498,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 1249
+    },
+    {
+      "epoch": 0.03449025133046144,
+      "grad_norm": 0.0033640682231634855,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 1250
+    },
+    {
+      "epoch": 0.034517843531525816,
+      "grad_norm": 0.007285924628376961,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 1251
+    },
+    {
+      "epoch": 0.03454543573259018,
+      "grad_norm": 0.004217895213514566,
+      "learning_rate": 0.001,
+      "loss": 0.4238,
+      "step": 1252
+    },
+    {
+      "epoch": 0.034573027933654556,
+      "grad_norm": 0.0027172528207302094,
+      "learning_rate": 0.001,
+      "loss": 0.3733,
+      "step": 1253
+    },
+    {
+      "epoch": 0.03460062013471892,
+      "grad_norm": 0.0033437691163271666,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 1254
+    },
+    {
+      "epoch": 0.03462821233578329,
+      "grad_norm": 0.003804217092692852,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 1255
+    },
+    {
+      "epoch": 0.03465580453684766,
+      "grad_norm": 0.0027781634125858545,
+      "learning_rate": 0.001,
+      "loss": 0.456,
+      "step": 1256
+    },
+    {
+      "epoch": 0.03468339673791203,
+      "grad_norm": 0.003424674505367875,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 1257
+    },
+    {
+      "epoch": 0.0347109889389764,
+      "grad_norm": 0.003052354324609041,
+      "learning_rate": 0.001,
+      "loss": 0.4331,
+      "step": 1258
+    },
+    {
+      "epoch": 0.03473858114004077,
+      "grad_norm": 0.004982203710824251,
+      "learning_rate": 0.001,
+      "loss": 0.3557,
+      "step": 1259
+    },
+    {
+      "epoch": 0.034766173341105135,
+      "grad_norm": 0.003158049425110221,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 1260
+    },
+    {
+      "epoch": 0.03479376554216951,
+      "grad_norm": 0.0035426870454102755,
+      "learning_rate": 0.001,
+      "loss": 0.3496,
+      "step": 1261
+    },
+    {
+      "epoch": 0.034821357743233874,
+      "grad_norm": 0.0026156532112509012,
+      "learning_rate": 0.001,
+      "loss": 0.4396,
+      "step": 1262
+    },
+    {
+      "epoch": 0.03484894994429824,
+      "grad_norm": 0.0027896466199308634,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 1263
+    },
+    {
+      "epoch": 0.034876542145362614,
+      "grad_norm": 0.002534053521230817,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 1264
+    },
+    {
+      "epoch": 0.03490413434642698,
+      "grad_norm": 0.005910890176892281,
+      "learning_rate": 0.001,
+      "loss": 0.3777,
+      "step": 1265
+    },
+    {
+      "epoch": 0.034931726547491354,
+      "grad_norm": 0.004372291266918182,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 1266
+    },
+    {
+      "epoch": 0.03495931874855572,
+      "grad_norm": 0.0036974658723920584,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 1267
+    },
+    {
+      "epoch": 0.03498691094962009,
+      "grad_norm": 0.04189533367753029,
+      "learning_rate": 0.001,
+      "loss": 0.4203,
+      "step": 1268
+    },
+    {
+      "epoch": 0.03501450315068446,
+      "grad_norm": 0.0037905664648860693,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 1269
+    },
+    {
+      "epoch": 0.03504209535174883,
+      "grad_norm": 0.004496126435697079,
+      "learning_rate": 0.001,
+      "loss": 0.3499,
+      "step": 1270
+    },
+    {
+      "epoch": 0.0350696875528132,
+      "grad_norm": 0.002956201322376728,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 1271
+    },
+    {
+      "epoch": 0.03509727975387757,
+      "grad_norm": 0.004545163828879595,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 1272
+    },
+    {
+      "epoch": 0.03512487195494193,
+      "grad_norm": 0.005399242043495178,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 1273
+    },
+    {
+      "epoch": 0.035152464156006306,
+      "grad_norm": 0.003836257616057992,
+      "learning_rate": 0.001,
+      "loss": 0.3758,
+      "step": 1274
+    },
+    {
+      "epoch": 0.03518005635707067,
+      "grad_norm": 0.0035388548858463764,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 1275
+    },
+    {
+      "epoch": 0.035207648558135046,
+      "grad_norm": 0.009006824344396591,
+      "learning_rate": 0.001,
+      "loss": 0.4311,
+      "step": 1276
+    },
+    {
+      "epoch": 0.03523524075919941,
+      "grad_norm": 0.006458511110395193,
+      "learning_rate": 0.001,
+      "loss": 0.4337,
+      "step": 1277
+    },
+    {
+      "epoch": 0.03526283296026378,
+      "grad_norm": 0.0033469260670244694,
+      "learning_rate": 0.001,
+      "loss": 0.4188,
+      "step": 1278
+    },
+    {
+      "epoch": 0.03529042516132815,
+      "grad_norm": 0.003662231145426631,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 1279
+    },
+    {
+      "epoch": 0.03531801736239252,
+      "grad_norm": 0.004838781896978617,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 1280
+    },
+    {
+      "epoch": 0.03534560956345689,
+      "grad_norm": 0.003931723535060883,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 1281
+    },
+    {
+      "epoch": 0.03537320176452126,
+      "grad_norm": 0.0037028163205832243,
+      "learning_rate": 0.001,
+      "loss": 0.4293,
+      "step": 1282
+    },
+    {
+      "epoch": 0.035400793965585625,
+      "grad_norm": 0.00402813870459795,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 1283
+    },
+    {
+      "epoch": 0.03542838616665,
+      "grad_norm": 0.0033616770524531603,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 1284
+    },
+    {
+      "epoch": 0.035455978367714365,
+      "grad_norm": 0.002920625265687704,
+      "learning_rate": 0.001,
+      "loss": 0.4269,
+      "step": 1285
+    },
+    {
+      "epoch": 0.03548357056877873,
+      "grad_norm": 0.007799847517162561,
+      "learning_rate": 0.001,
+      "loss": 0.3389,
+      "step": 1286
+    },
+    {
+      "epoch": 0.035511162769843105,
+      "grad_norm": 0.0034114914014935493,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 1287
+    },
+    {
+      "epoch": 0.03553875497090747,
+      "grad_norm": 0.0037257294170558453,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 1288
+    },
+    {
+      "epoch": 0.035566347171971845,
+      "grad_norm": 0.0022869498934596777,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 1289
+    },
+    {
+      "epoch": 0.03559393937303621,
+      "grad_norm": 0.006633399520069361,
+      "learning_rate": 0.001,
+      "loss": 0.3713,
+      "step": 1290
+    },
+    {
+      "epoch": 0.03562153157410058,
+      "grad_norm": 0.004205191507935524,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 1291
+    },
+    {
+      "epoch": 0.03564912377516495,
+      "grad_norm": 0.0037389290519058704,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 1292
+    },
+    {
+      "epoch": 0.03567671597622932,
+      "grad_norm": 0.00401865690946579,
+      "learning_rate": 0.001,
+      "loss": 0.411,
+      "step": 1293
+    },
+    {
+      "epoch": 0.03570430817729369,
+      "grad_norm": 0.0036637093871831894,
+      "learning_rate": 0.001,
+      "loss": 0.3678,
+      "step": 1294
+    },
+    {
+      "epoch": 0.03573190037835806,
+      "grad_norm": 0.002707039937376976,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 1295
+    },
+    {
+      "epoch": 0.035759492579422424,
+      "grad_norm": 0.004088058602064848,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 1296
+    },
+    {
+      "epoch": 0.0357870847804868,
+      "grad_norm": 0.0029987411107867956,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 1297
+    },
+    {
+      "epoch": 0.03581467698155116,
+      "grad_norm": 0.0037499302998185158,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 1298
+    },
+    {
+      "epoch": 0.03584226918261554,
+      "grad_norm": 0.00426569813862443,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 1299
+    },
+    {
+      "epoch": 0.0358698613836799,
+      "grad_norm": 0.0036391124594956636,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 1300
+    },
+    {
+      "epoch": 0.03589745358474427,
+      "grad_norm": 0.003542504971846938,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 1301
+    },
+    {
+      "epoch": 0.03592504578580864,
+      "grad_norm": 0.003770799608901143,
+      "learning_rate": 0.001,
+      "loss": 0.3766,
+      "step": 1302
+    },
+    {
+      "epoch": 0.03595263798687301,
+      "grad_norm": 0.006019794847816229,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 1303
+    },
+    {
+      "epoch": 0.03598023018793738,
+      "grad_norm": 0.0027127231005579233,
+      "learning_rate": 0.001,
+      "loss": 0.4451,
+      "step": 1304
+    },
+    {
+      "epoch": 0.03600782238900175,
+      "grad_norm": 0.004511113744229078,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 1305
+    },
+    {
+      "epoch": 0.036035414590066116,
+      "grad_norm": 0.0061105103231966496,
+      "learning_rate": 0.001,
+      "loss": 0.4041,
+      "step": 1306
+    },
+    {
+      "epoch": 0.03606300679113049,
+      "grad_norm": 0.003959209658205509,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 1307
+    },
+    {
+      "epoch": 0.036090598992194856,
+      "grad_norm": 0.006086795590817928,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 1308
+    },
+    {
+      "epoch": 0.03611819119325922,
+      "grad_norm": 0.003370011458173394,
+      "learning_rate": 0.001,
+      "loss": 0.447,
+      "step": 1309
+    },
+    {
+      "epoch": 0.036145783394323595,
+      "grad_norm": 0.004544582683593035,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 1310
+    },
+    {
+      "epoch": 0.03617337559538796,
+      "grad_norm": 0.004398183431476355,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 1311
+    },
+    {
+      "epoch": 0.036200967796452335,
+      "grad_norm": 0.0034012263640761375,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 1312
+    },
+    {
+      "epoch": 0.0362285599975167,
+      "grad_norm": 0.004038006532937288,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 1313
+    },
+    {
+      "epoch": 0.03625615219858107,
+      "grad_norm": 0.004931545816361904,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 1314
+    },
+    {
+      "epoch": 0.03628374439964544,
+      "grad_norm": 0.004279931075870991,
+      "learning_rate": 0.001,
+      "loss": 0.4528,
+      "step": 1315
+    },
+    {
+      "epoch": 0.03631133660070981,
+      "grad_norm": 0.0031895472202450037,
+      "learning_rate": 0.001,
+      "loss": 0.4364,
+      "step": 1316
+    },
+    {
+      "epoch": 0.03633892880177418,
+      "grad_norm": 0.005345645360648632,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 1317
+    },
+    {
+      "epoch": 0.03636652100283855,
+      "grad_norm": 0.004938568454235792,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 1318
+    },
+    {
+      "epoch": 0.036394113203902914,
+      "grad_norm": 0.004648009780794382,
+      "learning_rate": 0.001,
+      "loss": 0.4285,
+      "step": 1319
+    },
+    {
+      "epoch": 0.03642170540496729,
+      "grad_norm": 0.006550830788910389,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 1320
+    },
+    {
+      "epoch": 0.036449297606031654,
+      "grad_norm": 0.002846076153218746,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 1321
+    },
+    {
+      "epoch": 0.03647688980709603,
+      "grad_norm": 0.004674985073506832,
+      "learning_rate": 0.001,
+      "loss": 0.4307,
+      "step": 1322
+    },
+    {
+      "epoch": 0.036504482008160394,
+      "grad_norm": 0.003952328581362963,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 1323
+    },
+    {
+      "epoch": 0.03653207420922476,
+      "grad_norm": 0.0029479297809302807,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 1324
+    },
+    {
+      "epoch": 0.036559666410289134,
+      "grad_norm": 0.0032731324899941683,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 1325
+    },
+    {
+      "epoch": 0.0365872586113535,
+      "grad_norm": 0.003791957162320614,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 1326
+    },
+    {
+      "epoch": 0.03661485081241787,
+      "grad_norm": 0.009052555076777935,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 1327
+    },
+    {
+      "epoch": 0.03664244301348224,
+      "grad_norm": 0.005174124613404274,
+      "learning_rate": 0.001,
+      "loss": 0.4333,
+      "step": 1328
+    },
+    {
+      "epoch": 0.036670035214546606,
+      "grad_norm": 0.008013852871954441,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 1329
+    },
+    {
+      "epoch": 0.03669762741561098,
+      "grad_norm": 0.006423450540751219,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 1330
+    },
+    {
+      "epoch": 0.036725219616675346,
+      "grad_norm": 0.006040682550519705,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 1331
+    },
+    {
+      "epoch": 0.03675281181773971,
+      "grad_norm": 0.0055701639503240585,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 1332
+    },
+    {
+      "epoch": 0.036780404018804086,
+      "grad_norm": 0.01270906999707222,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 1333
+    },
+    {
+      "epoch": 0.03680799621986845,
+      "grad_norm": 0.003626336809247732,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 1334
+    },
+    {
+      "epoch": 0.036835588420932826,
+      "grad_norm": 0.004632920026779175,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 1335
+    },
+    {
+      "epoch": 0.03686318062199719,
+      "grad_norm": 0.0060633327811956406,
+      "learning_rate": 0.001,
+      "loss": 0.424,
+      "step": 1336
+    },
+    {
+      "epoch": 0.03689077282306156,
+      "grad_norm": 0.004871148616075516,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 1337
+    },
+    {
+      "epoch": 0.03691836502412593,
+      "grad_norm": 0.0031172886956483126,
+      "learning_rate": 0.001,
+      "loss": 0.4316,
+      "step": 1338
+    },
+    {
+      "epoch": 0.0369459572251903,
+      "grad_norm": 0.003916094545274973,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 1339
+    },
+    {
+      "epoch": 0.03697354942625467,
+      "grad_norm": 0.0040051937103271484,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 1340
+    },
+    {
+      "epoch": 0.03700114162731904,
+      "grad_norm": 0.006250888109207153,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 1341
+    },
+    {
+      "epoch": 0.037028733828383405,
+      "grad_norm": 0.002888569375500083,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 1342
+    },
+    {
+      "epoch": 0.03705632602944778,
+      "grad_norm": 0.004510107915848494,
+      "learning_rate": 0.001,
+      "loss": 0.368,
+      "step": 1343
+    },
+    {
+      "epoch": 0.037083918230512145,
+      "grad_norm": 0.007241697516292334,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 1344
+    },
+    {
+      "epoch": 0.03711151043157652,
+      "grad_norm": 0.0032798638567328453,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 1345
+    },
+    {
+      "epoch": 0.037139102632640884,
+      "grad_norm": 0.003440143307670951,
+      "learning_rate": 0.001,
+      "loss": 0.4474,
+      "step": 1346
+    },
+    {
+      "epoch": 0.03716669483370525,
+      "grad_norm": 0.0044672004878520966,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 1347
+    },
+    {
+      "epoch": 0.037194287034769624,
+      "grad_norm": 0.0031135319732129574,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 1348
+    },
+    {
+      "epoch": 0.03722187923583399,
+      "grad_norm": 0.004170152824372053,
+      "learning_rate": 0.001,
+      "loss": 0.4193,
+      "step": 1349
+    },
+    {
+      "epoch": 0.037249471436898364,
+      "grad_norm": 0.0036481074057519436,
+      "learning_rate": 0.001,
+      "loss": 0.4454,
+      "step": 1350
+    },
+    {
+      "epoch": 0.03727706363796273,
+      "grad_norm": 0.003243829123675823,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 1351
+    },
+    {
+      "epoch": 0.0373046558390271,
+      "grad_norm": 0.0034886065404862165,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 1352
+    },
+    {
+      "epoch": 0.03733224804009147,
+      "grad_norm": 0.004647396504878998,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 1353
+    },
+    {
+      "epoch": 0.03735984024115584,
+      "grad_norm": 0.004046002868562937,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 1354
+    },
+    {
+      "epoch": 0.0373874324422202,
+      "grad_norm": 0.004573929589241743,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 1355
+    },
+    {
+      "epoch": 0.037415024643284576,
+      "grad_norm": 0.006424955558031797,
+      "learning_rate": 0.001,
+      "loss": 0.3623,
+      "step": 1356
+    },
+    {
+      "epoch": 0.03744261684434894,
+      "grad_norm": 0.0033393288031220436,
+      "learning_rate": 0.001,
+      "loss": 0.4311,
+      "step": 1357
+    },
+    {
+      "epoch": 0.037470209045413316,
+      "grad_norm": 0.0031134155578911304,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 1358
+    },
+    {
+      "epoch": 0.03749780124647768,
+      "grad_norm": 0.00366019899956882,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 1359
+    },
+    {
+      "epoch": 0.03752539344754205,
+      "grad_norm": 0.003400568151846528,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 1360
+    },
+    {
+      "epoch": 0.03755298564860642,
+      "grad_norm": 0.002846767893061042,
+      "learning_rate": 0.001,
+      "loss": 0.4286,
+      "step": 1361
+    },
+    {
+      "epoch": 0.03758057784967079,
+      "grad_norm": 0.0031303889118134975,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 1362
+    },
+    {
+      "epoch": 0.03760817005073516,
+      "grad_norm": 0.0043816519901156425,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 1363
+    },
+    {
+      "epoch": 0.03763576225179953,
+      "grad_norm": 0.004520198330283165,
+      "learning_rate": 0.001,
+      "loss": 0.349,
+      "step": 1364
+    },
+    {
+      "epoch": 0.037663354452863895,
+      "grad_norm": 0.005284131038933992,
+      "learning_rate": 0.001,
+      "loss": 0.3745,
+      "step": 1365
+    },
+    {
+      "epoch": 0.03769094665392827,
+      "grad_norm": 0.0037800793070346117,
+      "learning_rate": 0.001,
+      "loss": 0.3821,
+      "step": 1366
+    },
+    {
+      "epoch": 0.037718538854992635,
+      "grad_norm": 0.004985132720321417,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 1367
+    },
+    {
+      "epoch": 0.03774613105605701,
+      "grad_norm": 0.0036822801921516657,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 1368
+    },
+    {
+      "epoch": 0.037773723257121375,
+      "grad_norm": 0.0032694186083972454,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 1369
+    },
+    {
+      "epoch": 0.03780131545818574,
+      "grad_norm": 0.003384978510439396,
+      "learning_rate": 0.001,
+      "loss": 0.3715,
+      "step": 1370
+    },
+    {
+      "epoch": 0.037828907659250115,
+      "grad_norm": 0.0035624897573143244,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 1371
+    },
+    {
+      "epoch": 0.03785649986031448,
+      "grad_norm": 0.004096219781786203,
+      "learning_rate": 0.001,
+      "loss": 0.3674,
+      "step": 1372
+    },
+    {
+      "epoch": 0.037884092061378855,
+      "grad_norm": 0.004491012543439865,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 1373
+    },
+    {
+      "epoch": 0.03791168426244322,
+      "grad_norm": 0.0034480481408536434,
+      "learning_rate": 0.001,
+      "loss": 0.422,
+      "step": 1374
+    },
+    {
+      "epoch": 0.03793927646350759,
+      "grad_norm": 0.006217781454324722,
+      "learning_rate": 0.001,
+      "loss": 0.3801,
+      "step": 1375
+    },
+    {
+      "epoch": 0.03796686866457196,
+      "grad_norm": 0.004664869979023933,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 1376
+    },
+    {
+      "epoch": 0.03799446086563633,
+      "grad_norm": 0.008887716569006443,
+      "learning_rate": 0.001,
+      "loss": 0.425,
+      "step": 1377
+    },
+    {
+      "epoch": 0.038022053066700694,
+      "grad_norm": 0.003177997190505266,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 1378
+    },
+    {
+      "epoch": 0.03804964526776507,
+      "grad_norm": 0.0035175783559679985,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 1379
+    },
+    {
+      "epoch": 0.038077237468829433,
+      "grad_norm": 0.0047409930266439915,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 1380
+    },
+    {
+      "epoch": 0.03810482966989381,
+      "grad_norm": 0.007139190100133419,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 1381
+    },
+    {
+      "epoch": 0.03813242187095817,
+      "grad_norm": 0.0083334194496274,
+      "learning_rate": 0.001,
+      "loss": 0.3634,
+      "step": 1382
+    },
+    {
+      "epoch": 0.03816001407202254,
+      "grad_norm": 0.0037119935732334852,
+      "learning_rate": 0.001,
+      "loss": 0.4357,
+      "step": 1383
+    },
+    {
+      "epoch": 0.03818760627308691,
+      "grad_norm": 0.00669982610270381,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 1384
+    },
+    {
+      "epoch": 0.03821519847415128,
+      "grad_norm": 0.00357433152385056,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 1385
+    },
+    {
+      "epoch": 0.03824279067521565,
+      "grad_norm": 0.004873959813266993,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 1386
+    },
+    {
+      "epoch": 0.03827038287628002,
+      "grad_norm": 0.011126353405416012,
+      "learning_rate": 0.001,
+      "loss": 0.3646,
+      "step": 1387
+    },
+    {
+      "epoch": 0.038297975077344386,
+      "grad_norm": 0.0038117689546197653,
+      "learning_rate": 0.001,
+      "loss": 0.4382,
+      "step": 1388
+    },
+    {
+      "epoch": 0.03832556727840876,
+      "grad_norm": 0.011077326722443104,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 1389
+    },
+    {
+      "epoch": 0.038353159479473126,
+      "grad_norm": 0.005909190978854895,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 1390
+    },
+    {
+      "epoch": 0.0383807516805375,
+      "grad_norm": 0.0056834593415260315,
+      "learning_rate": 0.001,
+      "loss": 0.4396,
+      "step": 1391
+    },
+    {
+      "epoch": 0.038408343881601865,
+      "grad_norm": 0.0038277830462902784,
+      "learning_rate": 0.001,
+      "loss": 0.3663,
+      "step": 1392
+    },
+    {
+      "epoch": 0.03843593608266623,
+      "grad_norm": 0.005587390623986721,
+      "learning_rate": 0.001,
+      "loss": 0.4298,
+      "step": 1393
+    },
+    {
+      "epoch": 0.038463528283730605,
+      "grad_norm": 0.004197390284389257,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 1394
+    },
+    {
+      "epoch": 0.03849112048479497,
+      "grad_norm": 0.0047178626991808414,
+      "learning_rate": 0.001,
+      "loss": 0.4275,
+      "step": 1395
+    },
+    {
+      "epoch": 0.038518712685859345,
+      "grad_norm": 0.0031104707159101963,
+      "learning_rate": 0.001,
+      "loss": 0.4309,
+      "step": 1396
+    },
+    {
+      "epoch": 0.03854630488692371,
+      "grad_norm": 0.005666974000632763,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 1397
+    },
+    {
+      "epoch": 0.03857389708798808,
+      "grad_norm": 0.006076369900256395,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 1398
+    },
+    {
+      "epoch": 0.03860148928905245,
+      "grad_norm": 0.004276493098586798,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 1399
+    },
+    {
+      "epoch": 0.03862908149011682,
+      "grad_norm": 0.00408798037096858,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 1400
+    },
+    {
+      "epoch": 0.038656673691181184,
+      "grad_norm": 0.003366732969880104,
+      "learning_rate": 0.001,
+      "loss": 0.3514,
+      "step": 1401
+    },
+    {
+      "epoch": 0.03868426589224556,
+      "grad_norm": 0.003257429925724864,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 1402
+    },
+    {
+      "epoch": 0.038711858093309924,
+      "grad_norm": 0.0036397224757820368,
+      "learning_rate": 0.001,
+      "loss": 0.4286,
+      "step": 1403
+    },
+    {
+      "epoch": 0.0387394502943743,
+      "grad_norm": 0.00421003857627511,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 1404
+    },
+    {
+      "epoch": 0.038767042495438664,
+      "grad_norm": 0.004263239912688732,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 1405
+    },
+    {
+      "epoch": 0.03879463469650303,
+      "grad_norm": 0.0025550604332238436,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 1406
+    },
+    {
+      "epoch": 0.038822226897567404,
+      "grad_norm": 0.003278963966295123,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 1407
+    },
+    {
+      "epoch": 0.03884981909863177,
+      "grad_norm": 0.002250393619760871,
+      "learning_rate": 0.001,
+      "loss": 0.4521,
+      "step": 1408
+    },
+    {
+      "epoch": 0.038877411299696144,
+      "grad_norm": 0.002963767386972904,
+      "learning_rate": 0.001,
+      "loss": 0.4566,
+      "step": 1409
+    },
+    {
+      "epoch": 0.03890500350076051,
+      "grad_norm": 0.006573919206857681,
+      "learning_rate": 0.001,
+      "loss": 0.3568,
+      "step": 1410
+    },
+    {
+      "epoch": 0.038932595701824876,
+      "grad_norm": 0.005289596039801836,
+      "learning_rate": 0.001,
+      "loss": 0.3729,
+      "step": 1411
+    },
+    {
+      "epoch": 0.03896018790288925,
+      "grad_norm": 0.0031945211812853813,
+      "learning_rate": 0.001,
+      "loss": 0.4238,
+      "step": 1412
+    },
+    {
+      "epoch": 0.038987780103953616,
+      "grad_norm": 0.002856782404705882,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 1413
+    },
+    {
+      "epoch": 0.03901537230501799,
+      "grad_norm": 0.0037729961331933737,
+      "learning_rate": 0.001,
+      "loss": 0.4379,
+      "step": 1414
+    },
+    {
+      "epoch": 0.039042964506082356,
+      "grad_norm": 0.00266630039550364,
+      "learning_rate": 0.001,
+      "loss": 0.4188,
+      "step": 1415
+    },
+    {
+      "epoch": 0.03907055670714672,
+      "grad_norm": 0.0042518191039562225,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 1416
+    },
+    {
+      "epoch": 0.039098148908211096,
+      "grad_norm": 0.0030879939440637827,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 1417
+    },
+    {
+      "epoch": 0.03912574110927546,
+      "grad_norm": 0.0036084537860006094,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 1418
+    },
+    {
+      "epoch": 0.039153333310339836,
+      "grad_norm": 0.003634381340816617,
+      "learning_rate": 0.001,
+      "loss": 0.4245,
+      "step": 1419
+    },
+    {
+      "epoch": 0.0391809255114042,
+      "grad_norm": 0.0030327397398650646,
+      "learning_rate": 0.001,
+      "loss": 0.4399,
+      "step": 1420
+    },
+    {
+      "epoch": 0.03920851771246857,
+      "grad_norm": 0.008019665256142616,
+      "learning_rate": 0.001,
+      "loss": 0.4233,
+      "step": 1421
+    },
+    {
+      "epoch": 0.03923610991353294,
+      "grad_norm": 0.004183803219348192,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 1422
+    },
+    {
+      "epoch": 0.03926370211459731,
+      "grad_norm": 0.005366318393498659,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 1423
+    },
+    {
+      "epoch": 0.039291294315661675,
+      "grad_norm": 0.003394525730982423,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 1424
+    },
+    {
+      "epoch": 0.03931888651672605,
+      "grad_norm": 0.003373740240931511,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 1425
+    },
+    {
+      "epoch": 0.039346478717790415,
+      "grad_norm": 0.002769649960100651,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 1426
+    },
+    {
+      "epoch": 0.03937407091885479,
+      "grad_norm": 0.005319521296769381,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 1427
+    },
+    {
+      "epoch": 0.039401663119919154,
+      "grad_norm": 0.0031118986662477255,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 1428
+    },
+    {
+      "epoch": 0.03942925532098352,
+      "grad_norm": 0.0032665557228028774,
+      "learning_rate": 0.001,
+      "loss": 0.4214,
+      "step": 1429
+    },
+    {
+      "epoch": 0.039456847522047894,
+      "grad_norm": 0.0047190007753670216,
+      "learning_rate": 0.001,
+      "loss": 0.3242,
+      "step": 1430
+    },
+    {
+      "epoch": 0.03948443972311226,
+      "grad_norm": 0.0038909264840185642,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 1431
+    },
+    {
+      "epoch": 0.039512031924176634,
+      "grad_norm": 0.004970925394445658,
+      "learning_rate": 0.001,
+      "loss": 0.4257,
+      "step": 1432
+    },
+    {
+      "epoch": 0.039539624125241,
+      "grad_norm": 0.004649787209928036,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 1433
+    },
+    {
+      "epoch": 0.03956721632630537,
+      "grad_norm": 0.0030645502265542746,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 1434
+    },
+    {
+      "epoch": 0.03959480852736974,
+      "grad_norm": 0.005270305555313826,
+      "learning_rate": 0.001,
+      "loss": 0.4114,
+      "step": 1435
+    },
+    {
+      "epoch": 0.03962240072843411,
+      "grad_norm": 0.004368067253381014,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 1436
+    },
+    {
+      "epoch": 0.03964999292949848,
+      "grad_norm": 0.0032691999804228544,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 1437
+    },
+    {
+      "epoch": 0.03967758513056285,
+      "grad_norm": 0.003509074915200472,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 1438
+    },
+    {
+      "epoch": 0.03970517733162721,
+      "grad_norm": 0.00402647303417325,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 1439
+    },
+    {
+      "epoch": 0.039732769532691586,
+      "grad_norm": 0.003934496082365513,
+      "learning_rate": 0.001,
+      "loss": 0.4351,
+      "step": 1440
+    },
+    {
+      "epoch": 0.03976036173375595,
+      "grad_norm": 0.0035782591439783573,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 1441
+    },
+    {
+      "epoch": 0.039787953934820326,
+      "grad_norm": 0.0036837344523519278,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 1442
+    },
+    {
+      "epoch": 0.03981554613588469,
+      "grad_norm": 0.003719213418662548,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 1443
+    },
+    {
+      "epoch": 0.03984313833694906,
+      "grad_norm": 0.008284253068268299,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 1444
+    },
+    {
+      "epoch": 0.03987073053801343,
+      "grad_norm": 0.0037160192150622606,
+      "learning_rate": 0.001,
+      "loss": 0.4298,
+      "step": 1445
+    },
+    {
+      "epoch": 0.0398983227390778,
+      "grad_norm": 0.003967109136283398,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 1446
+    },
+    {
+      "epoch": 0.039925914940142165,
+      "grad_norm": 0.003119664965197444,
+      "learning_rate": 0.001,
+      "loss": 0.3624,
+      "step": 1447
+    },
+    {
+      "epoch": 0.03995350714120654,
+      "grad_norm": 0.0027750935405492783,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 1448
+    },
+    {
+      "epoch": 0.039981099342270905,
+      "grad_norm": 0.008331545628607273,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 1449
+    },
+    {
+      "epoch": 0.04000869154333528,
+      "grad_norm": 0.004883471876382828,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 1450
+    },
+    {
+      "epoch": 0.040036283744399645,
+      "grad_norm": 0.0037747775204479694,
+      "learning_rate": 0.001,
+      "loss": 0.4103,
+      "step": 1451
+    },
+    {
+      "epoch": 0.04006387594546401,
+      "grad_norm": 0.0035403715446591377,
+      "learning_rate": 0.001,
+      "loss": 0.3694,
+      "step": 1452
+    },
+    {
+      "epoch": 0.040091468146528385,
+      "grad_norm": 0.012222831137478352,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 1453
+    },
+    {
+      "epoch": 0.04011906034759275,
+      "grad_norm": 0.029686246067285538,
+      "learning_rate": 0.001,
+      "loss": 0.3688,
+      "step": 1454
+    },
+    {
+      "epoch": 0.040146652548657125,
+      "grad_norm": 0.007045884151011705,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 1455
+    },
+    {
+      "epoch": 0.04017424474972149,
+      "grad_norm": 0.0033973727840930223,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 1456
+    },
+    {
+      "epoch": 0.04020183695078586,
+      "grad_norm": 0.004133992828428745,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 1457
+    },
+    {
+      "epoch": 0.04022942915185023,
+      "grad_norm": 0.003264515893533826,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 1458
+    },
+    {
+      "epoch": 0.0402570213529146,
+      "grad_norm": 0.00320844491943717,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 1459
+    },
+    {
+      "epoch": 0.04028461355397897,
+      "grad_norm": 0.0038754413835704327,
+      "learning_rate": 0.001,
+      "loss": 0.3661,
+      "step": 1460
+    },
+    {
+      "epoch": 0.04031220575504334,
+      "grad_norm": 0.009661386720836163,
+      "learning_rate": 0.001,
+      "loss": 0.4309,
+      "step": 1461
+    },
+    {
+      "epoch": 0.040339797956107704,
+      "grad_norm": 0.010238132439553738,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 1462
+    },
+    {
+      "epoch": 0.04036739015717208,
+      "grad_norm": 0.04491569846868515,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 1463
+    },
+    {
+      "epoch": 0.04039498235823644,
+      "grad_norm": 0.0031146227847784758,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 1464
+    },
+    {
+      "epoch": 0.04042257455930082,
+      "grad_norm": 0.0035386565141379833,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 1465
+    },
+    {
+      "epoch": 0.04045016676036518,
+      "grad_norm": 0.0033056430984288454,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 1466
+    },
+    {
+      "epoch": 0.04047775896142955,
+      "grad_norm": 0.0025265736039727926,
+      "learning_rate": 0.001,
+      "loss": 0.3693,
+      "step": 1467
+    },
+    {
+      "epoch": 0.04050535116249392,
+      "grad_norm": 0.004877384752035141,
+      "learning_rate": 0.001,
+      "loss": 0.3686,
+      "step": 1468
+    },
+    {
+      "epoch": 0.04053294336355829,
+      "grad_norm": 0.006324070505797863,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 1469
+    },
+    {
+      "epoch": 0.04056053556462266,
+      "grad_norm": 0.004497391637414694,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 1470
+    },
+    {
+      "epoch": 0.04058812776568703,
+      "grad_norm": 0.003843271406367421,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 1471
+    },
+    {
+      "epoch": 0.040615719966751396,
+      "grad_norm": 0.003053538501262665,
+      "learning_rate": 0.001,
+      "loss": 0.4392,
+      "step": 1472
+    },
+    {
+      "epoch": 0.04064331216781577,
+      "grad_norm": 0.0038446690887212753,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 1473
+    },
+    {
+      "epoch": 0.040670904368880136,
+      "grad_norm": 0.003199394093826413,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 1474
+    },
+    {
+      "epoch": 0.0406984965699445,
+      "grad_norm": 0.003342741634696722,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 1475
+    },
+    {
+      "epoch": 0.040726088771008875,
+      "grad_norm": 0.004331924952566624,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 1476
+    },
+    {
+      "epoch": 0.04075368097207324,
+      "grad_norm": 0.00260713673196733,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 1477
+    },
+    {
+      "epoch": 0.040781273173137615,
+      "grad_norm": 0.003122882917523384,
+      "learning_rate": 0.001,
+      "loss": 0.3598,
+      "step": 1478
+    },
+    {
+      "epoch": 0.04080886537420198,
+      "grad_norm": 0.0028676025103777647,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 1479
+    },
+    {
+      "epoch": 0.04083645757526635,
+      "grad_norm": 0.0038910373114049435,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 1480
+    },
+    {
+      "epoch": 0.04086404977633072,
+      "grad_norm": 0.003291686065495014,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 1481
+    },
+    {
+      "epoch": 0.04089164197739509,
+      "grad_norm": 0.006192247848957777,
+      "learning_rate": 0.001,
+      "loss": 0.4321,
+      "step": 1482
+    },
+    {
+      "epoch": 0.04091923417845946,
+      "grad_norm": 0.002540907124057412,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 1483
+    },
+    {
+      "epoch": 0.04094682637952383,
+      "grad_norm": 0.003824718063697219,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 1484
+    },
+    {
+      "epoch": 0.040974418580588194,
+      "grad_norm": 0.0036246173549443483,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 1485
+    },
+    {
+      "epoch": 0.04100201078165257,
+      "grad_norm": 0.030295656993985176,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 1486
+    },
+    {
+      "epoch": 0.041029602982716934,
+      "grad_norm": 0.0050461613573133945,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 1487
+    },
+    {
+      "epoch": 0.04105719518378131,
+      "grad_norm": 0.0023631087969988585,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 1488
+    },
+    {
+      "epoch": 0.041084787384845674,
+      "grad_norm": 0.002529110526666045,
+      "learning_rate": 0.001,
+      "loss": 0.4547,
+      "step": 1489
+    },
+    {
+      "epoch": 0.04111237958591004,
+      "grad_norm": 0.00316584762185812,
+      "learning_rate": 0.001,
+      "loss": 0.3747,
+      "step": 1490
+    },
+    {
+      "epoch": 0.041139971786974414,
+      "grad_norm": 0.002411734312772751,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 1491
+    },
+    {
+      "epoch": 0.04116756398803878,
+      "grad_norm": 0.0029997879173606634,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 1492
+    },
+    {
+      "epoch": 0.04119515618910315,
+      "grad_norm": 0.003948535770177841,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 1493
+    },
+    {
+      "epoch": 0.04122274839016752,
+      "grad_norm": 0.002781339455395937,
+      "learning_rate": 0.001,
+      "loss": 0.4324,
+      "step": 1494
+    },
+    {
+      "epoch": 0.041250340591231886,
+      "grad_norm": 0.015317432582378387,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 1495
+    },
+    {
+      "epoch": 0.04127793279229626,
+      "grad_norm": 0.0075756567530334,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 1496
+    },
+    {
+      "epoch": 0.041305524993360626,
+      "grad_norm": 0.002881971187889576,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 1497
+    },
+    {
+      "epoch": 0.04133311719442499,
+      "grad_norm": 0.004069055896252394,
+      "learning_rate": 0.001,
+      "loss": 0.4491,
+      "step": 1498
+    },
+    {
+      "epoch": 0.041360709395489366,
+      "grad_norm": 0.002320400904864073,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 1499
+    },
+    {
+      "epoch": 0.04138830159655373,
+      "grad_norm": 0.003089721780270338,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 1500
+    },
+    {
+      "epoch": 0.04138830159655373,
+      "eval_runtime": 23.6686,
+      "eval_samples_per_second": 1.352,
+      "eval_steps_per_second": 0.169,
+      "step": 1500
+    },
+    {
+      "epoch": 0.041415893797618106,
+      "grad_norm": 0.004915047902613878,
+      "learning_rate": 0.001,
+      "loss": 0.4313,
+      "step": 1501
+    },
+    {
+      "epoch": 0.04144348599868247,
+      "grad_norm": 0.00290488894097507,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 1502
+    },
+    {
+      "epoch": 0.04147107819974684,
+      "grad_norm": 0.0034425961785018444,
+      "learning_rate": 0.001,
+      "loss": 0.3732,
+      "step": 1503
+    },
+    {
+      "epoch": 0.04149867040081121,
+      "grad_norm": 0.003592686727643013,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 1504
+    },
+    {
+      "epoch": 0.04152626260187558,
+      "grad_norm": 0.005649790167808533,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 1505
+    },
+    {
+      "epoch": 0.04155385480293995,
+      "grad_norm": 0.002451283158734441,
+      "learning_rate": 0.001,
+      "loss": 0.4477,
+      "step": 1506
+    },
+    {
+      "epoch": 0.04158144700400432,
+      "grad_norm": 0.0028861695900559425,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 1507
+    },
+    {
+      "epoch": 0.041609039205068685,
+      "grad_norm": 0.0033806059509515762,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 1508
+    },
+    {
+      "epoch": 0.04163663140613306,
+      "grad_norm": 0.003824063576757908,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 1509
+    },
+    {
+      "epoch": 0.041664223607197425,
+      "grad_norm": 0.00630558468401432,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 1510
+    },
+    {
+      "epoch": 0.0416918158082618,
+      "grad_norm": 0.0037113004364073277,
+      "learning_rate": 0.001,
+      "loss": 0.3717,
+      "step": 1511
+    },
+    {
+      "epoch": 0.041719408009326164,
+      "grad_norm": 0.0054063801653683186,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 1512
+    },
+    {
+      "epoch": 0.04174700021039053,
+      "grad_norm": 0.003154453821480274,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 1513
+    },
+    {
+      "epoch": 0.041774592411454904,
+      "grad_norm": 0.0029439502395689487,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 1514
+    },
+    {
+      "epoch": 0.04180218461251927,
+      "grad_norm": 0.003378200577571988,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 1515
+    },
+    {
+      "epoch": 0.041829776813583644,
+      "grad_norm": 0.003158586798235774,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 1516
+    },
+    {
+      "epoch": 0.04185736901464801,
+      "grad_norm": 0.004686887841671705,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 1517
+    },
+    {
+      "epoch": 0.04188496121571238,
+      "grad_norm": 0.004565032664686441,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 1518
+    },
+    {
+      "epoch": 0.04191255341677675,
+      "grad_norm": 0.005517443176358938,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 1519
+    },
+    {
+      "epoch": 0.04194014561784112,
+      "grad_norm": 0.002761922078207135,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 1520
+    },
+    {
+      "epoch": 0.04196773781890548,
+      "grad_norm": 0.0039441585540771484,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 1521
+    },
+    {
+      "epoch": 0.041995330019969856,
+      "grad_norm": 0.00710391066968441,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 1522
+    },
+    {
+      "epoch": 0.04202292222103422,
+      "grad_norm": 0.025746062397956848,
+      "learning_rate": 0.001,
+      "loss": 0.4271,
+      "step": 1523
+    },
+    {
+      "epoch": 0.042050514422098596,
+      "grad_norm": 0.004072318784892559,
+      "learning_rate": 0.001,
+      "loss": 0.3753,
+      "step": 1524
+    },
+    {
+      "epoch": 0.04207810662316296,
+      "grad_norm": 0.0024748845025897026,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 1525
+    },
+    {
+      "epoch": 0.04210569882422733,
+      "grad_norm": 0.007111032959073782,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 1526
+    },
+    {
+      "epoch": 0.0421332910252917,
+      "grad_norm": 0.005953185725957155,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 1527
+    },
+    {
+      "epoch": 0.04216088322635607,
+      "grad_norm": 0.004936009179800749,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 1528
+    },
+    {
+      "epoch": 0.04218847542742044,
+      "grad_norm": 0.004421617835760117,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 1529
+    },
+    {
+      "epoch": 0.04221606762848481,
+      "grad_norm": 0.007696077227592468,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 1530
+    },
+    {
+      "epoch": 0.042243659829549175,
+      "grad_norm": 0.0060005756095051765,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 1531
+    },
+    {
+      "epoch": 0.04227125203061355,
+      "grad_norm": 0.006462580990046263,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 1532
+    },
+    {
+      "epoch": 0.042298844231677915,
+      "grad_norm": 0.023720385506749153,
+      "learning_rate": 0.001,
+      "loss": 0.3589,
+      "step": 1533
+    },
+    {
+      "epoch": 0.04232643643274229,
+      "grad_norm": 0.003752040909603238,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 1534
+    },
+    {
+      "epoch": 0.042354028633806655,
+      "grad_norm": 0.005650446284562349,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 1535
+    },
+    {
+      "epoch": 0.04238162083487102,
+      "grad_norm": 0.003995851147919893,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 1536
+    },
+    {
+      "epoch": 0.042409213035935395,
+      "grad_norm": 0.003202822059392929,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 1537
+    },
+    {
+      "epoch": 0.04243680523699976,
+      "grad_norm": 0.004148907493799925,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 1538
+    },
+    {
+      "epoch": 0.042464397438064135,
+      "grad_norm": 0.004003866575658321,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 1539
+    },
+    {
+      "epoch": 0.0424919896391285,
+      "grad_norm": 0.003789936425164342,
+      "learning_rate": 0.001,
+      "loss": 0.4593,
+      "step": 1540
+    },
+    {
+      "epoch": 0.04251958184019287,
+      "grad_norm": 0.004240360110998154,
+      "learning_rate": 0.001,
+      "loss": 0.353,
+      "step": 1541
+    },
+    {
+      "epoch": 0.04254717404125724,
+      "grad_norm": 0.002904722234234214,
+      "learning_rate": 0.001,
+      "loss": 0.4451,
+      "step": 1542
+    },
+    {
+      "epoch": 0.04257476624232161,
+      "grad_norm": 0.004250886384397745,
+      "learning_rate": 0.001,
+      "loss": 0.4395,
+      "step": 1543
+    },
+    {
+      "epoch": 0.042602358443385974,
+      "grad_norm": 0.0044527859427034855,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 1544
+    },
+    {
+      "epoch": 0.04262995064445035,
+      "grad_norm": 0.006279831752181053,
+      "learning_rate": 0.001,
+      "loss": 0.354,
+      "step": 1545
+    },
+    {
+      "epoch": 0.042657542845514713,
+      "grad_norm": 0.004428897984325886,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 1546
+    },
+    {
+      "epoch": 0.04268513504657909,
+      "grad_norm": 0.00569180166348815,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 1547
+    },
+    {
+      "epoch": 0.04271272724764345,
+      "grad_norm": 0.011190955527126789,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 1548
+    },
+    {
+      "epoch": 0.04274031944870782,
+      "grad_norm": 0.015735691413283348,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 1549
+    },
+    {
+      "epoch": 0.04276791164977219,
+      "grad_norm": 0.0033663571812212467,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 1550
+    },
+    {
+      "epoch": 0.04279550385083656,
+      "grad_norm": 0.005885041318833828,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 1551
+    },
+    {
+      "epoch": 0.04282309605190093,
+      "grad_norm": 0.022580578923225403,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 1552
+    },
+    {
+      "epoch": 0.0428506882529653,
+      "grad_norm": 0.004381990525871515,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 1553
+    },
+    {
+      "epoch": 0.042878280454029666,
+      "grad_norm": 0.0038387307431548834,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 1554
+    },
+    {
+      "epoch": 0.04290587265509404,
+      "grad_norm": 0.027915263548493385,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 1555
+    },
+    {
+      "epoch": 0.042933464856158406,
+      "grad_norm": 0.006606489885598421,
+      "learning_rate": 0.001,
+      "loss": 0.3775,
+      "step": 1556
+    },
+    {
+      "epoch": 0.04296105705722278,
+      "grad_norm": 0.013772227801382542,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 1557
+    },
+    {
+      "epoch": 0.042988649258287145,
+      "grad_norm": 0.01971166953444481,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 1558
+    },
+    {
+      "epoch": 0.04301624145935151,
+      "grad_norm": 0.0028942166827619076,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 1559
+    },
+    {
+      "epoch": 0.043043833660415885,
+      "grad_norm": 0.004030006006360054,
+      "learning_rate": 0.001,
+      "loss": 0.3701,
+      "step": 1560
+    },
+    {
+      "epoch": 0.04307142586148025,
+      "grad_norm": 0.0030979826115071774,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 1561
+    },
+    {
+      "epoch": 0.043099018062544625,
+      "grad_norm": 0.012965509667992592,
+      "learning_rate": 0.001,
+      "loss": 0.3686,
+      "step": 1562
+    },
+    {
+      "epoch": 0.04312661026360899,
+      "grad_norm": 0.0034757067915052176,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 1563
+    },
+    {
+      "epoch": 0.04315420246467336,
+      "grad_norm": 0.003752148011699319,
+      "learning_rate": 0.001,
+      "loss": 0.4315,
+      "step": 1564
+    },
+    {
+      "epoch": 0.04318179466573773,
+      "grad_norm": 0.002841662149876356,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 1565
+    },
+    {
+      "epoch": 0.0432093868668021,
+      "grad_norm": 0.0031714646611362696,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 1566
+    },
+    {
+      "epoch": 0.043236979067866464,
+      "grad_norm": 0.004814228042960167,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 1567
+    },
+    {
+      "epoch": 0.04326457126893084,
+      "grad_norm": 0.004090086091309786,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 1568
+    },
+    {
+      "epoch": 0.043292163469995204,
+      "grad_norm": 0.0035240172874182463,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 1569
+    },
+    {
+      "epoch": 0.04331975567105958,
+      "grad_norm": 0.0025755963288247585,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 1570
+    },
+    {
+      "epoch": 0.043347347872123944,
+      "grad_norm": 0.010493564419448376,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 1571
+    },
+    {
+      "epoch": 0.04337494007318831,
+      "grad_norm": 0.003166765673086047,
+      "learning_rate": 0.001,
+      "loss": 0.4402,
+      "step": 1572
+    },
+    {
+      "epoch": 0.043402532274252684,
+      "grad_norm": 0.004130365792661905,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 1573
+    },
+    {
+      "epoch": 0.04343012447531705,
+      "grad_norm": 0.003527469700202346,
+      "learning_rate": 0.001,
+      "loss": 0.4327,
+      "step": 1574
+    },
+    {
+      "epoch": 0.043457716676381423,
+      "grad_norm": 0.0034512521233409643,
+      "learning_rate": 0.001,
+      "loss": 0.3567,
+      "step": 1575
+    },
+    {
+      "epoch": 0.04348530887744579,
+      "grad_norm": 0.002996640047058463,
+      "learning_rate": 0.001,
+      "loss": 0.3649,
+      "step": 1576
+    },
+    {
+      "epoch": 0.043512901078510156,
+      "grad_norm": 0.0035705615300685167,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 1577
+    },
+    {
+      "epoch": 0.04354049327957453,
+      "grad_norm": 0.004510779399424791,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 1578
+    },
+    {
+      "epoch": 0.043568085480638896,
+      "grad_norm": 0.005176417529582977,
+      "learning_rate": 0.001,
+      "loss": 0.3704,
+      "step": 1579
+    },
+    {
+      "epoch": 0.04359567768170327,
+      "grad_norm": 0.0046493723057210445,
+      "learning_rate": 0.001,
+      "loss": 0.4271,
+      "step": 1580
+    },
+    {
+      "epoch": 0.043623269882767636,
+      "grad_norm": 0.004336629528552294,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 1581
+    },
+    {
+      "epoch": 0.043650862083832,
+      "grad_norm": 0.0031178700737655163,
+      "learning_rate": 0.001,
+      "loss": 0.4275,
+      "step": 1582
+    },
+    {
+      "epoch": 0.043678454284896376,
+      "grad_norm": 0.006402260158210993,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 1583
+    },
+    {
+      "epoch": 0.04370604648596074,
+      "grad_norm": 0.004152487497776747,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 1584
+    },
+    {
+      "epoch": 0.043733638687025116,
+      "grad_norm": 0.00424406910315156,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 1585
+    },
+    {
+      "epoch": 0.04376123088808948,
+      "grad_norm": 0.005350259132683277,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 1586
+    },
+    {
+      "epoch": 0.04378882308915385,
+      "grad_norm": 0.0027786684222519398,
+      "learning_rate": 0.001,
+      "loss": 0.4478,
+      "step": 1587
+    },
+    {
+      "epoch": 0.04381641529021822,
+      "grad_norm": 0.004228509031236172,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 1588
+    },
+    {
+      "epoch": 0.04384400749128259,
+      "grad_norm": 0.0037349634803831577,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 1589
+    },
+    {
+      "epoch": 0.043871599692346955,
+      "grad_norm": 0.0034225585404783487,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 1590
+    },
+    {
+      "epoch": 0.04389919189341133,
+      "grad_norm": 0.00584405055269599,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 1591
+    },
+    {
+      "epoch": 0.043926784094475695,
+      "grad_norm": 0.004452804569154978,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 1592
+    },
+    {
+      "epoch": 0.04395437629554007,
+      "grad_norm": 0.0026068915612995625,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 1593
+    },
+    {
+      "epoch": 0.043981968496604434,
+      "grad_norm": 0.003229719353839755,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 1594
+    },
+    {
+      "epoch": 0.0440095606976688,
+      "grad_norm": 0.005484900437295437,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 1595
+    },
+    {
+      "epoch": 0.044037152898733174,
+      "grad_norm": 0.007316559553146362,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 1596
+    },
+    {
+      "epoch": 0.04406474509979754,
+      "grad_norm": 0.009250715374946594,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 1597
+    },
+    {
+      "epoch": 0.044092337300861914,
+      "grad_norm": 0.004528039135038853,
+      "learning_rate": 0.001,
+      "loss": 0.371,
+      "step": 1598
+    },
+    {
+      "epoch": 0.04411992950192628,
+      "grad_norm": 0.005715006496757269,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 1599
+    },
+    {
+      "epoch": 0.04414752170299065,
+      "grad_norm": 0.0036250154953449965,
+      "learning_rate": 0.001,
+      "loss": 0.4285,
+      "step": 1600
+    },
+    {
+      "epoch": 0.04417511390405502,
+      "grad_norm": 0.007519236300140619,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 1601
+    },
+    {
+      "epoch": 0.04420270610511939,
+      "grad_norm": 0.005943160969763994,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 1602
+    },
+    {
+      "epoch": 0.04423029830618376,
+      "grad_norm": 0.00410908367484808,
+      "learning_rate": 0.001,
+      "loss": 0.3608,
+      "step": 1603
+    },
+    {
+      "epoch": 0.04425789050724813,
+      "grad_norm": 0.004322184715420008,
+      "learning_rate": 0.001,
+      "loss": 0.4226,
+      "step": 1604
+    },
+    {
+      "epoch": 0.04428548270831249,
+      "grad_norm": 0.0035136695951223373,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 1605
+    },
+    {
+      "epoch": 0.044313074909376866,
+      "grad_norm": 0.020684808492660522,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 1606
+    },
+    {
+      "epoch": 0.04434066711044123,
+      "grad_norm": 0.00836837850511074,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 1607
+    },
+    {
+      "epoch": 0.044368259311505606,
+      "grad_norm": 0.004143499303609133,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 1608
+    },
+    {
+      "epoch": 0.04439585151256997,
+      "grad_norm": 0.005604143720120192,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 1609
+    },
+    {
+      "epoch": 0.04442344371363434,
+      "grad_norm": 0.0037680943496525288,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 1610
+    },
+    {
+      "epoch": 0.04445103591469871,
+      "grad_norm": 0.004539195913821459,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 1611
+    },
+    {
+      "epoch": 0.04447862811576308,
+      "grad_norm": 0.003528768662363291,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 1612
+    },
+    {
+      "epoch": 0.044506220316827445,
+      "grad_norm": 0.002794221742078662,
+      "learning_rate": 0.001,
+      "loss": 0.4185,
+      "step": 1613
+    },
+    {
+      "epoch": 0.04453381251789182,
+      "grad_norm": 0.003042758908122778,
+      "learning_rate": 0.001,
+      "loss": 0.4114,
+      "step": 1614
+    },
+    {
+      "epoch": 0.044561404718956185,
+      "grad_norm": 0.0024623468052595854,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 1615
+    },
+    {
+      "epoch": 0.04458899692002056,
+      "grad_norm": 0.0035985438153147697,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 1616
+    },
+    {
+      "epoch": 0.044616589121084925,
+      "grad_norm": 0.007022172212600708,
+      "learning_rate": 0.001,
+      "loss": 0.3593,
+      "step": 1617
+    },
+    {
+      "epoch": 0.04464418132214929,
+      "grad_norm": 0.003542872378602624,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 1618
+    },
+    {
+      "epoch": 0.044671773523213665,
+      "grad_norm": 0.0038910373114049435,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 1619
+    },
+    {
+      "epoch": 0.04469936572427803,
+      "grad_norm": 0.0027147457003593445,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 1620
+    },
+    {
+      "epoch": 0.044726957925342405,
+      "grad_norm": 0.002940715989097953,
+      "learning_rate": 0.001,
+      "loss": 0.4256,
+      "step": 1621
+    },
+    {
+      "epoch": 0.04475455012640677,
+      "grad_norm": 0.0030967697966843843,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 1622
+    },
+    {
+      "epoch": 0.04478214232747114,
+      "grad_norm": 0.002238066168501973,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 1623
+    },
+    {
+      "epoch": 0.04480973452853551,
+      "grad_norm": 0.003412696998566389,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 1624
+    },
+    {
+      "epoch": 0.04483732672959988,
+      "grad_norm": 0.0026292402762919664,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 1625
+    },
+    {
+      "epoch": 0.04486491893066425,
+      "grad_norm": 0.004190067294985056,
+      "learning_rate": 0.001,
+      "loss": 0.3692,
+      "step": 1626
+    },
+    {
+      "epoch": 0.04489251113172862,
+      "grad_norm": 0.0036978810094296932,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 1627
+    },
+    {
+      "epoch": 0.044920103332792984,
+      "grad_norm": 0.004438107367604971,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 1628
+    },
+    {
+      "epoch": 0.04494769553385736,
+      "grad_norm": 0.004876590799540281,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 1629
+    },
+    {
+      "epoch": 0.04497528773492172,
+      "grad_norm": 0.004272471182048321,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 1630
+    },
+    {
+      "epoch": 0.0450028799359861,
+      "grad_norm": 0.006823899690061808,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 1631
+    },
+    {
+      "epoch": 0.04503047213705046,
+      "grad_norm": 0.008064229972660542,
+      "learning_rate": 0.001,
+      "loss": 0.417,
+      "step": 1632
+    },
+    {
+      "epoch": 0.04505806433811483,
+      "grad_norm": 0.004609786439687014,
+      "learning_rate": 0.001,
+      "loss": 0.4324,
+      "step": 1633
+    },
+    {
+      "epoch": 0.0450856565391792,
+      "grad_norm": 0.0027909104246646166,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 1634
+    },
+    {
+      "epoch": 0.04511324874024357,
+      "grad_norm": 0.0036747294943779707,
+      "learning_rate": 0.001,
+      "loss": 0.3533,
+      "step": 1635
+    },
+    {
+      "epoch": 0.045140840941307936,
+      "grad_norm": 0.0037599606439471245,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 1636
+    },
+    {
+      "epoch": 0.04516843314237231,
+      "grad_norm": 0.0029045911505818367,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 1637
+    },
+    {
+      "epoch": 0.045196025343436676,
+      "grad_norm": 0.0038696962874382734,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 1638
+    },
+    {
+      "epoch": 0.04522361754450105,
+      "grad_norm": 0.004320462234318256,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 1639
+    },
+    {
+      "epoch": 0.045251209745565416,
+      "grad_norm": 0.002876073122024536,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 1640
+    },
+    {
+      "epoch": 0.04527880194662978,
+      "grad_norm": 0.002509112237021327,
+      "learning_rate": 0.001,
+      "loss": 0.4383,
+      "step": 1641
+    },
+    {
+      "epoch": 0.045306394147694155,
+      "grad_norm": 0.012423038482666016,
+      "learning_rate": 0.001,
+      "loss": 0.3616,
+      "step": 1642
+    },
+    {
+      "epoch": 0.04533398634875852,
+      "grad_norm": 0.00442110188305378,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 1643
+    },
+    {
+      "epoch": 0.045361578549822895,
+      "grad_norm": 0.003638372290879488,
+      "learning_rate": 0.001,
+      "loss": 0.4611,
+      "step": 1644
+    },
+    {
+      "epoch": 0.04538917075088726,
+      "grad_norm": 0.0037818588316440582,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 1645
+    },
+    {
+      "epoch": 0.04541676295195163,
+      "grad_norm": 0.0035038308706134558,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 1646
+    },
+    {
+      "epoch": 0.045444355153016,
+      "grad_norm": 0.003616967238485813,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 1647
+    },
+    {
+      "epoch": 0.04547194735408037,
+      "grad_norm": 0.002239943016320467,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 1648
+    },
+    {
+      "epoch": 0.04549953955514474,
+      "grad_norm": 0.0036853745114058256,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 1649
+    },
+    {
+      "epoch": 0.04552713175620911,
+      "grad_norm": 0.00262830825522542,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 1650
+    },
+    {
+      "epoch": 0.045554723957273474,
+      "grad_norm": 0.010400556027889252,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 1651
+    },
+    {
+      "epoch": 0.04558231615833785,
+      "grad_norm": 0.0026481510140001774,
+      "learning_rate": 0.001,
+      "loss": 0.3712,
+      "step": 1652
+    },
+    {
+      "epoch": 0.045609908359402214,
+      "grad_norm": 0.0029154308140277863,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 1653
+    },
+    {
+      "epoch": 0.04563750056046659,
+      "grad_norm": 0.0028925796505063772,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 1654
+    },
+    {
+      "epoch": 0.045665092761530954,
+      "grad_norm": 0.0028384907636791468,
+      "learning_rate": 0.001,
+      "loss": 0.411,
+      "step": 1655
+    },
+    {
+      "epoch": 0.04569268496259532,
+      "grad_norm": 0.0039413925260305405,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 1656
+    },
+    {
+      "epoch": 0.045720277163659694,
+      "grad_norm": 0.004098753910511732,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 1657
+    },
+    {
+      "epoch": 0.04574786936472406,
+      "grad_norm": 0.0032568899914622307,
+      "learning_rate": 0.001,
+      "loss": 0.3791,
+      "step": 1658
+    },
+    {
+      "epoch": 0.045775461565788426,
+      "grad_norm": 0.0038113740738481283,
+      "learning_rate": 0.001,
+      "loss": 0.4379,
+      "step": 1659
+    },
+    {
+      "epoch": 0.0458030537668528,
+      "grad_norm": 0.004003043286502361,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 1660
+    },
+    {
+      "epoch": 0.045830645967917166,
+      "grad_norm": 0.003037869231775403,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 1661
+    },
+    {
+      "epoch": 0.04585823816898154,
+      "grad_norm": 0.004264459945261478,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 1662
+    },
+    {
+      "epoch": 0.045885830370045906,
+      "grad_norm": 0.0026220069266855717,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 1663
+    },
+    {
+      "epoch": 0.04591342257111027,
+      "grad_norm": 0.003170250216498971,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 1664
+    },
+    {
+      "epoch": 0.045941014772174646,
+      "grad_norm": 0.004167741164565086,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 1665
+    },
+    {
+      "epoch": 0.04596860697323901,
+      "grad_norm": 0.0035946646239608526,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 1666
+    },
+    {
+      "epoch": 0.045996199174303386,
+      "grad_norm": 0.003959175664931536,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 1667
+    },
+    {
+      "epoch": 0.04602379137536775,
+      "grad_norm": 0.004025304224342108,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 1668
+    },
+    {
+      "epoch": 0.04605138357643212,
+      "grad_norm": 0.004997665528208017,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 1669
+    },
+    {
+      "epoch": 0.04607897577749649,
+      "grad_norm": 0.003882192773744464,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 1670
+    },
+    {
+      "epoch": 0.04610656797856086,
+      "grad_norm": 0.006935080513358116,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 1671
+    },
+    {
+      "epoch": 0.04613416017962523,
+      "grad_norm": 0.0038688175845891237,
+      "learning_rate": 0.001,
+      "loss": 0.3767,
+      "step": 1672
+    },
+    {
+      "epoch": 0.0461617523806896,
+      "grad_norm": 0.0037809666246175766,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 1673
+    },
+    {
+      "epoch": 0.046189344581753965,
+      "grad_norm": 0.009138336405158043,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 1674
+    },
+    {
+      "epoch": 0.04621693678281834,
+      "grad_norm": 0.004577755928039551,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 1675
+    },
+    {
+      "epoch": 0.046244528983882704,
+      "grad_norm": 0.0038164344150573015,
+      "learning_rate": 0.001,
+      "loss": 0.441,
+      "step": 1676
+    },
+    {
+      "epoch": 0.04627212118494708,
+      "grad_norm": 0.004704809281975031,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 1677
+    },
+    {
+      "epoch": 0.046299713386011444,
+      "grad_norm": 0.008598784916102886,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 1678
+    },
+    {
+      "epoch": 0.04632730558707581,
+      "grad_norm": 0.004788943100720644,
+      "learning_rate": 0.001,
+      "loss": 0.4133,
+      "step": 1679
+    },
+    {
+      "epoch": 0.046354897788140184,
+      "grad_norm": 0.0038863064255565405,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 1680
+    },
+    {
+      "epoch": 0.04638248998920455,
+      "grad_norm": 0.0029795279260724783,
+      "learning_rate": 0.001,
+      "loss": 0.4245,
+      "step": 1681
+    },
+    {
+      "epoch": 0.046410082190268924,
+      "grad_norm": 0.003151806304231286,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 1682
+    },
+    {
+      "epoch": 0.04643767439133329,
+      "grad_norm": 0.006868270225822926,
+      "learning_rate": 0.001,
+      "loss": 0.4285,
+      "step": 1683
+    },
+    {
+      "epoch": 0.04646526659239766,
+      "grad_norm": 0.00662572355940938,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 1684
+    },
+    {
+      "epoch": 0.04649285879346203,
+      "grad_norm": 0.01428454089909792,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 1685
+    },
+    {
+      "epoch": 0.0465204509945264,
+      "grad_norm": 0.006754969246685505,
+      "learning_rate": 0.001,
+      "loss": 0.3699,
+      "step": 1686
+    },
+    {
+      "epoch": 0.04654804319559076,
+      "grad_norm": 0.00340940966270864,
+      "learning_rate": 0.001,
+      "loss": 0.3674,
+      "step": 1687
+    },
+    {
+      "epoch": 0.046575635396655136,
+      "grad_norm": 0.003416246036067605,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 1688
+    },
+    {
+      "epoch": 0.0466032275977195,
+      "grad_norm": 0.0039048672188073397,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 1689
+    },
+    {
+      "epoch": 0.046630819798783876,
+      "grad_norm": 0.003685768460854888,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 1690
+    },
+    {
+      "epoch": 0.04665841199984824,
+      "grad_norm": 0.006730150897055864,
+      "learning_rate": 0.001,
+      "loss": 0.3419,
+      "step": 1691
+    },
+    {
+      "epoch": 0.04668600420091261,
+      "grad_norm": 0.004073324613273144,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 1692
+    },
+    {
+      "epoch": 0.04671359640197698,
+      "grad_norm": 0.0040067946538329124,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 1693
+    },
+    {
+      "epoch": 0.04674118860304135,
+      "grad_norm": 0.006488442420959473,
+      "learning_rate": 0.001,
+      "loss": 0.4193,
+      "step": 1694
+    },
+    {
+      "epoch": 0.04676878080410572,
+      "grad_norm": 0.006833325605839491,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 1695
+    },
+    {
+      "epoch": 0.04679637300517009,
+      "grad_norm": 0.0035828256513923407,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 1696
+    },
+    {
+      "epoch": 0.046823965206234455,
+      "grad_norm": 0.0032494564075022936,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 1697
+    },
+    {
+      "epoch": 0.04685155740729883,
+      "grad_norm": 0.0033526027109473944,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 1698
+    },
+    {
+      "epoch": 0.046879149608363195,
+      "grad_norm": 0.004358900245279074,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 1699
+    },
+    {
+      "epoch": 0.04690674180942757,
+      "grad_norm": 0.0045670876279473305,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 1700
+    },
+    {
+      "epoch": 0.046934334010491935,
+      "grad_norm": 0.004190321080386639,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 1701
+    },
+    {
+      "epoch": 0.0469619262115563,
+      "grad_norm": 0.0030443707946687937,
+      "learning_rate": 0.001,
+      "loss": 0.4293,
+      "step": 1702
+    },
+    {
+      "epoch": 0.046989518412620675,
+      "grad_norm": 0.003604897065088153,
+      "learning_rate": 0.001,
+      "loss": 0.4429,
+      "step": 1703
+    },
+    {
+      "epoch": 0.04701711061368504,
+      "grad_norm": 0.003690918907523155,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 1704
+    },
+    {
+      "epoch": 0.047044702814749415,
+      "grad_norm": 0.0034342606086283922,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 1705
+    },
+    {
+      "epoch": 0.04707229501581378,
+      "grad_norm": 0.003786922199651599,
+      "learning_rate": 0.001,
+      "loss": 0.4286,
+      "step": 1706
+    },
+    {
+      "epoch": 0.04709988721687815,
+      "grad_norm": 0.0021744174882769585,
+      "learning_rate": 0.001,
+      "loss": 0.3615,
+      "step": 1707
+    },
+    {
+      "epoch": 0.04712747941794252,
+      "grad_norm": 0.004297124687582254,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 1708
+    },
+    {
+      "epoch": 0.04715507161900689,
+      "grad_norm": 0.003097895532846451,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 1709
+    },
+    {
+      "epoch": 0.047182663820071254,
+      "grad_norm": 0.0027625923976302147,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 1710
+    },
+    {
+      "epoch": 0.04721025602113563,
+      "grad_norm": 0.024495547637343407,
+      "learning_rate": 0.001,
+      "loss": 0.3727,
+      "step": 1711
+    },
+    {
+      "epoch": 0.04723784822219999,
+      "grad_norm": 0.006063805893063545,
+      "learning_rate": 0.001,
+      "loss": 0.4558,
+      "step": 1712
+    },
+    {
+      "epoch": 0.04726544042326437,
+      "grad_norm": 0.002210379345342517,
+      "learning_rate": 0.001,
+      "loss": 0.4529,
+      "step": 1713
+    },
+    {
+      "epoch": 0.04729303262432873,
+      "grad_norm": 0.002300212625414133,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 1714
+    },
+    {
+      "epoch": 0.0473206248253931,
+      "grad_norm": 0.002808920806273818,
+      "learning_rate": 0.001,
+      "loss": 0.4337,
+      "step": 1715
+    },
+    {
+      "epoch": 0.04734821702645747,
+      "grad_norm": 0.0031099985353648663,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 1716
+    },
+    {
+      "epoch": 0.04737580922752184,
+      "grad_norm": 0.003029707819223404,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 1717
+    },
+    {
+      "epoch": 0.04740340142858621,
+      "grad_norm": 0.005905452184379101,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 1718
+    },
+    {
+      "epoch": 0.04743099362965058,
+      "grad_norm": 0.002499626949429512,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 1719
+    },
+    {
+      "epoch": 0.047458585830714946,
+      "grad_norm": 0.0029100440442562103,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 1720
+    },
+    {
+      "epoch": 0.04748617803177932,
+      "grad_norm": 0.0029877678025513887,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 1721
+    },
+    {
+      "epoch": 0.047513770232843686,
+      "grad_norm": 0.0023172239307314157,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 1722
+    },
+    {
+      "epoch": 0.04754136243390806,
+      "grad_norm": 0.0031008669175207615,
+      "learning_rate": 0.001,
+      "loss": 0.3727,
+      "step": 1723
+    },
+    {
+      "epoch": 0.047568954634972425,
+      "grad_norm": 0.003588200779631734,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 1724
+    },
+    {
+      "epoch": 0.04759654683603679,
+      "grad_norm": 0.0027301576919853687,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 1725
+    },
+    {
+      "epoch": 0.047624139037101165,
+      "grad_norm": 0.003348779398947954,
+      "learning_rate": 0.001,
+      "loss": 0.3554,
+      "step": 1726
+    },
+    {
+      "epoch": 0.04765173123816553,
+      "grad_norm": 0.003580378834158182,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 1727
+    },
+    {
+      "epoch": 0.047679323439229905,
+      "grad_norm": 0.004255024716258049,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 1728
+    },
+    {
+      "epoch": 0.04770691564029427,
+      "grad_norm": 0.0033867263700813055,
+      "learning_rate": 0.001,
+      "loss": 0.3654,
+      "step": 1729
+    },
+    {
+      "epoch": 0.04773450784135864,
+      "grad_norm": 0.005044759716838598,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 1730
+    },
+    {
+      "epoch": 0.04776210004242301,
+      "grad_norm": 0.0034105442464351654,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 1731
+    },
+    {
+      "epoch": 0.04778969224348738,
+      "grad_norm": 0.004883192479610443,
+      "learning_rate": 0.001,
+      "loss": 0.4679,
+      "step": 1732
+    },
+    {
+      "epoch": 0.047817284444551744,
+      "grad_norm": 0.003439564723521471,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 1733
+    },
+    {
+      "epoch": 0.04784487664561612,
+      "grad_norm": 0.006362561602145433,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 1734
+    },
+    {
+      "epoch": 0.047872468846680484,
+      "grad_norm": 0.004292085766792297,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 1735
+    },
+    {
+      "epoch": 0.04790006104774486,
+      "grad_norm": 0.00374743458814919,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 1736
+    },
+    {
+      "epoch": 0.047927653248809224,
+      "grad_norm": 0.004549616016447544,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 1737
+    },
+    {
+      "epoch": 0.04795524544987359,
+      "grad_norm": 0.008672794327139854,
+      "learning_rate": 0.001,
+      "loss": 0.3664,
+      "step": 1738
+    },
+    {
+      "epoch": 0.047982837650937964,
+      "grad_norm": 0.007166683673858643,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 1739
+    },
+    {
+      "epoch": 0.04801042985200233,
+      "grad_norm": 0.0030777885112911463,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 1740
+    },
+    {
+      "epoch": 0.048038022053066703,
+      "grad_norm": 0.0049812328070402145,
+      "learning_rate": 0.001,
+      "loss": 0.424,
+      "step": 1741
+    },
+    {
+      "epoch": 0.04806561425413107,
+      "grad_norm": 0.005072145257145166,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 1742
+    },
+    {
+      "epoch": 0.048093206455195436,
+      "grad_norm": 0.004582617431879044,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 1743
+    },
+    {
+      "epoch": 0.04812079865625981,
+      "grad_norm": 0.004273936152458191,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 1744
+    },
+    {
+      "epoch": 0.048148390857324176,
+      "grad_norm": 0.003426861949265003,
+      "learning_rate": 0.001,
+      "loss": 0.3663,
+      "step": 1745
+    },
+    {
+      "epoch": 0.04817598305838855,
+      "grad_norm": 0.0036460154224187136,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 1746
+    },
+    {
+      "epoch": 0.048203575259452916,
+      "grad_norm": 0.003482312895357609,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 1747
+    },
+    {
+      "epoch": 0.04823116746051728,
+      "grad_norm": 0.0033532571978867054,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 1748
+    },
+    {
+      "epoch": 0.048258759661581656,
+      "grad_norm": 0.0038984876591712236,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 1749
+    },
+    {
+      "epoch": 0.04828635186264602,
+      "grad_norm": 0.005925590638071299,
+      "learning_rate": 0.001,
+      "loss": 0.3672,
+      "step": 1750
+    },
+    {
+      "epoch": 0.048313944063710396,
+      "grad_norm": 0.005235752090811729,
+      "learning_rate": 0.001,
+      "loss": 0.3777,
+      "step": 1751
+    },
+    {
+      "epoch": 0.04834153626477476,
+      "grad_norm": 0.00452176108956337,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 1752
+    },
+    {
+      "epoch": 0.04836912846583913,
+      "grad_norm": 0.004116971977055073,
+      "learning_rate": 0.001,
+      "loss": 0.3719,
+      "step": 1753
+    },
+    {
+      "epoch": 0.0483967206669035,
+      "grad_norm": 0.0037535110022872686,
+      "learning_rate": 0.001,
+      "loss": 0.3643,
+      "step": 1754
+    },
+    {
+      "epoch": 0.04842431286796787,
+      "grad_norm": 0.0049346694722771645,
+      "learning_rate": 0.001,
+      "loss": 0.3676,
+      "step": 1755
+    },
+    {
+      "epoch": 0.048451905069032235,
+      "grad_norm": 0.006009018048644066,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 1756
+    },
+    {
+      "epoch": 0.04847949727009661,
+      "grad_norm": 0.003077111905440688,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 1757
+    },
+    {
+      "epoch": 0.048507089471160975,
+      "grad_norm": 0.004866272211074829,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 1758
+    },
+    {
+      "epoch": 0.04853468167222535,
+      "grad_norm": 0.004328357521444559,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 1759
+    },
+    {
+      "epoch": 0.048562273873289714,
+      "grad_norm": 0.003560771932825446,
+      "learning_rate": 0.001,
+      "loss": 0.4317,
+      "step": 1760
+    },
+    {
+      "epoch": 0.04858986607435408,
+      "grad_norm": 0.00306741357780993,
+      "learning_rate": 0.001,
+      "loss": 0.4185,
+      "step": 1761
+    },
+    {
+      "epoch": 0.048617458275418454,
+      "grad_norm": 0.010002429597079754,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 1762
+    },
+    {
+      "epoch": 0.04864505047648282,
+      "grad_norm": 0.006219691131263971,
+      "learning_rate": 0.001,
+      "loss": 0.3574,
+      "step": 1763
+    },
+    {
+      "epoch": 0.048672642677547194,
+      "grad_norm": 0.0033367029391229153,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 1764
+    },
+    {
+      "epoch": 0.04870023487861156,
+      "grad_norm": 0.002981371246278286,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 1765
+    },
+    {
+      "epoch": 0.04872782707967593,
+      "grad_norm": 0.004406462889164686,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 1766
+    },
+    {
+      "epoch": 0.0487554192807403,
+      "grad_norm": 0.0028726779855787754,
+      "learning_rate": 0.001,
+      "loss": 0.3537,
+      "step": 1767
+    },
+    {
+      "epoch": 0.04878301148180467,
+      "grad_norm": 0.0029508452862501144,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 1768
+    },
+    {
+      "epoch": 0.04881060368286904,
+      "grad_norm": 0.004091752227395773,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 1769
+    },
+    {
+      "epoch": 0.04883819588393341,
+      "grad_norm": 0.0027895302046090364,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 1770
+    },
+    {
+      "epoch": 0.04886578808499777,
+      "grad_norm": 0.003976823296397924,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 1771
+    },
+    {
+      "epoch": 0.048893380286062146,
+      "grad_norm": 0.0030748520512133837,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 1772
+    },
+    {
+      "epoch": 0.04892097248712651,
+      "grad_norm": 0.005313929636031389,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 1773
+    },
+    {
+      "epoch": 0.048948564688190886,
+      "grad_norm": 0.003955441992729902,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 1774
+    },
+    {
+      "epoch": 0.04897615688925525,
+      "grad_norm": 0.007145262788981199,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 1775
+    },
+    {
+      "epoch": 0.04900374909031962,
+      "grad_norm": 0.003774230368435383,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 1776
+    },
+    {
+      "epoch": 0.04903134129138399,
+      "grad_norm": 0.002694410039111972,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 1777
+    },
+    {
+      "epoch": 0.04905893349244836,
+      "grad_norm": 0.003494556527584791,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 1778
+    },
+    {
+      "epoch": 0.049086525693512725,
+      "grad_norm": 0.0033225903753191233,
+      "learning_rate": 0.001,
+      "loss": 0.3659,
+      "step": 1779
+    },
+    {
+      "epoch": 0.0491141178945771,
+      "grad_norm": 0.0031309863552451134,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 1780
+    },
+    {
+      "epoch": 0.049141710095641465,
+      "grad_norm": 0.0028971245046705008,
+      "learning_rate": 0.001,
+      "loss": 0.3719,
+      "step": 1781
+    },
+    {
+      "epoch": 0.04916930229670584,
+      "grad_norm": 0.006014004349708557,
+      "learning_rate": 0.001,
+      "loss": 0.3521,
+      "step": 1782
+    },
+    {
+      "epoch": 0.049196894497770205,
+      "grad_norm": 0.002988451160490513,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 1783
+    },
+    {
+      "epoch": 0.04922448669883457,
+      "grad_norm": 0.0036536797415465117,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 1784
+    },
+    {
+      "epoch": 0.049252078899898945,
+      "grad_norm": 0.00857572816312313,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 1785
+    },
+    {
+      "epoch": 0.04927967110096331,
+      "grad_norm": 0.003724129404872656,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 1786
+    },
+    {
+      "epoch": 0.049307263302027685,
+      "grad_norm": 0.004292929545044899,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 1787
+    },
+    {
+      "epoch": 0.04933485550309205,
+      "grad_norm": 0.004144448321312666,
+      "learning_rate": 0.001,
+      "loss": 0.3601,
+      "step": 1788
+    },
+    {
+      "epoch": 0.04936244770415642,
+      "grad_norm": 0.002937472425401211,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 1789
+    },
+    {
+      "epoch": 0.04939003990522079,
+      "grad_norm": 0.0032268317881971598,
+      "learning_rate": 0.001,
+      "loss": 0.3306,
+      "step": 1790
+    },
+    {
+      "epoch": 0.04941763210628516,
+      "grad_norm": 0.0034790022764354944,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 1791
+    },
+    {
+      "epoch": 0.04944522430734953,
+      "grad_norm": 0.004379100166261196,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 1792
+    },
+    {
+      "epoch": 0.0494728165084139,
+      "grad_norm": 0.00435485178604722,
+      "learning_rate": 0.001,
+      "loss": 0.361,
+      "step": 1793
+    },
+    {
+      "epoch": 0.049500408709478264,
+      "grad_norm": 0.0028884608764201403,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 1794
+    },
+    {
+      "epoch": 0.04952800091054264,
+      "grad_norm": 0.0035733478143811226,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 1795
+    },
+    {
+      "epoch": 0.049555593111607,
+      "grad_norm": 0.003197557758539915,
+      "learning_rate": 0.001,
+      "loss": 0.4283,
+      "step": 1796
+    },
+    {
+      "epoch": 0.04958318531267138,
+      "grad_norm": 0.012362437322735786,
+      "learning_rate": 0.001,
+      "loss": 0.4401,
+      "step": 1797
+    },
+    {
+      "epoch": 0.04961077751373574,
+      "grad_norm": 0.005861248355358839,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 1798
+    },
+    {
+      "epoch": 0.04963836971480011,
+      "grad_norm": 0.0055047376081347466,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 1799
+    },
+    {
+      "epoch": 0.04966596191586448,
+      "grad_norm": 0.005381823051720858,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 1800
+    },
+    {
+      "epoch": 0.04969355411692885,
+      "grad_norm": 0.003702189540490508,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 1801
+    },
+    {
+      "epoch": 0.049721146317993216,
+      "grad_norm": 0.005075744818896055,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 1802
+    },
+    {
+      "epoch": 0.04974873851905759,
+      "grad_norm": 0.0033848159946501255,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 1803
+    },
+    {
+      "epoch": 0.049776330720121956,
+      "grad_norm": 0.007472567725926638,
+      "learning_rate": 0.001,
+      "loss": 0.3677,
+      "step": 1804
+    },
+    {
+      "epoch": 0.04980392292118633,
+      "grad_norm": 0.005941023584455252,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 1805
+    },
+    {
+      "epoch": 0.049831515122250696,
+      "grad_norm": 0.00963501911610365,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 1806
+    },
+    {
+      "epoch": 0.04985910732331506,
+      "grad_norm": 0.017620790749788284,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 1807
+    },
+    {
+      "epoch": 0.049886699524379435,
+      "grad_norm": 0.0027190614491701126,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 1808
+    },
+    {
+      "epoch": 0.0499142917254438,
+      "grad_norm": 0.003102682065218687,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 1809
+    },
+    {
+      "epoch": 0.049941883926508175,
+      "grad_norm": 0.0033104156609624624,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 1810
+    },
+    {
+      "epoch": 0.04996947612757254,
+      "grad_norm": 0.0038089246954768896,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 1811
+    },
+    {
+      "epoch": 0.04999706832863691,
+      "grad_norm": 0.005627461709082127,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 1812
+    },
+    {
+      "epoch": 0.05002466052970128,
+      "grad_norm": 0.0027909004129469395,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 1813
+    },
+    {
+      "epoch": 0.05005225273076565,
+      "grad_norm": 0.0031870375387370586,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 1814
+    },
+    {
+      "epoch": 0.05007984493183002,
+      "grad_norm": 0.0030249350238591433,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 1815
+    },
+    {
+      "epoch": 0.05010743713289439,
+      "grad_norm": 0.0051695844158530235,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 1816
+    },
+    {
+      "epoch": 0.050135029333958754,
+      "grad_norm": 0.004010764416307211,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 1817
+    },
+    {
+      "epoch": 0.05016262153502313,
+      "grad_norm": 0.004209370352327824,
+      "learning_rate": 0.001,
+      "loss": 0.3638,
+      "step": 1818
+    },
+    {
+      "epoch": 0.050190213736087494,
+      "grad_norm": 0.004149060230702162,
+      "learning_rate": 0.001,
+      "loss": 0.3765,
+      "step": 1819
+    },
+    {
+      "epoch": 0.05021780593715187,
+      "grad_norm": 0.02054491639137268,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 1820
+    },
+    {
+      "epoch": 0.050245398138216234,
+      "grad_norm": 0.017421457916498184,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 1821
+    },
+    {
+      "epoch": 0.0502729903392806,
+      "grad_norm": 0.002541644498705864,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 1822
+    },
+    {
+      "epoch": 0.050300582540344974,
+      "grad_norm": 0.0031819387804716825,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 1823
+    },
+    {
+      "epoch": 0.05032817474140934,
+      "grad_norm": 0.004389981739223003,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 1824
+    },
+    {
+      "epoch": 0.050355766942473706,
+      "grad_norm": 0.0037767868489027023,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 1825
+    },
+    {
+      "epoch": 0.05038335914353808,
+      "grad_norm": 0.0033333280589431524,
+      "learning_rate": 0.001,
+      "loss": 0.4304,
+      "step": 1826
+    },
+    {
+      "epoch": 0.050410951344602446,
+      "grad_norm": 0.0033245261292904615,
+      "learning_rate": 0.001,
+      "loss": 0.4512,
+      "step": 1827
+    },
+    {
+      "epoch": 0.05043854354566682,
+      "grad_norm": 0.005083107389509678,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 1828
+    },
+    {
+      "epoch": 0.050466135746731186,
+      "grad_norm": 0.002904941327869892,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 1829
+    },
+    {
+      "epoch": 0.05049372794779555,
+      "grad_norm": 0.0927748754620552,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 1830
+    },
+    {
+      "epoch": 0.050521320148859926,
+      "grad_norm": 0.0028325989842414856,
+      "learning_rate": 0.001,
+      "loss": 0.4329,
+      "step": 1831
+    },
+    {
+      "epoch": 0.05054891234992429,
+      "grad_norm": 0.005994097795337439,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 1832
+    },
+    {
+      "epoch": 0.050576504550988666,
+      "grad_norm": 0.0032757038716226816,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 1833
+    },
+    {
+      "epoch": 0.05060409675205303,
+      "grad_norm": 0.0055892630480229855,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 1834
+    },
+    {
+      "epoch": 0.0506316889531174,
+      "grad_norm": 0.007300201803445816,
+      "learning_rate": 0.001,
+      "loss": 0.4304,
+      "step": 1835
+    },
+    {
+      "epoch": 0.05065928115418177,
+      "grad_norm": 0.004705760162323713,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 1836
+    },
+    {
+      "epoch": 0.05068687335524614,
+      "grad_norm": 0.0038876638282090425,
+      "learning_rate": 0.001,
+      "loss": 0.4482,
+      "step": 1837
+    },
+    {
+      "epoch": 0.05071446555631051,
+      "grad_norm": 0.0049439528957009315,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 1838
+    },
+    {
+      "epoch": 0.05074205775737488,
+      "grad_norm": 0.00367721077054739,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 1839
+    },
+    {
+      "epoch": 0.050769649958439245,
+      "grad_norm": 0.0021761716343462467,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 1840
+    },
+    {
+      "epoch": 0.05079724215950362,
+      "grad_norm": 0.003433308796957135,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 1841
+    },
+    {
+      "epoch": 0.050824834360567984,
+      "grad_norm": 0.0041929068975150585,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 1842
+    },
+    {
+      "epoch": 0.05085242656163236,
+      "grad_norm": 0.003800287377089262,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 1843
+    },
+    {
+      "epoch": 0.050880018762696724,
+      "grad_norm": 0.00222361390478909,
+      "learning_rate": 0.001,
+      "loss": 0.3766,
+      "step": 1844
+    },
+    {
+      "epoch": 0.05090761096376109,
+      "grad_norm": 0.010053042322397232,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 1845
+    },
+    {
+      "epoch": 0.050935203164825464,
+      "grad_norm": 0.002249652985483408,
+      "learning_rate": 0.001,
+      "loss": 0.4388,
+      "step": 1846
+    },
+    {
+      "epoch": 0.05096279536588983,
+      "grad_norm": 0.0034179852809756994,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 1847
+    },
+    {
+      "epoch": 0.0509903875669542,
+      "grad_norm": 0.0027194637805223465,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 1848
+    },
+    {
+      "epoch": 0.05101797976801857,
+      "grad_norm": 0.00189922412391752,
+      "learning_rate": 0.001,
+      "loss": 0.45,
+      "step": 1849
+    },
+    {
+      "epoch": 0.05104557196908294,
+      "grad_norm": 0.0029718675650656223,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 1850
+    },
+    {
+      "epoch": 0.05107316417014731,
+      "grad_norm": 0.002118749311193824,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 1851
+    },
+    {
+      "epoch": 0.05110075637121168,
+      "grad_norm": 0.001921525807119906,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 1852
+    },
+    {
+      "epoch": 0.05112834857227604,
+      "grad_norm": 0.002200416987761855,
+      "learning_rate": 0.001,
+      "loss": 0.4341,
+      "step": 1853
+    },
+    {
+      "epoch": 0.051155940773340416,
+      "grad_norm": 0.004263362381607294,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 1854
+    },
+    {
+      "epoch": 0.05118353297440478,
+      "grad_norm": 0.002614164724946022,
+      "learning_rate": 0.001,
+      "loss": 0.4168,
+      "step": 1855
+    },
+    {
+      "epoch": 0.051211125175469156,
+      "grad_norm": 0.002143037738278508,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 1856
+    },
+    {
+      "epoch": 0.05123871737653352,
+      "grad_norm": 0.0025298171676695347,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 1857
+    },
+    {
+      "epoch": 0.05126630957759789,
+      "grad_norm": 0.0026806145906448364,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 1858
+    },
+    {
+      "epoch": 0.05129390177866226,
+      "grad_norm": 0.0022738908883184195,
+      "learning_rate": 0.001,
+      "loss": 0.4439,
+      "step": 1859
+    },
+    {
+      "epoch": 0.05132149397972663,
+      "grad_norm": 0.0023423023521900177,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 1860
+    },
+    {
+      "epoch": 0.051349086180791,
+      "grad_norm": 0.0033382964320480824,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 1861
+    },
+    {
+      "epoch": 0.05137667838185537,
+      "grad_norm": 0.002349057700484991,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 1862
+    },
+    {
+      "epoch": 0.051404270582919735,
+      "grad_norm": 0.0029646153561770916,
+      "learning_rate": 0.001,
+      "loss": 0.3665,
+      "step": 1863
+    },
+    {
+      "epoch": 0.05143186278398411,
+      "grad_norm": 0.003112231148406863,
+      "learning_rate": 0.001,
+      "loss": 0.3745,
+      "step": 1864
+    },
+    {
+      "epoch": 0.051459454985048475,
+      "grad_norm": 0.006748152896761894,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 1865
+    },
+    {
+      "epoch": 0.05148704718611285,
+      "grad_norm": 0.003962017595767975,
+      "learning_rate": 0.001,
+      "loss": 0.3669,
+      "step": 1866
+    },
+    {
+      "epoch": 0.051514639387177215,
+      "grad_norm": 0.0032168615143746138,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 1867
+    },
+    {
+      "epoch": 0.05154223158824158,
+      "grad_norm": 0.00549092423170805,
+      "learning_rate": 0.001,
+      "loss": 0.3736,
+      "step": 1868
+    },
+    {
+      "epoch": 0.051569823789305955,
+      "grad_norm": 0.005641186144202948,
+      "learning_rate": 0.001,
+      "loss": 0.373,
+      "step": 1869
+    },
+    {
+      "epoch": 0.05159741599037032,
+      "grad_norm": 0.0029503426048904657,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 1870
+    },
+    {
+      "epoch": 0.05162500819143469,
+      "grad_norm": 0.002859857166185975,
+      "learning_rate": 0.001,
+      "loss": 0.4275,
+      "step": 1871
+    },
+    {
+      "epoch": 0.05165260039249906,
+      "grad_norm": 0.0032987252343446016,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 1872
+    },
+    {
+      "epoch": 0.05168019259356343,
+      "grad_norm": 0.002959759905934334,
+      "learning_rate": 0.001,
+      "loss": 0.361,
+      "step": 1873
+    },
+    {
+      "epoch": 0.0517077847946278,
+      "grad_norm": 0.00282686366699636,
+      "learning_rate": 0.001,
+      "loss": 0.3661,
+      "step": 1874
+    },
+    {
+      "epoch": 0.05173537699569217,
+      "grad_norm": 0.003676011925563216,
+      "learning_rate": 0.001,
+      "loss": 0.418,
+      "step": 1875
+    },
+    {
+      "epoch": 0.051762969196756534,
+      "grad_norm": 0.0026158930268138647,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 1876
+    },
+    {
+      "epoch": 0.05179056139782091,
+      "grad_norm": 0.0024638089817017317,
+      "learning_rate": 0.001,
+      "loss": 0.4249,
+      "step": 1877
+    },
+    {
+      "epoch": 0.05181815359888527,
+      "grad_norm": 0.003338116453960538,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 1878
+    },
+    {
+      "epoch": 0.05184574579994965,
+      "grad_norm": 0.004489647690206766,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 1879
+    },
+    {
+      "epoch": 0.05187333800101401,
+      "grad_norm": 0.002798578003421426,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 1880
+    },
+    {
+      "epoch": 0.05190093020207838,
+      "grad_norm": 0.0029343098867684603,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 1881
+    },
+    {
+      "epoch": 0.05192852240314275,
+      "grad_norm": 0.003431658959016204,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 1882
+    },
+    {
+      "epoch": 0.05195611460420712,
+      "grad_norm": 0.002329075476154685,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 1883
+    },
+    {
+      "epoch": 0.05198370680527149,
+      "grad_norm": 0.002889421069994569,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 1884
+    },
+    {
+      "epoch": 0.05201129900633586,
+      "grad_norm": 0.003102013608440757,
+      "learning_rate": 0.001,
+      "loss": 0.4429,
+      "step": 1885
+    },
+    {
+      "epoch": 0.052038891207400226,
+      "grad_norm": 0.0030128401704132557,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 1886
+    },
+    {
+      "epoch": 0.0520664834084646,
+      "grad_norm": 0.002882935106754303,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 1887
+    },
+    {
+      "epoch": 0.052094075609528966,
+      "grad_norm": 0.0033189402893185616,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 1888
+    },
+    {
+      "epoch": 0.05212166781059334,
+      "grad_norm": 0.003128107637166977,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 1889
+    },
+    {
+      "epoch": 0.052149260011657705,
+      "grad_norm": 0.0028161504305899143,
+      "learning_rate": 0.001,
+      "loss": 0.4362,
+      "step": 1890
+    },
+    {
+      "epoch": 0.05217685221272207,
+      "grad_norm": 0.00556217972189188,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 1891
+    },
+    {
+      "epoch": 0.052204444413786445,
+      "grad_norm": 0.006903808563947678,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 1892
+    },
+    {
+      "epoch": 0.05223203661485081,
+      "grad_norm": 0.008768963627517223,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 1893
+    },
+    {
+      "epoch": 0.05225962881591518,
+      "grad_norm": 0.0038300056476145983,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 1894
+    },
+    {
+      "epoch": 0.05228722101697955,
+      "grad_norm": 0.005089603364467621,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 1895
+    },
+    {
+      "epoch": 0.05231481321804392,
+      "grad_norm": 0.004371874965727329,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 1896
+    },
+    {
+      "epoch": 0.05234240541910829,
+      "grad_norm": 0.0037384675815701485,
+      "learning_rate": 0.001,
+      "loss": 0.4168,
+      "step": 1897
+    },
+    {
+      "epoch": 0.05236999762017266,
+      "grad_norm": 0.012704477645456791,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 1898
+    },
+    {
+      "epoch": 0.052397589821237024,
+      "grad_norm": 0.004448601044714451,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 1899
+    },
+    {
+      "epoch": 0.0524251820223014,
+      "grad_norm": 0.004198121372610331,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 1900
+    },
+    {
+      "epoch": 0.052452774223365764,
+      "grad_norm": 0.008598609827458858,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 1901
+    },
+    {
+      "epoch": 0.05248036642443014,
+      "grad_norm": 0.005187192931771278,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 1902
+    },
+    {
+      "epoch": 0.052507958625494504,
+      "grad_norm": 0.0033852029591798782,
+      "learning_rate": 0.001,
+      "loss": 0.418,
+      "step": 1903
+    },
+    {
+      "epoch": 0.05253555082655887,
+      "grad_norm": 0.004068314563483,
+      "learning_rate": 0.001,
+      "loss": 0.4299,
+      "step": 1904
+    },
+    {
+      "epoch": 0.052563143027623244,
+      "grad_norm": 0.004143074620515108,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 1905
+    },
+    {
+      "epoch": 0.05259073522868761,
+      "grad_norm": 0.004285483155399561,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 1906
+    },
+    {
+      "epoch": 0.05261832742975198,
+      "grad_norm": 0.004647474270313978,
+      "learning_rate": 0.001,
+      "loss": 0.3652,
+      "step": 1907
+    },
+    {
+      "epoch": 0.05264591963081635,
+      "grad_norm": 0.002305620349943638,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 1908
+    },
+    {
+      "epoch": 0.052673511831880716,
+      "grad_norm": 0.004161974415183067,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 1909
+    },
+    {
+      "epoch": 0.05270110403294509,
+      "grad_norm": 0.004659401252865791,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 1910
+    },
+    {
+      "epoch": 0.052728696234009456,
+      "grad_norm": 0.003204295178875327,
+      "learning_rate": 0.001,
+      "loss": 0.3579,
+      "step": 1911
+    },
+    {
+      "epoch": 0.05275628843507383,
+      "grad_norm": 0.002777168760076165,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 1912
+    },
+    {
+      "epoch": 0.052783880636138196,
+      "grad_norm": 0.003622923046350479,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 1913
+    },
+    {
+      "epoch": 0.05281147283720256,
+      "grad_norm": 0.003526994027197361,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 1914
+    },
+    {
+      "epoch": 0.052839065038266936,
+      "grad_norm": 0.0032004155218601227,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 1915
+    },
+    {
+      "epoch": 0.0528666572393313,
+      "grad_norm": 0.003097831504419446,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 1916
+    },
+    {
+      "epoch": 0.052894249440395676,
+      "grad_norm": 0.0046899812296032906,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 1917
+    },
+    {
+      "epoch": 0.05292184164146004,
+      "grad_norm": 0.00434432877227664,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 1918
+    },
+    {
+      "epoch": 0.05294943384252441,
+      "grad_norm": 0.0034439517185091972,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 1919
+    },
+    {
+      "epoch": 0.05297702604358878,
+      "grad_norm": 0.0031842549797147512,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 1920
+    },
+    {
+      "epoch": 0.05300461824465315,
+      "grad_norm": 0.002607315080240369,
+      "learning_rate": 0.001,
+      "loss": 0.4513,
+      "step": 1921
+    },
+    {
+      "epoch": 0.053032210445717515,
+      "grad_norm": 0.003620088566094637,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 1922
+    },
+    {
+      "epoch": 0.05305980264678189,
+      "grad_norm": 0.0034968543332070112,
+      "learning_rate": 0.001,
+      "loss": 0.3656,
+      "step": 1923
+    },
+    {
+      "epoch": 0.053087394847846255,
+      "grad_norm": 0.0023364322260022163,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 1924
+    },
+    {
+      "epoch": 0.05311498704891063,
+      "grad_norm": 0.002855852944776416,
+      "learning_rate": 0.001,
+      "loss": 0.4145,
+      "step": 1925
+    },
+    {
+      "epoch": 0.053142579249974994,
+      "grad_norm": 0.0027531140949577093,
+      "learning_rate": 0.001,
+      "loss": 0.3575,
+      "step": 1926
+    },
+    {
+      "epoch": 0.05317017145103936,
+      "grad_norm": 0.0032164284493774176,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 1927
+    },
+    {
+      "epoch": 0.053197763652103734,
+      "grad_norm": 0.002508921315893531,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 1928
+    },
+    {
+      "epoch": 0.0532253558531681,
+      "grad_norm": 0.0026788045652210712,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 1929
+    },
+    {
+      "epoch": 0.053252948054232474,
+      "grad_norm": 0.0051200552843511105,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 1930
+    },
+    {
+      "epoch": 0.05328054025529684,
+      "grad_norm": 0.004671086091548204,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 1931
+    },
+    {
+      "epoch": 0.05330813245636121,
+      "grad_norm": 0.0028699622489511967,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 1932
+    },
+    {
+      "epoch": 0.05333572465742558,
+      "grad_norm": 0.003617132781073451,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 1933
+    },
+    {
+      "epoch": 0.05336331685848995,
+      "grad_norm": 0.0036757884081453085,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 1934
+    },
+    {
+      "epoch": 0.05339090905955432,
+      "grad_norm": 0.002974196569994092,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 1935
+    },
+    {
+      "epoch": 0.05341850126061869,
+      "grad_norm": 0.0034452476538717747,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 1936
+    },
+    {
+      "epoch": 0.05344609346168305,
+      "grad_norm": 0.010058878920972347,
+      "learning_rate": 0.001,
+      "loss": 0.4216,
+      "step": 1937
+    },
+    {
+      "epoch": 0.053473685662747426,
+      "grad_norm": 0.007609996944665909,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 1938
+    },
+    {
+      "epoch": 0.05350127786381179,
+      "grad_norm": 0.006538870744407177,
+      "learning_rate": 0.001,
+      "loss": 0.4252,
+      "step": 1939
+    },
+    {
+      "epoch": 0.053528870064876166,
+      "grad_norm": 0.003294197143986821,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 1940
+    },
+    {
+      "epoch": 0.05355646226594053,
+      "grad_norm": 0.0027565581258386374,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 1941
+    },
+    {
+      "epoch": 0.0535840544670049,
+      "grad_norm": 0.0027171720284968615,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 1942
+    },
+    {
+      "epoch": 0.05361164666806927,
+      "grad_norm": 0.00246808142401278,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 1943
+    },
+    {
+      "epoch": 0.05363923886913364,
+      "grad_norm": 0.0030862221028655767,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 1944
+    },
+    {
+      "epoch": 0.053666831070198005,
+      "grad_norm": 0.002278526546433568,
+      "learning_rate": 0.001,
+      "loss": 0.4429,
+      "step": 1945
+    },
+    {
+      "epoch": 0.05369442327126238,
+      "grad_norm": 0.002600095234811306,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 1946
+    },
+    {
+      "epoch": 0.053722015472326745,
+      "grad_norm": 0.0028381014708429575,
+      "learning_rate": 0.001,
+      "loss": 0.4161,
+      "step": 1947
+    },
+    {
+      "epoch": 0.05374960767339112,
+      "grad_norm": 0.0026186273898929358,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 1948
+    },
+    {
+      "epoch": 0.053777199874455485,
+      "grad_norm": 0.0026584621518850327,
+      "learning_rate": 0.001,
+      "loss": 0.445,
+      "step": 1949
+    },
+    {
+      "epoch": 0.05380479207551985,
+      "grad_norm": 0.002734116045758128,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 1950
+    },
+    {
+      "epoch": 0.053832384276584225,
+      "grad_norm": 0.0053114392794668674,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 1951
+    },
+    {
+      "epoch": 0.05385997647764859,
+      "grad_norm": 0.005844905972480774,
+      "learning_rate": 0.001,
+      "loss": 0.3789,
+      "step": 1952
+    },
+    {
+      "epoch": 0.053887568678712965,
+      "grad_norm": 0.002963895909488201,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 1953
+    },
+    {
+      "epoch": 0.05391516087977733,
+      "grad_norm": 0.004491107538342476,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 1954
+    },
+    {
+      "epoch": 0.0539427530808417,
+      "grad_norm": 0.003358663059771061,
+      "learning_rate": 0.001,
+      "loss": 0.4336,
+      "step": 1955
+    },
+    {
+      "epoch": 0.05397034528190607,
+      "grad_norm": 0.003177732229232788,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 1956
+    },
+    {
+      "epoch": 0.05399793748297044,
+      "grad_norm": 0.0027129724621772766,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 1957
+    },
+    {
+      "epoch": 0.05402552968403481,
+      "grad_norm": 0.00669928640127182,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 1958
+    },
+    {
+      "epoch": 0.05405312188509918,
+      "grad_norm": 0.0033415532670915127,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 1959
+    },
+    {
+      "epoch": 0.054080714086163544,
+      "grad_norm": 0.0033846443984657526,
+      "learning_rate": 0.001,
+      "loss": 0.4469,
+      "step": 1960
+    },
+    {
+      "epoch": 0.05410830628722792,
+      "grad_norm": 0.0034599697683006525,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 1961
+    },
+    {
+      "epoch": 0.05413589848829228,
+      "grad_norm": 0.004474925808608532,
+      "learning_rate": 0.001,
+      "loss": 0.367,
+      "step": 1962
+    },
+    {
+      "epoch": 0.05416349068935666,
+      "grad_norm": 0.005082536954432726,
+      "learning_rate": 0.001,
+      "loss": 0.3704,
+      "step": 1963
+    },
+    {
+      "epoch": 0.05419108289042102,
+      "grad_norm": 0.008428883738815784,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 1964
+    },
+    {
+      "epoch": 0.05421867509148539,
+      "grad_norm": 0.0027383146807551384,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 1965
+    },
+    {
+      "epoch": 0.05424626729254976,
+      "grad_norm": 0.0036172361578792334,
+      "learning_rate": 0.001,
+      "loss": 0.424,
+      "step": 1966
+    },
+    {
+      "epoch": 0.05427385949361413,
+      "grad_norm": 0.0034358236007392406,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 1967
+    },
+    {
+      "epoch": 0.054301451694678496,
+      "grad_norm": 0.0043413047678768635,
+      "learning_rate": 0.001,
+      "loss": 0.3677,
+      "step": 1968
+    },
+    {
+      "epoch": 0.05432904389574287,
+      "grad_norm": 0.0025661587715148926,
+      "learning_rate": 0.001,
+      "loss": 0.435,
+      "step": 1969
+    },
+    {
+      "epoch": 0.054356636096807236,
+      "grad_norm": 0.005917475093156099,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 1970
+    },
+    {
+      "epoch": 0.05438422829787161,
+      "grad_norm": 0.003292621113359928,
+      "learning_rate": 0.001,
+      "loss": 0.455,
+      "step": 1971
+    },
+    {
+      "epoch": 0.054411820498935975,
+      "grad_norm": 0.0028464931529015303,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 1972
+    },
+    {
+      "epoch": 0.05443941270000034,
+      "grad_norm": 0.003331197891384363,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 1973
+    },
+    {
+      "epoch": 0.054467004901064715,
+      "grad_norm": 0.005236343014985323,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 1974
+    },
+    {
+      "epoch": 0.05449459710212908,
+      "grad_norm": 0.003114610444754362,
+      "learning_rate": 0.001,
+      "loss": 0.4028,
+      "step": 1975
+    },
+    {
+      "epoch": 0.054522189303193455,
+      "grad_norm": 0.0027003728318959475,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 1976
+    },
+    {
+      "epoch": 0.05454978150425782,
+      "grad_norm": 0.00410441542044282,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 1977
+    },
+    {
+      "epoch": 0.05457737370532219,
+      "grad_norm": 0.006601103115826845,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 1978
+    },
+    {
+      "epoch": 0.05460496590638656,
+      "grad_norm": 0.003532871138304472,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 1979
+    },
+    {
+      "epoch": 0.05463255810745093,
+      "grad_norm": 0.0024505204055458307,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 1980
+    },
+    {
+      "epoch": 0.0546601503085153,
+      "grad_norm": 0.0038412862922996283,
+      "learning_rate": 0.001,
+      "loss": 0.3656,
+      "step": 1981
+    },
+    {
+      "epoch": 0.05468774250957967,
+      "grad_norm": 0.00442805141210556,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 1982
+    },
+    {
+      "epoch": 0.054715334710644034,
+      "grad_norm": 0.004178628791123629,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 1983
+    },
+    {
+      "epoch": 0.05474292691170841,
+      "grad_norm": 0.0023596303071826696,
+      "learning_rate": 0.001,
+      "loss": 0.3791,
+      "step": 1984
+    },
+    {
+      "epoch": 0.054770519112772774,
+      "grad_norm": 0.0026005019899457693,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 1985
+    },
+    {
+      "epoch": 0.05479811131383715,
+      "grad_norm": 0.0043563637882471085,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 1986
+    },
+    {
+      "epoch": 0.054825703514901514,
+      "grad_norm": 0.004494437016546726,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 1987
+    },
+    {
+      "epoch": 0.05485329571596588,
+      "grad_norm": 0.0030924968887120485,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 1988
+    },
+    {
+      "epoch": 0.054880887917030254,
+      "grad_norm": 0.0029607934411615133,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 1989
+    },
+    {
+      "epoch": 0.05490848011809462,
+      "grad_norm": 0.004574719350785017,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 1990
+    },
+    {
+      "epoch": 0.054936072319158986,
+      "grad_norm": 0.0027861008420586586,
+      "learning_rate": 0.001,
+      "loss": 0.4374,
+      "step": 1991
+    },
+    {
+      "epoch": 0.05496366452022336,
+      "grad_norm": 0.003089543664827943,
+      "learning_rate": 0.001,
+      "loss": 0.452,
+      "step": 1992
+    },
+    {
+      "epoch": 0.054991256721287726,
+      "grad_norm": 0.0027578859589993954,
+      "learning_rate": 0.001,
+      "loss": 0.4185,
+      "step": 1993
+    },
+    {
+      "epoch": 0.0550188489223521,
+      "grad_norm": 0.003672545775771141,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 1994
+    },
+    {
+      "epoch": 0.055046441123416466,
+      "grad_norm": 0.0028012071270495653,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 1995
+    },
+    {
+      "epoch": 0.05507403332448083,
+      "grad_norm": 0.0035645493771880865,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 1996
+    },
+    {
+      "epoch": 0.055101625525545206,
+      "grad_norm": 0.004443664103746414,
+      "learning_rate": 0.001,
+      "loss": 0.4631,
+      "step": 1997
+    },
+    {
+      "epoch": 0.05512921772660957,
+      "grad_norm": 0.002620038343593478,
+      "learning_rate": 0.001,
+      "loss": 0.4562,
+      "step": 1998
+    },
+    {
+      "epoch": 0.055156809927673946,
+      "grad_norm": 0.004475294146686792,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 1999
+    },
+    {
+      "epoch": 0.05518440212873831,
+      "grad_norm": 0.003935279790312052,
+      "learning_rate": 0.001,
+      "loss": 0.4801,
+      "step": 2000
+    },
+    {
+      "epoch": 0.05518440212873831,
+      "eval_runtime": 24.2254,
+      "eval_samples_per_second": 1.321,
+      "eval_steps_per_second": 0.165,
+      "step": 2000
+    },
+    {
+      "epoch": 0.05521199432980268,
+      "grad_norm": 0.01289259921759367,
+      "learning_rate": 0.001,
+      "loss": 0.4222,
+      "step": 2001
+    },
+    {
+      "epoch": 0.05523958653086705,
+      "grad_norm": 0.0044218553230166435,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 2002
+    },
+    {
+      "epoch": 0.05526717873193142,
+      "grad_norm": 0.0024792973417788744,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 2003
+    },
+    {
+      "epoch": 0.05529477093299579,
+      "grad_norm": 0.0030391570180654526,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 2004
+    },
+    {
+      "epoch": 0.05532236313406016,
+      "grad_norm": 0.0027939057908952236,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 2005
+    },
+    {
+      "epoch": 0.055349955335124525,
+      "grad_norm": 0.003908068872988224,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 2006
+    },
+    {
+      "epoch": 0.0553775475361889,
+      "grad_norm": 0.006802330259233713,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 2007
+    },
+    {
+      "epoch": 0.055405139737253264,
+      "grad_norm": 0.005997187457978725,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 2008
+    },
+    {
+      "epoch": 0.05543273193831764,
+      "grad_norm": 0.004595068749040365,
+      "learning_rate": 0.001,
+      "loss": 0.4324,
+      "step": 2009
+    },
+    {
+      "epoch": 0.055460324139382004,
+      "grad_norm": 0.0029883128590881824,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 2010
+    },
+    {
+      "epoch": 0.05548791634044637,
+      "grad_norm": 0.0025319228880107403,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 2011
+    },
+    {
+      "epoch": 0.055515508541510744,
+      "grad_norm": 0.004491596017032862,
+      "learning_rate": 0.001,
+      "loss": 0.3786,
+      "step": 2012
+    },
+    {
+      "epoch": 0.05554310074257511,
+      "grad_norm": 0.01933477073907852,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 2013
+    },
+    {
+      "epoch": 0.05557069294363948,
+      "grad_norm": 0.0026234816759824753,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 2014
+    },
+    {
+      "epoch": 0.05559828514470385,
+      "grad_norm": 0.0022836877033114433,
+      "learning_rate": 0.001,
+      "loss": 0.4291,
+      "step": 2015
+    },
+    {
+      "epoch": 0.05562587734576822,
+      "grad_norm": 0.003522195853292942,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 2016
+    },
+    {
+      "epoch": 0.05565346954683259,
+      "grad_norm": 0.0039251442067325115,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 2017
+    },
+    {
+      "epoch": 0.05568106174789696,
+      "grad_norm": 0.011906804516911507,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 2018
+    },
+    {
+      "epoch": 0.05570865394896132,
+      "grad_norm": 0.002788014244288206,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 2019
+    },
+    {
+      "epoch": 0.055736246150025696,
+      "grad_norm": 0.006921442225575447,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 2020
+    },
+    {
+      "epoch": 0.05576383835109006,
+      "grad_norm": 0.004114898853003979,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 2021
+    },
+    {
+      "epoch": 0.055791430552154436,
+      "grad_norm": 0.0033005515579134226,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 2022
+    },
+    {
+      "epoch": 0.0558190227532188,
+      "grad_norm": 0.00554333720356226,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 2023
+    },
+    {
+      "epoch": 0.05584661495428317,
+      "grad_norm": 0.009507423266768456,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 2024
+    },
+    {
+      "epoch": 0.05587420715534754,
+      "grad_norm": 0.0032099178060889244,
+      "learning_rate": 0.001,
+      "loss": 0.3636,
+      "step": 2025
+    },
+    {
+      "epoch": 0.05590179935641191,
+      "grad_norm": 0.0028057810850441456,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 2026
+    },
+    {
+      "epoch": 0.05592939155747628,
+      "grad_norm": 0.00344046950340271,
+      "learning_rate": 0.001,
+      "loss": 0.4409,
+      "step": 2027
+    },
+    {
+      "epoch": 0.05595698375854065,
+      "grad_norm": 0.004159116186201572,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 2028
+    },
+    {
+      "epoch": 0.055984575959605015,
+      "grad_norm": 0.017573168501257896,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 2029
+    },
+    {
+      "epoch": 0.05601216816066939,
+      "grad_norm": 0.0059144701808691025,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 2030
+    },
+    {
+      "epoch": 0.056039760361733755,
+      "grad_norm": 0.003630187129601836,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 2031
+    },
+    {
+      "epoch": 0.05606735256279813,
+      "grad_norm": 0.003033621236681938,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 2032
+    },
+    {
+      "epoch": 0.056094944763862495,
+      "grad_norm": 0.003519507823511958,
+      "learning_rate": 0.001,
+      "loss": 0.4114,
+      "step": 2033
+    },
+    {
+      "epoch": 0.05612253696492686,
+      "grad_norm": 0.004492396954447031,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 2034
+    },
+    {
+      "epoch": 0.056150129165991235,
+      "grad_norm": 0.004509568680077791,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 2035
+    },
+    {
+      "epoch": 0.0561777213670556,
+      "grad_norm": 0.0026023637037724257,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 2036
+    },
+    {
+      "epoch": 0.05620531356811997,
+      "grad_norm": 0.0026178304105997086,
+      "learning_rate": 0.001,
+      "loss": 0.3801,
+      "step": 2037
+    },
+    {
+      "epoch": 0.05623290576918434,
+      "grad_norm": 0.0024824494030326605,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 2038
+    },
+    {
+      "epoch": 0.05626049797024871,
+      "grad_norm": 0.0025145653635263443,
+      "learning_rate": 0.001,
+      "loss": 0.4485,
+      "step": 2039
+    },
+    {
+      "epoch": 0.05628809017131308,
+      "grad_norm": 0.005017207004129887,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 2040
+    },
+    {
+      "epoch": 0.05631568237237745,
+      "grad_norm": 0.005889947526156902,
+      "learning_rate": 0.001,
+      "loss": 0.4284,
+      "step": 2041
+    },
+    {
+      "epoch": 0.056343274573441814,
+      "grad_norm": 0.003475229488685727,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 2042
+    },
+    {
+      "epoch": 0.05637086677450619,
+      "grad_norm": 0.00831044651567936,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 2043
+    },
+    {
+      "epoch": 0.05639845897557055,
+      "grad_norm": 0.003840766381472349,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 2044
+    },
+    {
+      "epoch": 0.05642605117663493,
+      "grad_norm": 0.004076477140188217,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 2045
+    },
+    {
+      "epoch": 0.05645364337769929,
+      "grad_norm": 0.004721554461866617,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 2046
+    },
+    {
+      "epoch": 0.05648123557876366,
+      "grad_norm": 0.003165911417454481,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 2047
+    },
+    {
+      "epoch": 0.05650882777982803,
+      "grad_norm": 0.0031442714389413595,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 2048
+    },
+    {
+      "epoch": 0.0565364199808924,
+      "grad_norm": 0.004942482803016901,
+      "learning_rate": 0.001,
+      "loss": 0.3707,
+      "step": 2049
+    },
+    {
+      "epoch": 0.05656401218195677,
+      "grad_norm": 0.004441413562744856,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 2050
+    },
+    {
+      "epoch": 0.05659160438302114,
+      "grad_norm": 0.005134286358952522,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 2051
+    },
+    {
+      "epoch": 0.056619196584085506,
+      "grad_norm": 0.0030015951488167048,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 2052
+    },
+    {
+      "epoch": 0.05664678878514988,
+      "grad_norm": 0.0028276192024350166,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 2053
+    },
+    {
+      "epoch": 0.056674380986214246,
+      "grad_norm": 0.0030078550335019827,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 2054
+    },
+    {
+      "epoch": 0.05670197318727862,
+      "grad_norm": 0.0047853728756308556,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 2055
+    },
+    {
+      "epoch": 0.056729565388342985,
+      "grad_norm": 0.003354682819917798,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 2056
+    },
+    {
+      "epoch": 0.05675715758940735,
+      "grad_norm": 0.002619178267195821,
+      "learning_rate": 0.001,
+      "loss": 0.353,
+      "step": 2057
+    },
+    {
+      "epoch": 0.056784749790471725,
+      "grad_norm": 0.003579444019123912,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 2058
+    },
+    {
+      "epoch": 0.05681234199153609,
+      "grad_norm": 0.002495531225576997,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 2059
+    },
+    {
+      "epoch": 0.05683993419260046,
+      "grad_norm": 0.0025894520804286003,
+      "learning_rate": 0.001,
+      "loss": 0.4454,
+      "step": 2060
+    },
+    {
+      "epoch": 0.05686752639366483,
+      "grad_norm": 0.0035489851143211126,
+      "learning_rate": 0.001,
+      "loss": 0.3708,
+      "step": 2061
+    },
+    {
+      "epoch": 0.0568951185947292,
+      "grad_norm": 0.00259172054938972,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 2062
+    },
+    {
+      "epoch": 0.05692271079579357,
+      "grad_norm": 0.0038454467430710793,
+      "learning_rate": 0.001,
+      "loss": 0.434,
+      "step": 2063
+    },
+    {
+      "epoch": 0.05695030299685794,
+      "grad_norm": 0.002690738532692194,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 2064
+    },
+    {
+      "epoch": 0.056977895197922304,
+      "grad_norm": 0.0031391652300953865,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 2065
+    },
+    {
+      "epoch": 0.05700548739898668,
+      "grad_norm": 0.0025237516965717077,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 2066
+    },
+    {
+      "epoch": 0.057033079600051044,
+      "grad_norm": 0.005846772808581591,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 2067
+    },
+    {
+      "epoch": 0.05706067180111542,
+      "grad_norm": 0.0032868178095668554,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 2068
+    },
+    {
+      "epoch": 0.057088264002179784,
+      "grad_norm": 0.0038759801536798477,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 2069
+    },
+    {
+      "epoch": 0.05711585620324415,
+      "grad_norm": 0.002897894475609064,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 2070
+    },
+    {
+      "epoch": 0.057143448404308524,
+      "grad_norm": 0.003541572019457817,
+      "learning_rate": 0.001,
+      "loss": 0.3807,
+      "step": 2071
+    },
+    {
+      "epoch": 0.05717104060537289,
+      "grad_norm": 0.003856340888887644,
+      "learning_rate": 0.001,
+      "loss": 0.4249,
+      "step": 2072
+    },
+    {
+      "epoch": 0.05719863280643726,
+      "grad_norm": 0.003133943770080805,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 2073
+    },
+    {
+      "epoch": 0.05722622500750163,
+      "grad_norm": 0.0026191668584942818,
+      "learning_rate": 0.001,
+      "loss": 0.4348,
+      "step": 2074
+    },
+    {
+      "epoch": 0.057253817208565996,
+      "grad_norm": 0.003371886443346739,
+      "learning_rate": 0.001,
+      "loss": 0.4478,
+      "step": 2075
+    },
+    {
+      "epoch": 0.05728140940963037,
+      "grad_norm": 0.003778213867917657,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 2076
+    },
+    {
+      "epoch": 0.057309001610694736,
+      "grad_norm": 0.0024114828556776047,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 2077
+    },
+    {
+      "epoch": 0.05733659381175911,
+      "grad_norm": 0.0026140043046325445,
+      "learning_rate": 0.001,
+      "loss": 0.3753,
+      "step": 2078
+    },
+    {
+      "epoch": 0.057364186012823476,
+      "grad_norm": 0.006618298124521971,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 2079
+    },
+    {
+      "epoch": 0.05739177821388784,
+      "grad_norm": 0.002615788020193577,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 2080
+    },
+    {
+      "epoch": 0.057419370414952216,
+      "grad_norm": 0.0076182009652256966,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 2081
+    },
+    {
+      "epoch": 0.05744696261601658,
+      "grad_norm": 0.0025556047912687063,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 2082
+    },
+    {
+      "epoch": 0.05747455481708095,
+      "grad_norm": 0.0024526710622012615,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 2083
+    },
+    {
+      "epoch": 0.05750214701814532,
+      "grad_norm": 0.004499271512031555,
+      "learning_rate": 0.001,
+      "loss": 0.3638,
+      "step": 2084
+    },
+    {
+      "epoch": 0.05752973921920969,
+      "grad_norm": 0.0039004157297313213,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 2085
+    },
+    {
+      "epoch": 0.05755733142027406,
+      "grad_norm": 0.005158510524779558,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 2086
+    },
+    {
+      "epoch": 0.05758492362133843,
+      "grad_norm": 0.0028425029013305902,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 2087
+    },
+    {
+      "epoch": 0.057612515822402795,
+      "grad_norm": 0.0027261306531727314,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 2088
+    },
+    {
+      "epoch": 0.05764010802346717,
+      "grad_norm": 0.0026766748633235693,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 2089
+    },
+    {
+      "epoch": 0.057667700224531535,
+      "grad_norm": 0.0038580589462071657,
+      "learning_rate": 0.001,
+      "loss": 0.4266,
+      "step": 2090
+    },
+    {
+      "epoch": 0.05769529242559591,
+      "grad_norm": 0.0030884486623108387,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 2091
+    },
+    {
+      "epoch": 0.057722884626660274,
+      "grad_norm": 0.002511868719011545,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 2092
+    },
+    {
+      "epoch": 0.05775047682772464,
+      "grad_norm": 0.003216751618310809,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 2093
+    },
+    {
+      "epoch": 0.057778069028789014,
+      "grad_norm": 0.0028110183775424957,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 2094
+    },
+    {
+      "epoch": 0.05780566122985338,
+      "grad_norm": 0.00429938780143857,
+      "learning_rate": 0.001,
+      "loss": 0.371,
+      "step": 2095
+    },
+    {
+      "epoch": 0.057833253430917754,
+      "grad_norm": 0.005798738915473223,
+      "learning_rate": 0.001,
+      "loss": 0.4238,
+      "step": 2096
+    },
+    {
+      "epoch": 0.05786084563198212,
+      "grad_norm": 0.0031060322653502226,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 2097
+    },
+    {
+      "epoch": 0.05788843783304649,
+      "grad_norm": 0.0039985994808375835,
+      "learning_rate": 0.001,
+      "loss": 0.3612,
+      "step": 2098
+    },
+    {
+      "epoch": 0.05791603003411086,
+      "grad_norm": 0.0036842762492597103,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 2099
+    },
+    {
+      "epoch": 0.05794362223517523,
+      "grad_norm": 0.002856861101463437,
+      "learning_rate": 0.001,
+      "loss": 0.3664,
+      "step": 2100
+    },
+    {
+      "epoch": 0.0579712144362396,
+      "grad_norm": 0.00465161819010973,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 2101
+    },
+    {
+      "epoch": 0.057998806637303967,
+      "grad_norm": 0.0029720210004597902,
+      "learning_rate": 0.001,
+      "loss": 0.4339,
+      "step": 2102
+    },
+    {
+      "epoch": 0.05802639883836833,
+      "grad_norm": 0.0034581513609737158,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 2103
+    },
+    {
+      "epoch": 0.058053991039432706,
+      "grad_norm": 0.0030470779165625572,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 2104
+    },
+    {
+      "epoch": 0.05808158324049707,
+      "grad_norm": 0.005939992144703865,
+      "learning_rate": 0.001,
+      "loss": 0.3763,
+      "step": 2105
+    },
+    {
+      "epoch": 0.05810917544156144,
+      "grad_norm": 0.005322432145476341,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 2106
+    },
+    {
+      "epoch": 0.05813676764262581,
+      "grad_norm": 0.0031803702004253864,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 2107
+    },
+    {
+      "epoch": 0.05816435984369018,
+      "grad_norm": 0.004405119922012091,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 2108
+    },
+    {
+      "epoch": 0.05819195204475455,
+      "grad_norm": 0.005180948879569769,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 2109
+    },
+    {
+      "epoch": 0.05821954424581892,
+      "grad_norm": 0.003976668696850538,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 2110
+    },
+    {
+      "epoch": 0.058247136446883285,
+      "grad_norm": 0.006682976149022579,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 2111
+    },
+    {
+      "epoch": 0.05827472864794766,
+      "grad_norm": 0.0036576632410287857,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 2112
+    },
+    {
+      "epoch": 0.058302320849012025,
+      "grad_norm": 0.0037393097300082445,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 2113
+    },
+    {
+      "epoch": 0.0583299130500764,
+      "grad_norm": 0.004572103265672922,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 2114
+    },
+    {
+      "epoch": 0.058357505251140765,
+      "grad_norm": 0.004839747212827206,
+      "learning_rate": 0.001,
+      "loss": 0.4277,
+      "step": 2115
+    },
+    {
+      "epoch": 0.05838509745220513,
+      "grad_norm": 0.002434584079310298,
+      "learning_rate": 0.001,
+      "loss": 0.4297,
+      "step": 2116
+    },
+    {
+      "epoch": 0.058412689653269505,
+      "grad_norm": 0.00401110528036952,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 2117
+    },
+    {
+      "epoch": 0.05844028185433387,
+      "grad_norm": 0.015435201115906239,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 2118
+    },
+    {
+      "epoch": 0.058467874055398245,
+      "grad_norm": 0.005054370500147343,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 2119
+    },
+    {
+      "epoch": 0.05849546625646261,
+      "grad_norm": 0.0033668207470327616,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 2120
+    },
+    {
+      "epoch": 0.05852305845752698,
+      "grad_norm": 0.004512968007475138,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 2121
+    },
+    {
+      "epoch": 0.05855065065859135,
+      "grad_norm": 0.003313259920105338,
+      "learning_rate": 0.001,
+      "loss": 0.436,
+      "step": 2122
+    },
+    {
+      "epoch": 0.05857824285965572,
+      "grad_norm": 0.0035786698572337627,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 2123
+    },
+    {
+      "epoch": 0.05860583506072009,
+      "grad_norm": 0.002702909056097269,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 2124
+    },
+    {
+      "epoch": 0.05863342726178446,
+      "grad_norm": 0.004725235048681498,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 2125
+    },
+    {
+      "epoch": 0.058661019462848824,
+      "grad_norm": 0.0036240858025848866,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 2126
+    },
+    {
+      "epoch": 0.0586886116639132,
+      "grad_norm": 0.004966442938894033,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 2127
+    },
+    {
+      "epoch": 0.05871620386497756,
+      "grad_norm": 0.0031415580306202173,
+      "learning_rate": 0.001,
+      "loss": 0.3542,
+      "step": 2128
+    },
+    {
+      "epoch": 0.05874379606604194,
+      "grad_norm": 0.003191297873854637,
+      "learning_rate": 0.001,
+      "loss": 0.4267,
+      "step": 2129
+    },
+    {
+      "epoch": 0.0587713882671063,
+      "grad_norm": 0.0039037340320646763,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 2130
+    },
+    {
+      "epoch": 0.05879898046817067,
+      "grad_norm": 0.00381074589677155,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 2131
+    },
+    {
+      "epoch": 0.05882657266923504,
+      "grad_norm": 0.004218887537717819,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 2132
+    },
+    {
+      "epoch": 0.05885416487029941,
+      "grad_norm": 0.004148995969444513,
+      "learning_rate": 0.001,
+      "loss": 0.456,
+      "step": 2133
+    },
+    {
+      "epoch": 0.058881757071363776,
+      "grad_norm": 0.0040593999437987804,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 2134
+    },
+    {
+      "epoch": 0.05890934927242815,
+      "grad_norm": 0.004279328975826502,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 2135
+    },
+    {
+      "epoch": 0.058936941473492516,
+      "grad_norm": 0.00401785783469677,
+      "learning_rate": 0.001,
+      "loss": 0.4497,
+      "step": 2136
+    },
+    {
+      "epoch": 0.05896453367455689,
+      "grad_norm": 0.004279072396457195,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 2137
+    },
+    {
+      "epoch": 0.058992125875621255,
+      "grad_norm": 0.004601733293384314,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 2138
+    },
+    {
+      "epoch": 0.05901971807668562,
+      "grad_norm": 0.010014613159000874,
+      "learning_rate": 0.001,
+      "loss": 0.4262,
+      "step": 2139
+    },
+    {
+      "epoch": 0.059047310277749995,
+      "grad_norm": 0.004289823584258556,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 2140
+    },
+    {
+      "epoch": 0.05907490247881436,
+      "grad_norm": 0.004107081796973944,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 2141
+    },
+    {
+      "epoch": 0.059102494679878735,
+      "grad_norm": 0.004256140440702438,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 2142
+    },
+    {
+      "epoch": 0.0591300868809431,
+      "grad_norm": 0.00437437929213047,
+      "learning_rate": 0.001,
+      "loss": 0.3625,
+      "step": 2143
+    },
+    {
+      "epoch": 0.05915767908200747,
+      "grad_norm": 0.004300633445382118,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 2144
+    },
+    {
+      "epoch": 0.05918527128307184,
+      "grad_norm": 0.0044938200153410435,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 2145
+    },
+    {
+      "epoch": 0.05921286348413621,
+      "grad_norm": 0.004241921007633209,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 2146
+    },
+    {
+      "epoch": 0.05924045568520058,
+      "grad_norm": 0.005186907015740871,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 2147
+    },
+    {
+      "epoch": 0.05926804788626495,
+      "grad_norm": 0.0033411476761102676,
+      "learning_rate": 0.001,
+      "loss": 0.446,
+      "step": 2148
+    },
+    {
+      "epoch": 0.059295640087329314,
+      "grad_norm": 0.003417744068428874,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 2149
+    },
+    {
+      "epoch": 0.05932323228839369,
+      "grad_norm": 0.0042548892088234425,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 2150
+    },
+    {
+      "epoch": 0.059350824489458054,
+      "grad_norm": 0.0036170384846627712,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 2151
+    },
+    {
+      "epoch": 0.05937841669052243,
+      "grad_norm": 0.0034266202710568905,
+      "learning_rate": 0.001,
+      "loss": 0.4278,
+      "step": 2152
+    },
+    {
+      "epoch": 0.059406008891586794,
+      "grad_norm": 0.0034320498816668987,
+      "learning_rate": 0.001,
+      "loss": 0.4498,
+      "step": 2153
+    },
+    {
+      "epoch": 0.05943360109265116,
+      "grad_norm": 0.0033084768801927567,
+      "learning_rate": 0.001,
+      "loss": 0.3733,
+      "step": 2154
+    },
+    {
+      "epoch": 0.059461193293715534,
+      "grad_norm": 0.003638029098510742,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 2155
+    },
+    {
+      "epoch": 0.0594887854947799,
+      "grad_norm": 0.0036058463156223297,
+      "learning_rate": 0.001,
+      "loss": 0.4087,
+      "step": 2156
+    },
+    {
+      "epoch": 0.059516377695844266,
+      "grad_norm": 0.003367355791851878,
+      "learning_rate": 0.001,
+      "loss": 0.3577,
+      "step": 2157
+    },
+    {
+      "epoch": 0.05954396989690864,
+      "grad_norm": 0.003909154795110226,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 2158
+    },
+    {
+      "epoch": 0.059571562097973006,
+      "grad_norm": 0.0027421792037785053,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 2159
+    },
+    {
+      "epoch": 0.05959915429903738,
+      "grad_norm": 0.003349900944158435,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 2160
+    },
+    {
+      "epoch": 0.059626746500101746,
+      "grad_norm": 0.003867817111313343,
+      "learning_rate": 0.001,
+      "loss": 0.4502,
+      "step": 2161
+    },
+    {
+      "epoch": 0.05965433870116611,
+      "grad_norm": 0.002976678777486086,
+      "learning_rate": 0.001,
+      "loss": 0.4679,
+      "step": 2162
+    },
+    {
+      "epoch": 0.059681930902230486,
+      "grad_norm": 0.004093970637768507,
+      "learning_rate": 0.001,
+      "loss": 0.4623,
+      "step": 2163
+    },
+    {
+      "epoch": 0.05970952310329485,
+      "grad_norm": 0.008607220835983753,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 2164
+    },
+    {
+      "epoch": 0.059737115304359226,
+      "grad_norm": 0.003642444731667638,
+      "learning_rate": 0.001,
+      "loss": 0.3617,
+      "step": 2165
+    },
+    {
+      "epoch": 0.05976470750542359,
+      "grad_norm": 0.004864281043410301,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 2166
+    },
+    {
+      "epoch": 0.05979229970648796,
+      "grad_norm": 0.003302594181150198,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 2167
+    },
+    {
+      "epoch": 0.05981989190755233,
+      "grad_norm": 0.004229418467730284,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 2168
+    },
+    {
+      "epoch": 0.0598474841086167,
+      "grad_norm": 0.026235150173306465,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 2169
+    },
+    {
+      "epoch": 0.05987507630968107,
+      "grad_norm": 0.0035215525422245264,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 2170
+    },
+    {
+      "epoch": 0.05990266851074544,
+      "grad_norm": 0.0038247762713581324,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 2171
+    },
+    {
+      "epoch": 0.059930260711809805,
+      "grad_norm": 0.0033723246306180954,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 2172
+    },
+    {
+      "epoch": 0.05995785291287418,
+      "grad_norm": 0.003272601403295994,
+      "learning_rate": 0.001,
+      "loss": 0.4218,
+      "step": 2173
+    },
+    {
+      "epoch": 0.059985445113938544,
+      "grad_norm": 0.00700679887086153,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 2174
+    },
+    {
+      "epoch": 0.06001303731500292,
+      "grad_norm": 0.004312647040933371,
+      "learning_rate": 0.001,
+      "loss": 0.4259,
+      "step": 2175
+    },
+    {
+      "epoch": 0.060040629516067284,
+      "grad_norm": 0.0028906308580189943,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 2176
+    },
+    {
+      "epoch": 0.06006822171713165,
+      "grad_norm": 0.0023235634434968233,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 2177
+    },
+    {
+      "epoch": 0.060095813918196024,
+      "grad_norm": 0.003625315148383379,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 2178
+    },
+    {
+      "epoch": 0.06012340611926039,
+      "grad_norm": 0.0026299748569726944,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 2179
+    },
+    {
+      "epoch": 0.06015099832032476,
+      "grad_norm": 0.003912107087671757,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 2180
+    },
+    {
+      "epoch": 0.06017859052138913,
+      "grad_norm": 0.0030596598517149687,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 2181
+    },
+    {
+      "epoch": 0.0602061827224535,
+      "grad_norm": 0.0034961546771228313,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 2182
+    },
+    {
+      "epoch": 0.06023377492351787,
+      "grad_norm": 0.0026868092827498913,
+      "learning_rate": 0.001,
+      "loss": 0.341,
+      "step": 2183
+    },
+    {
+      "epoch": 0.06026136712458224,
+      "grad_norm": 0.0036376873031258583,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 2184
+    },
+    {
+      "epoch": 0.0602889593256466,
+      "grad_norm": 0.0034194989129900932,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 2185
+    },
+    {
+      "epoch": 0.060316551526710976,
+      "grad_norm": 0.00262664002366364,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 2186
+    },
+    {
+      "epoch": 0.06034414372777534,
+      "grad_norm": 0.002487578894942999,
+      "learning_rate": 0.001,
+      "loss": 0.4719,
+      "step": 2187
+    },
+    {
+      "epoch": 0.060371735928839716,
+      "grad_norm": 0.002136924536898732,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 2188
+    },
+    {
+      "epoch": 0.06039932812990408,
+      "grad_norm": 0.0025951487477868795,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 2189
+    },
+    {
+      "epoch": 0.06042692033096845,
+      "grad_norm": 0.007234814576804638,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 2190
+    },
+    {
+      "epoch": 0.06045451253203282,
+      "grad_norm": 0.04125404730439186,
+      "learning_rate": 0.001,
+      "loss": 0.4087,
+      "step": 2191
+    },
+    {
+      "epoch": 0.06048210473309719,
+      "grad_norm": 0.0031280010007321835,
+      "learning_rate": 0.001,
+      "loss": 0.4324,
+      "step": 2192
+    },
+    {
+      "epoch": 0.06050969693416156,
+      "grad_norm": 0.003333252388983965,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 2193
+    },
+    {
+      "epoch": 0.06053728913522593,
+      "grad_norm": 0.002960551530122757,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 2194
+    },
+    {
+      "epoch": 0.060564881336290295,
+      "grad_norm": 0.003678489476442337,
+      "learning_rate": 0.001,
+      "loss": 0.4412,
+      "step": 2195
+    },
+    {
+      "epoch": 0.06059247353735467,
+      "grad_norm": 0.008868735283613205,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 2196
+    },
+    {
+      "epoch": 0.060620065738419035,
+      "grad_norm": 0.009513468481600285,
+      "learning_rate": 0.001,
+      "loss": 0.3676,
+      "step": 2197
+    },
+    {
+      "epoch": 0.06064765793948341,
+      "grad_norm": 0.0050335777923464775,
+      "learning_rate": 0.001,
+      "loss": 0.3789,
+      "step": 2198
+    },
+    {
+      "epoch": 0.060675250140547775,
+      "grad_norm": 0.006050426512956619,
+      "learning_rate": 0.001,
+      "loss": 0.4329,
+      "step": 2199
+    },
+    {
+      "epoch": 0.06070284234161214,
+      "grad_norm": 0.002776517765596509,
+      "learning_rate": 0.001,
+      "loss": 0.4271,
+      "step": 2200
+    },
+    {
+      "epoch": 0.060730434542676515,
+      "grad_norm": 0.003950697835534811,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 2201
+    },
+    {
+      "epoch": 0.06075802674374088,
+      "grad_norm": 0.003235916141420603,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 2202
+    },
+    {
+      "epoch": 0.06078561894480525,
+      "grad_norm": 0.0029406328685581684,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 2203
+    },
+    {
+      "epoch": 0.06081321114586962,
+      "grad_norm": 0.0045188660733401775,
+      "learning_rate": 0.001,
+      "loss": 0.4269,
+      "step": 2204
+    },
+    {
+      "epoch": 0.06084080334693399,
+      "grad_norm": 0.0031814551912248135,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 2205
+    },
+    {
+      "epoch": 0.06086839554799836,
+      "grad_norm": 0.0054643419571220875,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 2206
+    },
+    {
+      "epoch": 0.06089598774906273,
+      "grad_norm": 0.0061082998290658,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 2207
+    },
+    {
+      "epoch": 0.060923579950127094,
+      "grad_norm": 0.002864258596673608,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 2208
+    },
+    {
+      "epoch": 0.06095117215119147,
+      "grad_norm": 0.014014131389558315,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 2209
+    },
+    {
+      "epoch": 0.06097876435225583,
+      "grad_norm": 0.0033069124910980463,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 2210
+    },
+    {
+      "epoch": 0.06100635655332021,
+      "grad_norm": 0.0032278632279485464,
+      "learning_rate": 0.001,
+      "loss": 0.4836,
+      "step": 2211
+    },
+    {
+      "epoch": 0.06103394875438457,
+      "grad_norm": 0.0036987089551985264,
+      "learning_rate": 0.001,
+      "loss": 0.3673,
+      "step": 2212
+    },
+    {
+      "epoch": 0.06106154095544894,
+      "grad_norm": 0.006201690062880516,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 2213
+    },
+    {
+      "epoch": 0.06108913315651331,
+      "grad_norm": 0.0028827004134655,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 2214
+    },
+    {
+      "epoch": 0.06111672535757768,
+      "grad_norm": 0.0034600994549691677,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 2215
+    },
+    {
+      "epoch": 0.06114431755864205,
+      "grad_norm": 0.09618855267763138,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 2216
+    },
+    {
+      "epoch": 0.06117190975970642,
+      "grad_norm": 0.003716063452884555,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 2217
+    },
+    {
+      "epoch": 0.061199501960770786,
+      "grad_norm": 0.002988336840644479,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 2218
+    },
+    {
+      "epoch": 0.06122709416183516,
+      "grad_norm": 0.0036907619796693325,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 2219
+    },
+    {
+      "epoch": 0.061254686362899526,
+      "grad_norm": 0.002765973797068,
+      "learning_rate": 0.001,
+      "loss": 0.4421,
+      "step": 2220
+    },
+    {
+      "epoch": 0.0612822785639639,
+      "grad_norm": 0.00515265017747879,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 2221
+    },
+    {
+      "epoch": 0.061309870765028265,
+      "grad_norm": 0.0030234006699174643,
+      "learning_rate": 0.001,
+      "loss": 0.4428,
+      "step": 2222
+    },
+    {
+      "epoch": 0.06133746296609263,
+      "grad_norm": 0.007390964776277542,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 2223
+    },
+    {
+      "epoch": 0.061365055167157005,
+      "grad_norm": 0.0025635359343141317,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 2224
+    },
+    {
+      "epoch": 0.06139264736822137,
+      "grad_norm": 0.0057982392609119415,
+      "learning_rate": 0.001,
+      "loss": 0.4358,
+      "step": 2225
+    },
+    {
+      "epoch": 0.06142023956928574,
+      "grad_norm": 0.003062683856114745,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 2226
+    },
+    {
+      "epoch": 0.06144783177035011,
+      "grad_norm": 0.0026797873433679342,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 2227
+    },
+    {
+      "epoch": 0.06147542397141448,
+      "grad_norm": 0.0024085459299385548,
+      "learning_rate": 0.001,
+      "loss": 0.4376,
+      "step": 2228
+    },
+    {
+      "epoch": 0.06150301617247885,
+      "grad_norm": 0.004409399814903736,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 2229
+    },
+    {
+      "epoch": 0.06153060837354322,
+      "grad_norm": 0.003322464181110263,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 2230
+    },
+    {
+      "epoch": 0.061558200574607584,
+      "grad_norm": 0.00409911060705781,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 2231
+    },
+    {
+      "epoch": 0.06158579277567196,
+      "grad_norm": 0.0029169321060180664,
+      "learning_rate": 0.001,
+      "loss": 0.3508,
+      "step": 2232
+    },
+    {
+      "epoch": 0.061613384976736324,
+      "grad_norm": 0.00469607999548316,
+      "learning_rate": 0.001,
+      "loss": 0.3692,
+      "step": 2233
+    },
+    {
+      "epoch": 0.0616409771778007,
+      "grad_norm": 0.0029155181255191565,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 2234
+    },
+    {
+      "epoch": 0.061668569378865064,
+      "grad_norm": 0.003221947466954589,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 2235
+    },
+    {
+      "epoch": 0.06169616157992943,
+      "grad_norm": 0.0033768792636692524,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 2236
+    },
+    {
+      "epoch": 0.061723753780993804,
+      "grad_norm": 0.002673777285963297,
+      "learning_rate": 0.001,
+      "loss": 0.3536,
+      "step": 2237
+    },
+    {
+      "epoch": 0.06175134598205817,
+      "grad_norm": 0.002202989300712943,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 2238
+    },
+    {
+      "epoch": 0.06177893818312254,
+      "grad_norm": 0.00238011684268713,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 2239
+    },
+    {
+      "epoch": 0.06180653038418691,
+      "grad_norm": 0.0027684099040925503,
+      "learning_rate": 0.001,
+      "loss": 0.4129,
+      "step": 2240
+    },
+    {
+      "epoch": 0.061834122585251276,
+      "grad_norm": 0.0032890914008021355,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 2241
+    },
+    {
+      "epoch": 0.06186171478631565,
+      "grad_norm": 0.003589882282540202,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 2242
+    },
+    {
+      "epoch": 0.061889306987380016,
+      "grad_norm": 0.002842566231265664,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 2243
+    },
+    {
+      "epoch": 0.06191689918844439,
+      "grad_norm": 0.0028953177388757467,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 2244
+    },
+    {
+      "epoch": 0.061944491389508756,
+      "grad_norm": 0.003262540325522423,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 2245
+    },
+    {
+      "epoch": 0.06197208359057312,
+      "grad_norm": 0.004046915099024773,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 2246
+    },
+    {
+      "epoch": 0.061999675791637496,
+      "grad_norm": 0.0038636084645986557,
+      "learning_rate": 0.001,
+      "loss": 0.4276,
+      "step": 2247
+    },
+    {
+      "epoch": 0.06202726799270186,
+      "grad_norm": 0.002859740052372217,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 2248
+    },
+    {
+      "epoch": 0.06205486019376623,
+      "grad_norm": 0.002851482480764389,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 2249
+    },
+    {
+      "epoch": 0.0620824523948306,
+      "grad_norm": 0.0031568272970616817,
+      "learning_rate": 0.001,
+      "loss": 0.4335,
+      "step": 2250
+    },
+    {
+      "epoch": 0.06211004459589497,
+      "grad_norm": 0.003563236678019166,
+      "learning_rate": 0.001,
+      "loss": 0.3736,
+      "step": 2251
+    },
+    {
+      "epoch": 0.06213763679695934,
+      "grad_norm": 0.0029816743917763233,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 2252
+    },
+    {
+      "epoch": 0.06216522899802371,
+      "grad_norm": 0.002947515808045864,
+      "learning_rate": 0.001,
+      "loss": 0.4301,
+      "step": 2253
+    },
+    {
+      "epoch": 0.062192821199088075,
+      "grad_norm": 0.003983738832175732,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 2254
+    },
+    {
+      "epoch": 0.06222041340015245,
+      "grad_norm": 0.0053352476097643375,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 2255
+    },
+    {
+      "epoch": 0.062248005601216815,
+      "grad_norm": 0.004124592524021864,
+      "learning_rate": 0.001,
+      "loss": 0.3691,
+      "step": 2256
+    },
+    {
+      "epoch": 0.06227559780228119,
+      "grad_norm": 0.004866636358201504,
+      "learning_rate": 0.001,
+      "loss": 0.4341,
+      "step": 2257
+    },
+    {
+      "epoch": 0.062303190003345554,
+      "grad_norm": 0.005429052747786045,
+      "learning_rate": 0.001,
+      "loss": 0.4275,
+      "step": 2258
+    },
+    {
+      "epoch": 0.06233078220440992,
+      "grad_norm": 0.00399363599717617,
+      "learning_rate": 0.001,
+      "loss": 0.4352,
+      "step": 2259
+    },
+    {
+      "epoch": 0.062358374405474294,
+      "grad_norm": 0.0047408780083060265,
+      "learning_rate": 0.001,
+      "loss": 0.3742,
+      "step": 2260
+    },
+    {
+      "epoch": 0.06238596660653866,
+      "grad_norm": 0.007790642790496349,
+      "learning_rate": 0.001,
+      "loss": 0.362,
+      "step": 2261
+    },
+    {
+      "epoch": 0.062413558807603034,
+      "grad_norm": 0.003895730245858431,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 2262
+    },
+    {
+      "epoch": 0.0624411510086674,
+      "grad_norm": 0.004620977211743593,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 2263
+    },
+    {
+      "epoch": 0.06246874320973177,
+      "grad_norm": 0.002770553808659315,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 2264
+    },
+    {
+      "epoch": 0.06249633541079614,
+      "grad_norm": 0.0061469171196222305,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 2265
+    },
+    {
+      "epoch": 0.06252392761186051,
+      "grad_norm": 0.00576475216075778,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 2266
+    },
+    {
+      "epoch": 0.06255151981292488,
+      "grad_norm": 0.004090351052582264,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 2267
+    },
+    {
+      "epoch": 0.06257911201398925,
+      "grad_norm": 0.024996625259518623,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 2268
+    },
+    {
+      "epoch": 0.06260670421505361,
+      "grad_norm": 0.0244828462600708,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 2269
+    },
+    {
+      "epoch": 0.06263429641611798,
+      "grad_norm": 0.0046532778069376945,
+      "learning_rate": 0.001,
+      "loss": 0.432,
+      "step": 2270
+    },
+    {
+      "epoch": 0.06266188861718236,
+      "grad_norm": 0.004564880859106779,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 2271
+    },
+    {
+      "epoch": 0.06268948081824673,
+      "grad_norm": 0.0029092628974467516,
+      "learning_rate": 0.001,
+      "loss": 0.4537,
+      "step": 2272
+    },
+    {
+      "epoch": 0.06271707301931109,
+      "grad_norm": 0.0033110990189015865,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 2273
+    },
+    {
+      "epoch": 0.06274466522037546,
+      "grad_norm": 0.0045304871164262295,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 2274
+    },
+    {
+      "epoch": 0.06277225742143983,
+      "grad_norm": 0.0031519641634076834,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 2275
+    },
+    {
+      "epoch": 0.0627998496225042,
+      "grad_norm": 0.003920139744877815,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 2276
+    },
+    {
+      "epoch": 0.06282744182356857,
+      "grad_norm": 0.004257616586983204,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 2277
+    },
+    {
+      "epoch": 0.06285503402463294,
+      "grad_norm": 0.0038890133146196604,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 2278
+    },
+    {
+      "epoch": 0.0628826262256973,
+      "grad_norm": 0.0028080164920538664,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 2279
+    },
+    {
+      "epoch": 0.06291021842676167,
+      "grad_norm": 0.008050143718719482,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 2280
+    },
+    {
+      "epoch": 0.06293781062782605,
+      "grad_norm": 0.0025884988717734814,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 2281
+    },
+    {
+      "epoch": 0.06296540282889042,
+      "grad_norm": 0.0030551603995263577,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 2282
+    },
+    {
+      "epoch": 0.06299299502995478,
+      "grad_norm": 0.0027584030758589506,
+      "learning_rate": 0.001,
+      "loss": 0.3816,
+      "step": 2283
+    },
+    {
+      "epoch": 0.06302058723101915,
+      "grad_norm": 0.0033046863973140717,
+      "learning_rate": 0.001,
+      "loss": 0.3583,
+      "step": 2284
+    },
+    {
+      "epoch": 0.06304817943208352,
+      "grad_norm": 0.0036664491053670645,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 2285
+    },
+    {
+      "epoch": 0.06307577163314788,
+      "grad_norm": 0.002570797922089696,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 2286
+    },
+    {
+      "epoch": 0.06310336383421226,
+      "grad_norm": 0.003837966127321124,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 2287
+    },
+    {
+      "epoch": 0.06313095603527663,
+      "grad_norm": 0.00263983360491693,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 2288
+    },
+    {
+      "epoch": 0.063158548236341,
+      "grad_norm": 0.0033676191233098507,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 2289
+    },
+    {
+      "epoch": 0.06318614043740536,
+      "grad_norm": 0.0033159011509269476,
+      "learning_rate": 0.001,
+      "loss": 0.3714,
+      "step": 2290
+    },
+    {
+      "epoch": 0.06321373263846973,
+      "grad_norm": 0.002879378153011203,
+      "learning_rate": 0.001,
+      "loss": 0.4229,
+      "step": 2291
+    },
+    {
+      "epoch": 0.06324132483953411,
+      "grad_norm": 0.004017788916826248,
+      "learning_rate": 0.001,
+      "loss": 0.3722,
+      "step": 2292
+    },
+    {
+      "epoch": 0.06326891704059848,
+      "grad_norm": 0.0034288205206394196,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 2293
+    },
+    {
+      "epoch": 0.06329650924166284,
+      "grad_norm": 0.003537122393026948,
+      "learning_rate": 0.001,
+      "loss": 0.3685,
+      "step": 2294
+    },
+    {
+      "epoch": 0.06332410144272721,
+      "grad_norm": 0.002853949787095189,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 2295
+    },
+    {
+      "epoch": 0.06335169364379158,
+      "grad_norm": 0.00246223621070385,
+      "learning_rate": 0.001,
+      "loss": 0.4675,
+      "step": 2296
+    },
+    {
+      "epoch": 0.06337928584485596,
+      "grad_norm": 0.003921873867511749,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 2297
+    },
+    {
+      "epoch": 0.06340687804592032,
+      "grad_norm": 0.0037953928112983704,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 2298
+    },
+    {
+      "epoch": 0.06343447024698469,
+      "grad_norm": 0.003141796449199319,
+      "learning_rate": 0.001,
+      "loss": 0.3779,
+      "step": 2299
+    },
+    {
+      "epoch": 0.06346206244804906,
+      "grad_norm": 0.0069033862091600895,
+      "learning_rate": 0.001,
+      "loss": 0.4432,
+      "step": 2300
+    },
+    {
+      "epoch": 0.06348965464911342,
+      "grad_norm": 0.00247231125831604,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 2301
+    },
+    {
+      "epoch": 0.0635172468501778,
+      "grad_norm": 0.003614683635532856,
+      "learning_rate": 0.001,
+      "loss": 0.3708,
+      "step": 2302
+    },
+    {
+      "epoch": 0.06354483905124217,
+      "grad_norm": 0.0028486682567745447,
+      "learning_rate": 0.001,
+      "loss": 0.4465,
+      "step": 2303
+    },
+    {
+      "epoch": 0.06357243125230654,
+      "grad_norm": 0.0033698193728923798,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 2304
+    },
+    {
+      "epoch": 0.0636000234533709,
+      "grad_norm": 0.0028043955098837614,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 2305
+    },
+    {
+      "epoch": 0.06362761565443527,
+      "grad_norm": 0.0027136304415762424,
+      "learning_rate": 0.001,
+      "loss": 0.4328,
+      "step": 2306
+    },
+    {
+      "epoch": 0.06365520785549965,
+      "grad_norm": 0.00574844004586339,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 2307
+    },
+    {
+      "epoch": 0.06368280005656402,
+      "grad_norm": 0.002718136878684163,
+      "learning_rate": 0.001,
+      "loss": 0.443,
+      "step": 2308
+    },
+    {
+      "epoch": 0.06371039225762838,
+      "grad_norm": 0.0043097492307424545,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 2309
+    },
+    {
+      "epoch": 0.06373798445869275,
+      "grad_norm": 0.006927134469151497,
+      "learning_rate": 0.001,
+      "loss": 0.4365,
+      "step": 2310
+    },
+    {
+      "epoch": 0.06376557665975711,
+      "grad_norm": 0.0031723494175821543,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 2311
+    },
+    {
+      "epoch": 0.0637931688608215,
+      "grad_norm": 0.0034040913451462984,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 2312
+    },
+    {
+      "epoch": 0.06382076106188586,
+      "grad_norm": 0.003764103166759014,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 2313
+    },
+    {
+      "epoch": 0.06384835326295023,
+      "grad_norm": 0.0026512970216572285,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 2314
+    },
+    {
+      "epoch": 0.0638759454640146,
+      "grad_norm": 0.003220104379579425,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 2315
+    },
+    {
+      "epoch": 0.06390353766507896,
+      "grad_norm": 0.0035847953986376524,
+      "learning_rate": 0.001,
+      "loss": 0.4028,
+      "step": 2316
+    },
+    {
+      "epoch": 0.06393112986614334,
+      "grad_norm": 0.004416230600327253,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 2317
+    },
+    {
+      "epoch": 0.06395872206720771,
+      "grad_norm": 0.00270696054212749,
+      "learning_rate": 0.001,
+      "loss": 0.3667,
+      "step": 2318
+    },
+    {
+      "epoch": 0.06398631426827207,
+      "grad_norm": 0.0026564293075352907,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 2319
+    },
+    {
+      "epoch": 0.06401390646933644,
+      "grad_norm": 0.0031192628666758537,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 2320
+    },
+    {
+      "epoch": 0.0640414986704008,
+      "grad_norm": 0.003101060399785638,
+      "learning_rate": 0.001,
+      "loss": 0.4423,
+      "step": 2321
+    },
+    {
+      "epoch": 0.06406909087146519,
+      "grad_norm": 0.00393852312117815,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 2322
+    },
+    {
+      "epoch": 0.06409668307252955,
+      "grad_norm": 0.003799998899921775,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 2323
+    },
+    {
+      "epoch": 0.06412427527359392,
+      "grad_norm": 0.008037365972995758,
+      "learning_rate": 0.001,
+      "loss": 0.4383,
+      "step": 2324
+    },
+    {
+      "epoch": 0.06415186747465829,
+      "grad_norm": 0.004333310294896364,
+      "learning_rate": 0.001,
+      "loss": 0.3666,
+      "step": 2325
+    },
+    {
+      "epoch": 0.06417945967572265,
+      "grad_norm": 0.0026929888408631086,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 2326
+    },
+    {
+      "epoch": 0.06420705187678703,
+      "grad_norm": 0.0040913717821240425,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 2327
+    },
+    {
+      "epoch": 0.0642346440778514,
+      "grad_norm": 0.005933169275522232,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 2328
+    },
+    {
+      "epoch": 0.06426223627891577,
+      "grad_norm": 0.003139302134513855,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 2329
+    },
+    {
+      "epoch": 0.06428982847998013,
+      "grad_norm": 0.003957003820687532,
+      "learning_rate": 0.001,
+      "loss": 0.3589,
+      "step": 2330
+    },
+    {
+      "epoch": 0.0643174206810445,
+      "grad_norm": 0.003573901252821088,
+      "learning_rate": 0.001,
+      "loss": 0.3708,
+      "step": 2331
+    },
+    {
+      "epoch": 0.06434501288210887,
+      "grad_norm": 0.0033237498719245195,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 2332
+    },
+    {
+      "epoch": 0.06437260508317325,
+      "grad_norm": 0.0034847762435674667,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 2333
+    },
+    {
+      "epoch": 0.06440019728423761,
+      "grad_norm": 0.004198945127427578,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 2334
+    },
+    {
+      "epoch": 0.06442778948530198,
+      "grad_norm": 0.003995297942310572,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 2335
+    },
+    {
+      "epoch": 0.06445538168636634,
+      "grad_norm": 0.003225005231797695,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 2336
+    },
+    {
+      "epoch": 0.06448297388743071,
+      "grad_norm": 0.0031134854070842266,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 2337
+    },
+    {
+      "epoch": 0.06451056608849509,
+      "grad_norm": 0.004270988516509533,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 2338
+    },
+    {
+      "epoch": 0.06453815828955946,
+      "grad_norm": 0.004274028819054365,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 2339
+    },
+    {
+      "epoch": 0.06456575049062382,
+      "grad_norm": 0.003787413239479065,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 2340
+    },
+    {
+      "epoch": 0.06459334269168819,
+      "grad_norm": 0.003217950463294983,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 2341
+    },
+    {
+      "epoch": 0.06462093489275256,
+      "grad_norm": 0.0038233373779803514,
+      "learning_rate": 0.001,
+      "loss": 0.3671,
+      "step": 2342
+    },
+    {
+      "epoch": 0.06464852709381694,
+      "grad_norm": 0.0038631544448435307,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 2343
+    },
+    {
+      "epoch": 0.0646761192948813,
+      "grad_norm": 0.002935303607955575,
+      "learning_rate": 0.001,
+      "loss": 0.4283,
+      "step": 2344
+    },
+    {
+      "epoch": 0.06470371149594567,
+      "grad_norm": 0.004563242197036743,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 2345
+    },
+    {
+      "epoch": 0.06473130369701004,
+      "grad_norm": 0.004774071741849184,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 2346
+    },
+    {
+      "epoch": 0.0647588958980744,
+      "grad_norm": 0.0027761361561715603,
+      "learning_rate": 0.001,
+      "loss": 0.435,
+      "step": 2347
+    },
+    {
+      "epoch": 0.06478648809913878,
+      "grad_norm": 0.0030395982321351767,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 2348
+    },
+    {
+      "epoch": 0.06481408030020315,
+      "grad_norm": 0.034988418221473694,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 2349
+    },
+    {
+      "epoch": 0.06484167250126752,
+      "grad_norm": 0.003620629198849201,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 2350
+    },
+    {
+      "epoch": 0.06486926470233188,
+      "grad_norm": 0.003206313122063875,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 2351
+    },
+    {
+      "epoch": 0.06489685690339625,
+      "grad_norm": 0.0028597121126949787,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 2352
+    },
+    {
+      "epoch": 0.06492444910446063,
+      "grad_norm": 0.0034794441889971495,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 2353
+    },
+    {
+      "epoch": 0.064952041305525,
+      "grad_norm": 0.0031176162883639336,
+      "learning_rate": 0.001,
+      "loss": 0.3651,
+      "step": 2354
+    },
+    {
+      "epoch": 0.06497963350658936,
+      "grad_norm": 0.0035436085890978575,
+      "learning_rate": 0.001,
+      "loss": 0.37,
+      "step": 2355
+    },
+    {
+      "epoch": 0.06500722570765373,
+      "grad_norm": 0.003136987565085292,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 2356
+    },
+    {
+      "epoch": 0.0650348179087181,
+      "grad_norm": 0.003193167271092534,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 2357
+    },
+    {
+      "epoch": 0.06506241010978248,
+      "grad_norm": 0.0033892260398715734,
+      "learning_rate": 0.001,
+      "loss": 0.3589,
+      "step": 2358
+    },
+    {
+      "epoch": 0.06509000231084684,
+      "grad_norm": 0.0065610939636826515,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 2359
+    },
+    {
+      "epoch": 0.06511759451191121,
+      "grad_norm": 0.005481204017996788,
+      "learning_rate": 0.001,
+      "loss": 0.3597,
+      "step": 2360
+    },
+    {
+      "epoch": 0.06514518671297558,
+      "grad_norm": 0.0103605967015028,
+      "learning_rate": 0.001,
+      "loss": 0.3356,
+      "step": 2361
+    },
+    {
+      "epoch": 0.06517277891403994,
+      "grad_norm": 0.0066803195513784885,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 2362
+    },
+    {
+      "epoch": 0.06520037111510432,
+      "grad_norm": 0.004259153269231319,
+      "learning_rate": 0.001,
+      "loss": 0.4293,
+      "step": 2363
+    },
+    {
+      "epoch": 0.06522796331616869,
+      "grad_norm": 0.012415111064910889,
+      "learning_rate": 0.001,
+      "loss": 0.3602,
+      "step": 2364
+    },
+    {
+      "epoch": 0.06525555551723305,
+      "grad_norm": 0.0029489900916814804,
+      "learning_rate": 0.001,
+      "loss": 0.4193,
+      "step": 2365
+    },
+    {
+      "epoch": 0.06528314771829742,
+      "grad_norm": 0.006090483628213406,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 2366
+    },
+    {
+      "epoch": 0.06531073991936179,
+      "grad_norm": 0.003908079583197832,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 2367
+    },
+    {
+      "epoch": 0.06533833212042617,
+      "grad_norm": 0.0039390516467392445,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 2368
+    },
+    {
+      "epoch": 0.06536592432149053,
+      "grad_norm": 0.009213199838995934,
+      "learning_rate": 0.001,
+      "loss": 0.4277,
+      "step": 2369
+    },
+    {
+      "epoch": 0.0653935165225549,
+      "grad_norm": 0.0031545236706733704,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 2370
+    },
+    {
+      "epoch": 0.06542110872361927,
+      "grad_norm": 0.003665713593363762,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 2371
+    },
+    {
+      "epoch": 0.06544870092468363,
+      "grad_norm": 0.00421819556504488,
+      "learning_rate": 0.001,
+      "loss": 0.3466,
+      "step": 2372
+    },
+    {
+      "epoch": 0.06547629312574801,
+      "grad_norm": 0.0041595143266022205,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 2373
+    },
+    {
+      "epoch": 0.06550388532681238,
+      "grad_norm": 0.00258744228631258,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 2374
+    },
+    {
+      "epoch": 0.06553147752787675,
+      "grad_norm": 0.0025976793840527534,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 2375
+    },
+    {
+      "epoch": 0.06555906972894111,
+      "grad_norm": 0.002732262946665287,
+      "learning_rate": 0.001,
+      "loss": 0.4031,
+      "step": 2376
+    },
+    {
+      "epoch": 0.06558666193000548,
+      "grad_norm": 0.0024393522180616856,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 2377
+    },
+    {
+      "epoch": 0.06561425413106985,
+      "grad_norm": 0.006330225151032209,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 2378
+    },
+    {
+      "epoch": 0.06564184633213423,
+      "grad_norm": 0.0031101806089282036,
+      "learning_rate": 0.001,
+      "loss": 0.362,
+      "step": 2379
+    },
+    {
+      "epoch": 0.06566943853319859,
+      "grad_norm": 0.0026855298783630133,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 2380
+    },
+    {
+      "epoch": 0.06569703073426296,
+      "grad_norm": 0.002515793778002262,
+      "learning_rate": 0.001,
+      "loss": 0.4513,
+      "step": 2381
+    },
+    {
+      "epoch": 0.06572462293532733,
+      "grad_norm": 0.0025803048629313707,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 2382
+    },
+    {
+      "epoch": 0.06575221513639169,
+      "grad_norm": 0.0031242026016116142,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 2383
+    },
+    {
+      "epoch": 0.06577980733745607,
+      "grad_norm": 0.003442540764808655,
+      "learning_rate": 0.001,
+      "loss": 0.425,
+      "step": 2384
+    },
+    {
+      "epoch": 0.06580739953852044,
+      "grad_norm": 0.0035992697812616825,
+      "learning_rate": 0.001,
+      "loss": 0.3475,
+      "step": 2385
+    },
+    {
+      "epoch": 0.0658349917395848,
+      "grad_norm": 0.0026494376361370087,
+      "learning_rate": 0.001,
+      "loss": 0.4028,
+      "step": 2386
+    },
+    {
+      "epoch": 0.06586258394064917,
+      "grad_norm": 0.0026794499717652798,
+      "learning_rate": 0.001,
+      "loss": 0.4355,
+      "step": 2387
+    },
+    {
+      "epoch": 0.06589017614171354,
+      "grad_norm": 0.003907007165253162,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 2388
+    },
+    {
+      "epoch": 0.06591776834277792,
+      "grad_norm": 0.0038369898684322834,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 2389
+    },
+    {
+      "epoch": 0.06594536054384229,
+      "grad_norm": 0.003516831435263157,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 2390
+    },
+    {
+      "epoch": 0.06597295274490665,
+      "grad_norm": 0.0029046509880572557,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 2391
+    },
+    {
+      "epoch": 0.06600054494597102,
+      "grad_norm": 0.0029956242069602013,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 2392
+    },
+    {
+      "epoch": 0.06602813714703538,
+      "grad_norm": 0.0032762791961431503,
+      "learning_rate": 0.001,
+      "loss": 0.4313,
+      "step": 2393
+    },
+    {
+      "epoch": 0.06605572934809976,
+      "grad_norm": 0.0029915180057287216,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 2394
+    },
+    {
+      "epoch": 0.06608332154916413,
+      "grad_norm": 0.004418436903506517,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 2395
+    },
+    {
+      "epoch": 0.0661109137502285,
+      "grad_norm": 0.003055717097595334,
+      "learning_rate": 0.001,
+      "loss": 0.3776,
+      "step": 2396
+    },
+    {
+      "epoch": 0.06613850595129286,
+      "grad_norm": 0.0034056156873703003,
+      "learning_rate": 0.001,
+      "loss": 0.3617,
+      "step": 2397
+    },
+    {
+      "epoch": 0.06616609815235723,
+      "grad_norm": 0.002760351402685046,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 2398
+    },
+    {
+      "epoch": 0.06619369035342161,
+      "grad_norm": 0.0054167998023331165,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 2399
+    },
+    {
+      "epoch": 0.06622128255448598,
+      "grad_norm": 0.002874811412766576,
+      "learning_rate": 0.001,
+      "loss": 0.3747,
+      "step": 2400
+    },
+    {
+      "epoch": 0.06624887475555034,
+      "grad_norm": 0.0038131948094815016,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 2401
+    },
+    {
+      "epoch": 0.06627646695661471,
+      "grad_norm": 0.01067175529897213,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 2402
+    },
+    {
+      "epoch": 0.06630405915767908,
+      "grad_norm": 0.003224137471988797,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 2403
+    },
+    {
+      "epoch": 0.06633165135874346,
+      "grad_norm": 0.004995389375835657,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 2404
+    },
+    {
+      "epoch": 0.06635924355980782,
+      "grad_norm": 0.0033968077041208744,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 2405
+    },
+    {
+      "epoch": 0.06638683576087219,
+      "grad_norm": 0.003337099449709058,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 2406
+    },
+    {
+      "epoch": 0.06641442796193656,
+      "grad_norm": 0.0033400380052626133,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 2407
+    },
+    {
+      "epoch": 0.06644202016300092,
+      "grad_norm": 0.0059796725399792194,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 2408
+    },
+    {
+      "epoch": 0.0664696123640653,
+      "grad_norm": 0.0036425914149731398,
+      "learning_rate": 0.001,
+      "loss": 0.3791,
+      "step": 2409
+    },
+    {
+      "epoch": 0.06649720456512967,
+      "grad_norm": 0.003386643249541521,
+      "learning_rate": 0.001,
+      "loss": 0.4226,
+      "step": 2410
+    },
+    {
+      "epoch": 0.06652479676619404,
+      "grad_norm": 0.0025921366177499294,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 2411
+    },
+    {
+      "epoch": 0.0665523889672584,
+      "grad_norm": 0.003707844065502286,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 2412
+    },
+    {
+      "epoch": 0.06657998116832277,
+      "grad_norm": 0.005570289213210344,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 2413
+    },
+    {
+      "epoch": 0.06660757336938715,
+      "grad_norm": 0.004504153970628977,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 2414
+    },
+    {
+      "epoch": 0.06663516557045152,
+      "grad_norm": 0.0021061068400740623,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 2415
+    },
+    {
+      "epoch": 0.06666275777151588,
+      "grad_norm": 0.00822344422340393,
+      "learning_rate": 0.001,
+      "loss": 0.3713,
+      "step": 2416
+    },
+    {
+      "epoch": 0.06669034997258025,
+      "grad_norm": 0.003418115433305502,
+      "learning_rate": 0.001,
+      "loss": 0.4284,
+      "step": 2417
+    },
+    {
+      "epoch": 0.06671794217364461,
+      "grad_norm": 0.015280312858521938,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 2418
+    },
+    {
+      "epoch": 0.066745534374709,
+      "grad_norm": 0.006191336549818516,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 2419
+    },
+    {
+      "epoch": 0.06677312657577336,
+      "grad_norm": 0.0025210208259522915,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 2420
+    },
+    {
+      "epoch": 0.06680071877683773,
+      "grad_norm": 0.003629886545240879,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 2421
+    },
+    {
+      "epoch": 0.0668283109779021,
+      "grad_norm": 0.005250046495348215,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 2422
+    },
+    {
+      "epoch": 0.06685590317896646,
+      "grad_norm": 0.0036179288290441036,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 2423
+    },
+    {
+      "epoch": 0.06688349538003083,
+      "grad_norm": 0.005279705859720707,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 2424
+    },
+    {
+      "epoch": 0.06691108758109521,
+      "grad_norm": 0.0030266193207353354,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 2425
+    },
+    {
+      "epoch": 0.06693867978215957,
+      "grad_norm": 0.007787918671965599,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 2426
+    },
+    {
+      "epoch": 0.06696627198322394,
+      "grad_norm": 0.00260373717173934,
+      "learning_rate": 0.001,
+      "loss": 0.3733,
+      "step": 2427
+    },
+    {
+      "epoch": 0.06699386418428831,
+      "grad_norm": 0.002870837925001979,
+      "learning_rate": 0.001,
+      "loss": 0.4329,
+      "step": 2428
+    },
+    {
+      "epoch": 0.06702145638535267,
+      "grad_norm": 0.0024147257208824158,
+      "learning_rate": 0.001,
+      "loss": 0.4342,
+      "step": 2429
+    },
+    {
+      "epoch": 0.06704904858641705,
+      "grad_norm": 0.003153110621497035,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 2430
+    },
+    {
+      "epoch": 0.06707664078748142,
+      "grad_norm": 0.0036289046984165907,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 2431
+    },
+    {
+      "epoch": 0.06710423298854579,
+      "grad_norm": 0.003097312757745385,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 2432
+    },
+    {
+      "epoch": 0.06713182518961015,
+      "grad_norm": 0.003282829187810421,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 2433
+    },
+    {
+      "epoch": 0.06715941739067452,
+      "grad_norm": 0.006064482033252716,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 2434
+    },
+    {
+      "epoch": 0.0671870095917389,
+      "grad_norm": 0.007535384502261877,
+      "learning_rate": 0.001,
+      "loss": 0.4349,
+      "step": 2435
+    },
+    {
+      "epoch": 0.06721460179280327,
+      "grad_norm": 0.003580626333132386,
+      "learning_rate": 0.001,
+      "loss": 0.3686,
+      "step": 2436
+    },
+    {
+      "epoch": 0.06724219399386763,
+      "grad_norm": 0.004072824027389288,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 2437
+    },
+    {
+      "epoch": 0.067269786194932,
+      "grad_norm": 0.0034644294064491987,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 2438
+    },
+    {
+      "epoch": 0.06729737839599637,
+      "grad_norm": 0.0031727415043860674,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 2439
+    },
+    {
+      "epoch": 0.06732497059706075,
+      "grad_norm": 0.0025959559716284275,
+      "learning_rate": 0.001,
+      "loss": 0.4412,
+      "step": 2440
+    },
+    {
+      "epoch": 0.06735256279812511,
+      "grad_norm": 0.007867682725191116,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 2441
+    },
+    {
+      "epoch": 0.06738015499918948,
+      "grad_norm": 0.003531220369040966,
+      "learning_rate": 0.001,
+      "loss": 0.3763,
+      "step": 2442
+    },
+    {
+      "epoch": 0.06740774720025385,
+      "grad_norm": 0.0036349524743855,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 2443
+    },
+    {
+      "epoch": 0.06743533940131821,
+      "grad_norm": 0.003158099949359894,
+      "learning_rate": 0.001,
+      "loss": 0.429,
+      "step": 2444
+    },
+    {
+      "epoch": 0.06746293160238259,
+      "grad_norm": 0.002893587574362755,
+      "learning_rate": 0.001,
+      "loss": 0.4316,
+      "step": 2445
+    },
+    {
+      "epoch": 0.06749052380344696,
+      "grad_norm": 0.002769331680610776,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 2446
+    },
+    {
+      "epoch": 0.06751811600451132,
+      "grad_norm": 0.0028869437519460917,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 2447
+    },
+    {
+      "epoch": 0.06754570820557569,
+      "grad_norm": 0.004190088715404272,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 2448
+    },
+    {
+      "epoch": 0.06757330040664006,
+      "grad_norm": 0.00414698664098978,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 2449
+    },
+    {
+      "epoch": 0.06760089260770444,
+      "grad_norm": 0.0034248684532940388,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 2450
+    },
+    {
+      "epoch": 0.0676284848087688,
+      "grad_norm": 0.0032379438634961843,
+      "learning_rate": 0.001,
+      "loss": 0.3517,
+      "step": 2451
+    },
+    {
+      "epoch": 0.06765607700983317,
+      "grad_norm": 0.0037679013330489397,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 2452
+    },
+    {
+      "epoch": 0.06768366921089754,
+      "grad_norm": 0.0025305391754955053,
+      "learning_rate": 0.001,
+      "loss": 0.3749,
+      "step": 2453
+    },
+    {
+      "epoch": 0.0677112614119619,
+      "grad_norm": 0.010560314171016216,
+      "learning_rate": 0.001,
+      "loss": 0.3775,
+      "step": 2454
+    },
+    {
+      "epoch": 0.06773885361302628,
+      "grad_norm": 0.0022722717840224504,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 2455
+    },
+    {
+      "epoch": 0.06776644581409065,
+      "grad_norm": 0.0035940262023359537,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 2456
+    },
+    {
+      "epoch": 0.06779403801515502,
+      "grad_norm": 0.0032793956343084574,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 2457
+    },
+    {
+      "epoch": 0.06782163021621938,
+      "grad_norm": 0.0032437799964100122,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 2458
+    },
+    {
+      "epoch": 0.06784922241728375,
+      "grad_norm": 0.004813835956156254,
+      "learning_rate": 0.001,
+      "loss": 0.3529,
+      "step": 2459
+    },
+    {
+      "epoch": 0.06787681461834813,
+      "grad_norm": 0.0028039722237735987,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 2460
+    },
+    {
+      "epoch": 0.0679044068194125,
+      "grad_norm": 0.0028152938466519117,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 2461
+    },
+    {
+      "epoch": 0.06793199902047686,
+      "grad_norm": 0.004003360401839018,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 2462
+    },
+    {
+      "epoch": 0.06795959122154123,
+      "grad_norm": 0.003837777767330408,
+      "learning_rate": 0.001,
+      "loss": 0.3715,
+      "step": 2463
+    },
+    {
+      "epoch": 0.0679871834226056,
+      "grad_norm": 0.0043929507955908775,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 2464
+    },
+    {
+      "epoch": 0.06801477562366998,
+      "grad_norm": 0.005224680993705988,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 2465
+    },
+    {
+      "epoch": 0.06804236782473434,
+      "grad_norm": 0.0029568690806627274,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 2466
+    },
+    {
+      "epoch": 0.06806996002579871,
+      "grad_norm": 0.0061539956368505955,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 2467
+    },
+    {
+      "epoch": 0.06809755222686308,
+      "grad_norm": 0.003775696037337184,
+      "learning_rate": 0.001,
+      "loss": 0.3627,
+      "step": 2468
+    },
+    {
+      "epoch": 0.06812514442792744,
+      "grad_norm": 0.003055511973798275,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 2469
+    },
+    {
+      "epoch": 0.06815273662899182,
+      "grad_norm": 0.004253302235156298,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 2470
+    },
+    {
+      "epoch": 0.06818032883005619,
+      "grad_norm": 0.00398953165858984,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 2471
+    },
+    {
+      "epoch": 0.06820792103112056,
+      "grad_norm": 0.005971815902739763,
+      "learning_rate": 0.001,
+      "loss": 0.3704,
+      "step": 2472
+    },
+    {
+      "epoch": 0.06823551323218492,
+      "grad_norm": 0.0031450032256543636,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 2473
+    },
+    {
+      "epoch": 0.06826310543324929,
+      "grad_norm": 0.013066442683339119,
+      "learning_rate": 0.001,
+      "loss": 0.374,
+      "step": 2474
+    },
+    {
+      "epoch": 0.06829069763431365,
+      "grad_norm": 0.005165675655007362,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 2475
+    },
+    {
+      "epoch": 0.06831828983537803,
+      "grad_norm": 0.002924390835687518,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 2476
+    },
+    {
+      "epoch": 0.0683458820364424,
+      "grad_norm": 0.0032948816660791636,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 2477
+    },
+    {
+      "epoch": 0.06837347423750677,
+      "grad_norm": 0.004480746109038591,
+      "learning_rate": 0.001,
+      "loss": 0.3155,
+      "step": 2478
+    },
+    {
+      "epoch": 0.06840106643857113,
+      "grad_norm": 0.0036851740442216396,
+      "learning_rate": 0.001,
+      "loss": 0.3506,
+      "step": 2479
+    },
+    {
+      "epoch": 0.0684286586396355,
+      "grad_norm": 0.0027917283587157726,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 2480
+    },
+    {
+      "epoch": 0.06845625084069988,
+      "grad_norm": 0.0024517588317394257,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 2481
+    },
+    {
+      "epoch": 0.06848384304176425,
+      "grad_norm": 0.002968950429931283,
+      "learning_rate": 0.001,
+      "loss": 0.4209,
+      "step": 2482
+    },
+    {
+      "epoch": 0.06851143524282861,
+      "grad_norm": 0.0027774290647357702,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 2483
+    },
+    {
+      "epoch": 0.06853902744389298,
+      "grad_norm": 0.002697261283174157,
+      "learning_rate": 0.001,
+      "loss": 0.4455,
+      "step": 2484
+    },
+    {
+      "epoch": 0.06856661964495735,
+      "grad_norm": 0.003583157667890191,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 2485
+    },
+    {
+      "epoch": 0.06859421184602173,
+      "grad_norm": 0.002954701893031597,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 2486
+    },
+    {
+      "epoch": 0.0686218040470861,
+      "grad_norm": 0.00595908472314477,
+      "learning_rate": 0.001,
+      "loss": 0.4411,
+      "step": 2487
+    },
+    {
+      "epoch": 0.06864939624815046,
+      "grad_norm": 0.0025385827757418156,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 2488
+    },
+    {
+      "epoch": 0.06867698844921483,
+      "grad_norm": 0.0031357340048998594,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 2489
+    },
+    {
+      "epoch": 0.06870458065027919,
+      "grad_norm": 0.006353291217237711,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 2490
+    },
+    {
+      "epoch": 0.06873217285134357,
+      "grad_norm": 0.0036874916404485703,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 2491
+    },
+    {
+      "epoch": 0.06875976505240794,
+      "grad_norm": 0.0032723871991038322,
+      "learning_rate": 0.001,
+      "loss": 0.4342,
+      "step": 2492
+    },
+    {
+      "epoch": 0.0687873572534723,
+      "grad_norm": 0.004007409326732159,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 2493
+    },
+    {
+      "epoch": 0.06881494945453667,
+      "grad_norm": 0.0029772506095469,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 2494
+    },
+    {
+      "epoch": 0.06884254165560104,
+      "grad_norm": 0.002340099308639765,
+      "learning_rate": 0.001,
+      "loss": 0.4351,
+      "step": 2495
+    },
+    {
+      "epoch": 0.06887013385666542,
+      "grad_norm": 0.0038869476411491632,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 2496
+    },
+    {
+      "epoch": 0.06889772605772979,
+      "grad_norm": 0.004465511068701744,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 2497
+    },
+    {
+      "epoch": 0.06892531825879415,
+      "grad_norm": 0.004165589809417725,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 2498
+    },
+    {
+      "epoch": 0.06895291045985852,
+      "grad_norm": 0.0023313488345593214,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 2499
+    },
+    {
+      "epoch": 0.06898050266092288,
+      "grad_norm": 0.004344920627772808,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 2500
+    },
+    {
+      "epoch": 0.06898050266092288,
+      "eval_runtime": 24.3559,
+      "eval_samples_per_second": 1.314,
+      "eval_steps_per_second": 0.164,
+      "step": 2500
+    },
+    {
+      "epoch": 0.06900809486198727,
+      "grad_norm": 0.0032904972322285175,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 2501
+    },
+    {
+      "epoch": 0.06903568706305163,
+      "grad_norm": 0.0037113146390765905,
+      "learning_rate": 0.001,
+      "loss": 0.3731,
+      "step": 2502
+    },
+    {
+      "epoch": 0.069063279264116,
+      "grad_norm": 0.0028964923694729805,
+      "learning_rate": 0.001,
+      "loss": 0.4537,
+      "step": 2503
+    },
+    {
+      "epoch": 0.06909087146518036,
+      "grad_norm": 0.005055161193013191,
+      "learning_rate": 0.001,
+      "loss": 0.3704,
+      "step": 2504
+    },
+    {
+      "epoch": 0.06911846366624473,
+      "grad_norm": 0.0033460462000221014,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 2505
+    },
+    {
+      "epoch": 0.06914605586730911,
+      "grad_norm": 0.00268213776871562,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 2506
+    },
+    {
+      "epoch": 0.06917364806837348,
+      "grad_norm": 0.0038208633195608854,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 2507
+    },
+    {
+      "epoch": 0.06920124026943784,
+      "grad_norm": 0.0029110198374837637,
+      "learning_rate": 0.001,
+      "loss": 0.4377,
+      "step": 2508
+    },
+    {
+      "epoch": 0.06922883247050221,
+      "grad_norm": 0.004070633556693792,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 2509
+    },
+    {
+      "epoch": 0.06925642467156658,
+      "grad_norm": 0.004861120600253344,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 2510
+    },
+    {
+      "epoch": 0.06928401687263096,
+      "grad_norm": 0.004837087355554104,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 2511
+    },
+    {
+      "epoch": 0.06931160907369532,
+      "grad_norm": 0.005938942078500986,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 2512
+    },
+    {
+      "epoch": 0.06933920127475969,
+      "grad_norm": 0.007488689385354519,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 2513
+    },
+    {
+      "epoch": 0.06936679347582406,
+      "grad_norm": 0.007844404317438602,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 2514
+    },
+    {
+      "epoch": 0.06939438567688842,
+      "grad_norm": 0.006541546434164047,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 2515
+    },
+    {
+      "epoch": 0.0694219778779528,
+      "grad_norm": 0.004513300955295563,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 2516
+    },
+    {
+      "epoch": 0.06944957007901717,
+      "grad_norm": 0.0028703995048999786,
+      "learning_rate": 0.001,
+      "loss": 0.4453,
+      "step": 2517
+    },
+    {
+      "epoch": 0.06947716228008154,
+      "grad_norm": 0.0036693988367915154,
+      "learning_rate": 0.001,
+      "loss": 0.4475,
+      "step": 2518
+    },
+    {
+      "epoch": 0.0695047544811459,
+      "grad_norm": 0.0035753645934164524,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 2519
+    },
+    {
+      "epoch": 0.06953234668221027,
+      "grad_norm": 0.0029638251289725304,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 2520
+    },
+    {
+      "epoch": 0.06955993888327464,
+      "grad_norm": 0.002767800120636821,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 2521
+    },
+    {
+      "epoch": 0.06958753108433902,
+      "grad_norm": 0.00324231362901628,
+      "learning_rate": 0.001,
+      "loss": 0.4276,
+      "step": 2522
+    },
+    {
+      "epoch": 0.06961512328540338,
+      "grad_norm": 0.0024813879281282425,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 2523
+    },
+    {
+      "epoch": 0.06964271548646775,
+      "grad_norm": 0.0032533248886466026,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 2524
+    },
+    {
+      "epoch": 0.06967030768753212,
+      "grad_norm": 0.005439402535557747,
+      "learning_rate": 0.001,
+      "loss": 0.3758,
+      "step": 2525
+    },
+    {
+      "epoch": 0.06969789988859648,
+      "grad_norm": 0.002291264943778515,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 2526
+    },
+    {
+      "epoch": 0.06972549208966086,
+      "grad_norm": 0.0028953540604561567,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 2527
+    },
+    {
+      "epoch": 0.06975308429072523,
+      "grad_norm": 0.003083001123741269,
+      "learning_rate": 0.001,
+      "loss": 0.4227,
+      "step": 2528
+    },
+    {
+      "epoch": 0.0697806764917896,
+      "grad_norm": 0.00382791250012815,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 2529
+    },
+    {
+      "epoch": 0.06980826869285396,
+      "grad_norm": 0.004844884853810072,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 2530
+    },
+    {
+      "epoch": 0.06983586089391833,
+      "grad_norm": 0.0025243901181966066,
+      "learning_rate": 0.001,
+      "loss": 0.4334,
+      "step": 2531
+    },
+    {
+      "epoch": 0.06986345309498271,
+      "grad_norm": 0.003347366116940975,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 2532
+    },
+    {
+      "epoch": 0.06989104529604707,
+      "grad_norm": 0.004716082476079464,
+      "learning_rate": 0.001,
+      "loss": 0.3485,
+      "step": 2533
+    },
+    {
+      "epoch": 0.06991863749711144,
+      "grad_norm": 0.004542906302958727,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 2534
+    },
+    {
+      "epoch": 0.06994622969817581,
+      "grad_norm": 0.002997052390128374,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 2535
+    },
+    {
+      "epoch": 0.06997382189924017,
+      "grad_norm": 0.0032483143731951714,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 2536
+    },
+    {
+      "epoch": 0.07000141410030455,
+      "grad_norm": 0.0036252515856176615,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 2537
+    },
+    {
+      "epoch": 0.07002900630136892,
+      "grad_norm": 0.003113416489213705,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 2538
+    },
+    {
+      "epoch": 0.07005659850243329,
+      "grad_norm": 0.003129280637949705,
+      "learning_rate": 0.001,
+      "loss": 0.3711,
+      "step": 2539
+    },
+    {
+      "epoch": 0.07008419070349765,
+      "grad_norm": 0.005747736897319555,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 2540
+    },
+    {
+      "epoch": 0.07011178290456202,
+      "grad_norm": 0.01309170015156269,
+      "learning_rate": 0.001,
+      "loss": 0.4313,
+      "step": 2541
+    },
+    {
+      "epoch": 0.0701393751056264,
+      "grad_norm": 0.0030168737284839153,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 2542
+    },
+    {
+      "epoch": 0.07016696730669077,
+      "grad_norm": 0.0039615570567548275,
+      "learning_rate": 0.001,
+      "loss": 0.4214,
+      "step": 2543
+    },
+    {
+      "epoch": 0.07019455950775513,
+      "grad_norm": 0.034471940249204636,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 2544
+    },
+    {
+      "epoch": 0.0702221517088195,
+      "grad_norm": 0.00423240615054965,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 2545
+    },
+    {
+      "epoch": 0.07024974390988387,
+      "grad_norm": 0.004391579423099756,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 2546
+    },
+    {
+      "epoch": 0.07027733611094825,
+      "grad_norm": 0.004261813126504421,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 2547
+    },
+    {
+      "epoch": 0.07030492831201261,
+      "grad_norm": 0.005609198939055204,
+      "learning_rate": 0.001,
+      "loss": 0.3745,
+      "step": 2548
+    },
+    {
+      "epoch": 0.07033252051307698,
+      "grad_norm": 0.0033081392757594585,
+      "learning_rate": 0.001,
+      "loss": 0.3667,
+      "step": 2549
+    },
+    {
+      "epoch": 0.07036011271414135,
+      "grad_norm": 0.0029132398776710033,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 2550
+    },
+    {
+      "epoch": 0.07038770491520571,
+      "grad_norm": 0.004867136478424072,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 2551
+    },
+    {
+      "epoch": 0.07041529711627009,
+      "grad_norm": 0.004058686550706625,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 2552
+    },
+    {
+      "epoch": 0.07044288931733446,
+      "grad_norm": 0.0027920163702219725,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 2553
+    },
+    {
+      "epoch": 0.07047048151839883,
+      "grad_norm": 0.00434753717854619,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 2554
+    },
+    {
+      "epoch": 0.07049807371946319,
+      "grad_norm": 0.0051083932630717754,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 2555
+    },
+    {
+      "epoch": 0.07052566592052756,
+      "grad_norm": 0.0038204751908779144,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 2556
+    },
+    {
+      "epoch": 0.07055325812159194,
+      "grad_norm": 0.0038525923155248165,
+      "learning_rate": 0.001,
+      "loss": 0.4203,
+      "step": 2557
+    },
+    {
+      "epoch": 0.0705808503226563,
+      "grad_norm": 0.0032179048284888268,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 2558
+    },
+    {
+      "epoch": 0.07060844252372067,
+      "grad_norm": 0.008977223187685013,
+      "learning_rate": 0.001,
+      "loss": 0.4188,
+      "step": 2559
+    },
+    {
+      "epoch": 0.07063603472478504,
+      "grad_norm": 0.004340124782174826,
+      "learning_rate": 0.001,
+      "loss": 0.3442,
+      "step": 2560
+    },
+    {
+      "epoch": 0.0706636269258494,
+      "grad_norm": 0.0031930410768836737,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 2561
+    },
+    {
+      "epoch": 0.07069121912691378,
+      "grad_norm": 0.0034286684822291136,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 2562
+    },
+    {
+      "epoch": 0.07071881132797815,
+      "grad_norm": 0.003299806034192443,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 2563
+    },
+    {
+      "epoch": 0.07074640352904252,
+      "grad_norm": 0.0048191421665251255,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 2564
+    },
+    {
+      "epoch": 0.07077399573010688,
+      "grad_norm": 0.006031329277902842,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 2565
+    },
+    {
+      "epoch": 0.07080158793117125,
+      "grad_norm": 0.00475485622882843,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 2566
+    },
+    {
+      "epoch": 0.07082918013223562,
+      "grad_norm": 0.0029015771578997374,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 2567
+    },
+    {
+      "epoch": 0.0708567723333,
+      "grad_norm": 0.0026210874784737825,
+      "learning_rate": 0.001,
+      "loss": 0.3808,
+      "step": 2568
+    },
+    {
+      "epoch": 0.07088436453436436,
+      "grad_norm": 0.00292952754534781,
+      "learning_rate": 0.001,
+      "loss": 0.4424,
+      "step": 2569
+    },
+    {
+      "epoch": 0.07091195673542873,
+      "grad_norm": 0.004151395056396723,
+      "learning_rate": 0.001,
+      "loss": 0.3488,
+      "step": 2570
+    },
+    {
+      "epoch": 0.0709395489364931,
+      "grad_norm": 0.003007207065820694,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 2571
+    },
+    {
+      "epoch": 0.07096714113755746,
+      "grad_norm": 0.0025580485817044973,
+      "learning_rate": 0.001,
+      "loss": 0.4374,
+      "step": 2572
+    },
+    {
+      "epoch": 0.07099473333862184,
+      "grad_norm": 0.0034552421420812607,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 2573
+    },
+    {
+      "epoch": 0.07102232553968621,
+      "grad_norm": 0.004341066349297762,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 2574
+    },
+    {
+      "epoch": 0.07104991774075058,
+      "grad_norm": 0.0033516965340822935,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 2575
+    },
+    {
+      "epoch": 0.07107750994181494,
+      "grad_norm": 0.0036367857828736305,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 2576
+    },
+    {
+      "epoch": 0.07110510214287931,
+      "grad_norm": 0.0029854371678084135,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 2577
+    },
+    {
+      "epoch": 0.07113269434394369,
+      "grad_norm": 0.002374214818701148,
+      "learning_rate": 0.001,
+      "loss": 0.4303,
+      "step": 2578
+    },
+    {
+      "epoch": 0.07116028654500806,
+      "grad_norm": 0.002954955445602536,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 2579
+    },
+    {
+      "epoch": 0.07118787874607242,
+      "grad_norm": 0.003998765256255865,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 2580
+    },
+    {
+      "epoch": 0.07121547094713679,
+      "grad_norm": 0.0029591761995106936,
+      "learning_rate": 0.001,
+      "loss": 0.4329,
+      "step": 2581
+    },
+    {
+      "epoch": 0.07124306314820116,
+      "grad_norm": 0.0039256964810192585,
+      "learning_rate": 0.001,
+      "loss": 0.4368,
+      "step": 2582
+    },
+    {
+      "epoch": 0.07127065534926554,
+      "grad_norm": 0.005670437589287758,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 2583
+    },
+    {
+      "epoch": 0.0712982475503299,
+      "grad_norm": 0.0024786926805973053,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 2584
+    },
+    {
+      "epoch": 0.07132583975139427,
+      "grad_norm": 0.0045395889319479465,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 2585
+    },
+    {
+      "epoch": 0.07135343195245863,
+      "grad_norm": 0.002754254499450326,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 2586
+    },
+    {
+      "epoch": 0.071381024153523,
+      "grad_norm": 0.0032517199870198965,
+      "learning_rate": 0.001,
+      "loss": 0.3633,
+      "step": 2587
+    },
+    {
+      "epoch": 0.07140861635458738,
+      "grad_norm": 0.0037906805519014597,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 2588
+    },
+    {
+      "epoch": 0.07143620855565175,
+      "grad_norm": 0.00529517512768507,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 2589
+    },
+    {
+      "epoch": 0.07146380075671611,
+      "grad_norm": 0.00859268568456173,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 2590
+    },
+    {
+      "epoch": 0.07149139295778048,
+      "grad_norm": 0.003779566613957286,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 2591
+    },
+    {
+      "epoch": 0.07151898515884485,
+      "grad_norm": 0.0031110162381082773,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 2592
+    },
+    {
+      "epoch": 0.07154657735990923,
+      "grad_norm": 0.004116731230169535,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 2593
+    },
+    {
+      "epoch": 0.0715741695609736,
+      "grad_norm": 0.00332248630002141,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 2594
+    },
+    {
+      "epoch": 0.07160176176203796,
+      "grad_norm": 0.003030715975910425,
+      "learning_rate": 0.001,
+      "loss": 0.4413,
+      "step": 2595
+    },
+    {
+      "epoch": 0.07162935396310233,
+      "grad_norm": 0.0028603002429008484,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 2596
+    },
+    {
+      "epoch": 0.0716569461641667,
+      "grad_norm": 0.0035293481778353453,
+      "learning_rate": 0.001,
+      "loss": 0.4229,
+      "step": 2597
+    },
+    {
+      "epoch": 0.07168453836523107,
+      "grad_norm": 0.004880653228610754,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 2598
+    },
+    {
+      "epoch": 0.07171213056629544,
+      "grad_norm": 0.0037580877542495728,
+      "learning_rate": 0.001,
+      "loss": 0.373,
+      "step": 2599
+    },
+    {
+      "epoch": 0.0717397227673598,
+      "grad_norm": 0.004738042131066322,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 2600
+    },
+    {
+      "epoch": 0.07176731496842417,
+      "grad_norm": 0.0030720685608685017,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 2601
+    },
+    {
+      "epoch": 0.07179490716948854,
+      "grad_norm": 0.003540902165696025,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 2602
+    },
+    {
+      "epoch": 0.07182249937055292,
+      "grad_norm": 0.0028504952788352966,
+      "learning_rate": 0.001,
+      "loss": 0.3778,
+      "step": 2603
+    },
+    {
+      "epoch": 0.07185009157161729,
+      "grad_norm": 0.0041265105828642845,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 2604
+    },
+    {
+      "epoch": 0.07187768377268165,
+      "grad_norm": 0.0034538819454610348,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 2605
+    },
+    {
+      "epoch": 0.07190527597374602,
+      "grad_norm": 0.002804661402478814,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 2606
+    },
+    {
+      "epoch": 0.07193286817481039,
+      "grad_norm": 0.006338078528642654,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 2607
+    },
+    {
+      "epoch": 0.07196046037587477,
+      "grad_norm": 0.0034297939855605364,
+      "learning_rate": 0.001,
+      "loss": 0.3497,
+      "step": 2608
+    },
+    {
+      "epoch": 0.07198805257693913,
+      "grad_norm": 0.0029641755390912294,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 2609
+    },
+    {
+      "epoch": 0.0720156447780035,
+      "grad_norm": 0.0038228612393140793,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 2610
+    },
+    {
+      "epoch": 0.07204323697906787,
+      "grad_norm": 0.0033363320399075747,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 2611
+    },
+    {
+      "epoch": 0.07207082918013223,
+      "grad_norm": 0.002400481840595603,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 2612
+    },
+    {
+      "epoch": 0.0720984213811966,
+      "grad_norm": 0.00315939006395638,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 2613
+    },
+    {
+      "epoch": 0.07212601358226098,
+      "grad_norm": 0.00354985473677516,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 2614
+    },
+    {
+      "epoch": 0.07215360578332534,
+      "grad_norm": 0.0028168221469968557,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 2615
+    },
+    {
+      "epoch": 0.07218119798438971,
+      "grad_norm": 0.0025577114429324865,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 2616
+    },
+    {
+      "epoch": 0.07220879018545408,
+      "grad_norm": 0.003505377098917961,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 2617
+    },
+    {
+      "epoch": 0.07223638238651844,
+      "grad_norm": 0.005422186106443405,
+      "learning_rate": 0.001,
+      "loss": 0.3677,
+      "step": 2618
+    },
+    {
+      "epoch": 0.07226397458758282,
+      "grad_norm": 0.003959451802074909,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 2619
+    },
+    {
+      "epoch": 0.07229156678864719,
+      "grad_norm": 0.002390485256910324,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 2620
+    },
+    {
+      "epoch": 0.07231915898971156,
+      "grad_norm": 0.0019209344172850251,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 2621
+    },
+    {
+      "epoch": 0.07234675119077592,
+      "grad_norm": 0.0027938170824199915,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 2622
+    },
+    {
+      "epoch": 0.07237434339184029,
+      "grad_norm": 0.002219612244516611,
+      "learning_rate": 0.001,
+      "loss": 0.4308,
+      "step": 2623
+    },
+    {
+      "epoch": 0.07240193559290467,
+      "grad_norm": 0.002710830420255661,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 2624
+    },
+    {
+      "epoch": 0.07242952779396904,
+      "grad_norm": 0.002441881690174341,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 2625
+    },
+    {
+      "epoch": 0.0724571199950334,
+      "grad_norm": 0.002835171762853861,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 2626
+    },
+    {
+      "epoch": 0.07248471219609777,
+      "grad_norm": 0.005800854880362749,
+      "learning_rate": 0.001,
+      "loss": 0.3621,
+      "step": 2627
+    },
+    {
+      "epoch": 0.07251230439716214,
+      "grad_norm": 0.0058226133696734905,
+      "learning_rate": 0.001,
+      "loss": 0.4385,
+      "step": 2628
+    },
+    {
+      "epoch": 0.07253989659822652,
+      "grad_norm": 0.0031959593761712313,
+      "learning_rate": 0.001,
+      "loss": 0.4129,
+      "step": 2629
+    },
+    {
+      "epoch": 0.07256748879929088,
+      "grad_norm": 0.007225159555673599,
+      "learning_rate": 0.001,
+      "loss": 0.3695,
+      "step": 2630
+    },
+    {
+      "epoch": 0.07259508100035525,
+      "grad_norm": 0.003910002298653126,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 2631
+    },
+    {
+      "epoch": 0.07262267320141962,
+      "grad_norm": 0.0047545540146529675,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 2632
+    },
+    {
+      "epoch": 0.07265026540248398,
+      "grad_norm": 0.0037914151325821877,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 2633
+    },
+    {
+      "epoch": 0.07267785760354836,
+      "grad_norm": 0.007707824464887381,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 2634
+    },
+    {
+      "epoch": 0.07270544980461273,
+      "grad_norm": 0.0031401002779603004,
+      "learning_rate": 0.001,
+      "loss": 0.3622,
+      "step": 2635
+    },
+    {
+      "epoch": 0.0727330420056771,
+      "grad_norm": 0.005065685138106346,
+      "learning_rate": 0.001,
+      "loss": 0.3612,
+      "step": 2636
+    },
+    {
+      "epoch": 0.07276063420674146,
+      "grad_norm": 0.0033927711192518473,
+      "learning_rate": 0.001,
+      "loss": 0.4286,
+      "step": 2637
+    },
+    {
+      "epoch": 0.07278822640780583,
+      "grad_norm": 0.0026958470698446035,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 2638
+    },
+    {
+      "epoch": 0.07281581860887021,
+      "grad_norm": 0.004273698199540377,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 2639
+    },
+    {
+      "epoch": 0.07284341080993458,
+      "grad_norm": 0.0033470946364104748,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 2640
+    },
+    {
+      "epoch": 0.07287100301099894,
+      "grad_norm": 0.0030789680313318968,
+      "learning_rate": 0.001,
+      "loss": 0.3801,
+      "step": 2641
+    },
+    {
+      "epoch": 0.07289859521206331,
+      "grad_norm": 0.003024066798388958,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 2642
+    },
+    {
+      "epoch": 0.07292618741312767,
+      "grad_norm": 0.0035327670630067587,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 2643
+    },
+    {
+      "epoch": 0.07295377961419205,
+      "grad_norm": 0.005246289074420929,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 2644
+    },
+    {
+      "epoch": 0.07298137181525642,
+      "grad_norm": 0.0033100054133683443,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 2645
+    },
+    {
+      "epoch": 0.07300896401632079,
+      "grad_norm": 0.014535458758473396,
+      "learning_rate": 0.001,
+      "loss": 0.4149,
+      "step": 2646
+    },
+    {
+      "epoch": 0.07303655621738515,
+      "grad_norm": 0.006717653013765812,
+      "learning_rate": 0.001,
+      "loss": 0.4366,
+      "step": 2647
+    },
+    {
+      "epoch": 0.07306414841844952,
+      "grad_norm": 0.005544296000152826,
+      "learning_rate": 0.001,
+      "loss": 0.3546,
+      "step": 2648
+    },
+    {
+      "epoch": 0.0730917406195139,
+      "grad_norm": 0.002779677277430892,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 2649
+    },
+    {
+      "epoch": 0.07311933282057827,
+      "grad_norm": 0.0030521864537149668,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 2650
+    },
+    {
+      "epoch": 0.07314692502164263,
+      "grad_norm": 0.0032854536548256874,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 2651
+    },
+    {
+      "epoch": 0.073174517222707,
+      "grad_norm": 0.002539557870477438,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 2652
+    },
+    {
+      "epoch": 0.07320210942377137,
+      "grad_norm": 0.004703735467046499,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 2653
+    },
+    {
+      "epoch": 0.07322970162483575,
+      "grad_norm": 0.0029251237865537405,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 2654
+    },
+    {
+      "epoch": 0.07325729382590011,
+      "grad_norm": 0.0026435167528688908,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 2655
+    },
+    {
+      "epoch": 0.07328488602696448,
+      "grad_norm": 0.004778844770044088,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 2656
+    },
+    {
+      "epoch": 0.07331247822802885,
+      "grad_norm": 0.003458186285570264,
+      "learning_rate": 0.001,
+      "loss": 0.4593,
+      "step": 2657
+    },
+    {
+      "epoch": 0.07334007042909321,
+      "grad_norm": 0.0034521680790930986,
+      "learning_rate": 0.001,
+      "loss": 0.4509,
+      "step": 2658
+    },
+    {
+      "epoch": 0.07336766263015758,
+      "grad_norm": 0.002783542964607477,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 2659
+    },
+    {
+      "epoch": 0.07339525483122196,
+      "grad_norm": 0.004468636121600866,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 2660
+    },
+    {
+      "epoch": 0.07342284703228633,
+      "grad_norm": 0.0026452334132045507,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 2661
+    },
+    {
+      "epoch": 0.07345043923335069,
+      "grad_norm": 0.003701903624460101,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 2662
+    },
+    {
+      "epoch": 0.07347803143441506,
+      "grad_norm": 0.002505905693396926,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 2663
+    },
+    {
+      "epoch": 0.07350562363547943,
+      "grad_norm": 0.002993806730955839,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 2664
+    },
+    {
+      "epoch": 0.0735332158365438,
+      "grad_norm": 0.002987109124660492,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 2665
+    },
+    {
+      "epoch": 0.07356080803760817,
+      "grad_norm": 0.0028901277109980583,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 2666
+    },
+    {
+      "epoch": 0.07358840023867254,
+      "grad_norm": 0.0037864744663238525,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 2667
+    },
+    {
+      "epoch": 0.0736159924397369,
+      "grad_norm": 0.005365621764212847,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 2668
+    },
+    {
+      "epoch": 0.07364358464080127,
+      "grad_norm": 0.0025341627188026905,
+      "learning_rate": 0.001,
+      "loss": 0.4598,
+      "step": 2669
+    },
+    {
+      "epoch": 0.07367117684186565,
+      "grad_norm": 0.0026153249200433493,
+      "learning_rate": 0.001,
+      "loss": 0.4187,
+      "step": 2670
+    },
+    {
+      "epoch": 0.07369876904293002,
+      "grad_norm": 0.00424772035330534,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 2671
+    },
+    {
+      "epoch": 0.07372636124399438,
+      "grad_norm": 0.005840023048222065,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 2672
+    },
+    {
+      "epoch": 0.07375395344505875,
+      "grad_norm": 0.00886671431362629,
+      "learning_rate": 0.001,
+      "loss": 0.3735,
+      "step": 2673
+    },
+    {
+      "epoch": 0.07378154564612312,
+      "grad_norm": 0.004951406270265579,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 2674
+    },
+    {
+      "epoch": 0.0738091378471875,
+      "grad_norm": 0.004046993795782328,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 2675
+    },
+    {
+      "epoch": 0.07383673004825186,
+      "grad_norm": 0.003650445956736803,
+      "learning_rate": 0.001,
+      "loss": 0.4343,
+      "step": 2676
+    },
+    {
+      "epoch": 0.07386432224931623,
+      "grad_norm": 0.0027846968732774258,
+      "learning_rate": 0.001,
+      "loss": 0.4657,
+      "step": 2677
+    },
+    {
+      "epoch": 0.0738919144503806,
+      "grad_norm": 0.0027162914630025625,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 2678
+    },
+    {
+      "epoch": 0.07391950665144496,
+      "grad_norm": 0.002881822641938925,
+      "learning_rate": 0.001,
+      "loss": 0.435,
+      "step": 2679
+    },
+    {
+      "epoch": 0.07394709885250934,
+      "grad_norm": 0.006673680152744055,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 2680
+    },
+    {
+      "epoch": 0.07397469105357371,
+      "grad_norm": 0.0028485655784606934,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 2681
+    },
+    {
+      "epoch": 0.07400228325463808,
+      "grad_norm": 0.004903602413833141,
+      "learning_rate": 0.001,
+      "loss": 0.3578,
+      "step": 2682
+    },
+    {
+      "epoch": 0.07402987545570244,
+      "grad_norm": 0.010232421569526196,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 2683
+    },
+    {
+      "epoch": 0.07405746765676681,
+      "grad_norm": 0.01437798049300909,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 2684
+    },
+    {
+      "epoch": 0.07408505985783119,
+      "grad_norm": 0.0036920029670000076,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 2685
+    },
+    {
+      "epoch": 0.07411265205889556,
+      "grad_norm": 0.008351249620318413,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 2686
+    },
+    {
+      "epoch": 0.07414024425995992,
+      "grad_norm": 0.004162284545600414,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 2687
+    },
+    {
+      "epoch": 0.07416783646102429,
+      "grad_norm": 0.0036121357697993517,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 2688
+    },
+    {
+      "epoch": 0.07419542866208866,
+      "grad_norm": 0.0035279730800539255,
+      "learning_rate": 0.001,
+      "loss": 0.3711,
+      "step": 2689
+    },
+    {
+      "epoch": 0.07422302086315304,
+      "grad_norm": 0.0031304580625146627,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 2690
+    },
+    {
+      "epoch": 0.0742506130642174,
+      "grad_norm": 0.006544687785208225,
+      "learning_rate": 0.001,
+      "loss": 0.4303,
+      "step": 2691
+    },
+    {
+      "epoch": 0.07427820526528177,
+      "grad_norm": 0.004466759506613016,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 2692
+    },
+    {
+      "epoch": 0.07430579746634614,
+      "grad_norm": 0.0027272431179881096,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 2693
+    },
+    {
+      "epoch": 0.0743333896674105,
+      "grad_norm": 0.0032364106737077236,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 2694
+    },
+    {
+      "epoch": 0.07436098186847488,
+      "grad_norm": 0.002972652204334736,
+      "learning_rate": 0.001,
+      "loss": 0.3713,
+      "step": 2695
+    },
+    {
+      "epoch": 0.07438857406953925,
+      "grad_norm": 0.0027488903142511845,
+      "learning_rate": 0.001,
+      "loss": 0.3686,
+      "step": 2696
+    },
+    {
+      "epoch": 0.07441616627060361,
+      "grad_norm": 0.003380288602784276,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 2697
+    },
+    {
+      "epoch": 0.07444375847166798,
+      "grad_norm": 0.002856023609638214,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 2698
+    },
+    {
+      "epoch": 0.07447135067273235,
+      "grad_norm": 0.003780797589570284,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 2699
+    },
+    {
+      "epoch": 0.07449894287379673,
+      "grad_norm": 0.006125589832663536,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 2700
+    },
+    {
+      "epoch": 0.0745265350748611,
+      "grad_norm": 0.003866039216518402,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 2701
+    },
+    {
+      "epoch": 0.07455412727592546,
+      "grad_norm": 0.0035539071541279554,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 2702
+    },
+    {
+      "epoch": 0.07458171947698983,
+      "grad_norm": 0.003555738367140293,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 2703
+    },
+    {
+      "epoch": 0.0746093116780542,
+      "grad_norm": 0.0025483653880655766,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 2704
+    },
+    {
+      "epoch": 0.07463690387911857,
+      "grad_norm": 0.005306419916450977,
+      "learning_rate": 0.001,
+      "loss": 0.4259,
+      "step": 2705
+    },
+    {
+      "epoch": 0.07466449608018294,
+      "grad_norm": 0.004178240429610014,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 2706
+    },
+    {
+      "epoch": 0.0746920882812473,
+      "grad_norm": 0.0047726561315357685,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 2707
+    },
+    {
+      "epoch": 0.07471968048231167,
+      "grad_norm": 0.003424114780500531,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 2708
+    },
+    {
+      "epoch": 0.07474727268337604,
+      "grad_norm": 0.004055993165820837,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 2709
+    },
+    {
+      "epoch": 0.0747748648844404,
+      "grad_norm": 0.0035490887239575386,
+      "learning_rate": 0.001,
+      "loss": 0.4291,
+      "step": 2710
+    },
+    {
+      "epoch": 0.07480245708550479,
+      "grad_norm": 0.0075918626971542835,
+      "learning_rate": 0.001,
+      "loss": 0.4556,
+      "step": 2711
+    },
+    {
+      "epoch": 0.07483004928656915,
+      "grad_norm": 0.00878838449716568,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 2712
+    },
+    {
+      "epoch": 0.07485764148763352,
+      "grad_norm": 0.004037888254970312,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 2713
+    },
+    {
+      "epoch": 0.07488523368869789,
+      "grad_norm": 0.003275372786447406,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 2714
+    },
+    {
+      "epoch": 0.07491282588976225,
+      "grad_norm": 0.005921733099967241,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 2715
+    },
+    {
+      "epoch": 0.07494041809082663,
+      "grad_norm": 0.003877841867506504,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 2716
+    },
+    {
+      "epoch": 0.074968010291891,
+      "grad_norm": 0.004174409434199333,
+      "learning_rate": 0.001,
+      "loss": 0.385,
+      "step": 2717
+    },
+    {
+      "epoch": 0.07499560249295537,
+      "grad_norm": 0.004188715014606714,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 2718
+    },
+    {
+      "epoch": 0.07502319469401973,
+      "grad_norm": 0.0031484768260270357,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 2719
+    },
+    {
+      "epoch": 0.0750507868950841,
+      "grad_norm": 0.006613558623939753,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 2720
+    },
+    {
+      "epoch": 0.07507837909614848,
+      "grad_norm": 0.0036274839658290148,
+      "learning_rate": 0.001,
+      "loss": 0.3511,
+      "step": 2721
+    },
+    {
+      "epoch": 0.07510597129721285,
+      "grad_norm": 0.004630135837942362,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 2722
+    },
+    {
+      "epoch": 0.07513356349827721,
+      "grad_norm": 0.0061690667644143105,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 2723
+    },
+    {
+      "epoch": 0.07516115569934158,
+      "grad_norm": 0.005458046216517687,
+      "learning_rate": 0.001,
+      "loss": 0.3676,
+      "step": 2724
+    },
+    {
+      "epoch": 0.07518874790040594,
+      "grad_norm": 0.007112190593034029,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 2725
+    },
+    {
+      "epoch": 0.07521634010147032,
+      "grad_norm": 0.0031167047563940287,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 2726
+    },
+    {
+      "epoch": 0.07524393230253469,
+      "grad_norm": 0.003523972351104021,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 2727
+    },
+    {
+      "epoch": 0.07527152450359906,
+      "grad_norm": 0.003653216175734997,
+      "learning_rate": 0.001,
+      "loss": 0.4352,
+      "step": 2728
+    },
+    {
+      "epoch": 0.07529911670466342,
+      "grad_norm": 0.003052507760003209,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 2729
+    },
+    {
+      "epoch": 0.07532670890572779,
+      "grad_norm": 0.002761593321338296,
+      "learning_rate": 0.001,
+      "loss": 0.3611,
+      "step": 2730
+    },
+    {
+      "epoch": 0.07535430110679217,
+      "grad_norm": 0.0036866154987365007,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 2731
+    },
+    {
+      "epoch": 0.07538189330785654,
+      "grad_norm": 0.008540788665413857,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 2732
+    },
+    {
+      "epoch": 0.0754094855089209,
+      "grad_norm": 0.0023057356011122465,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 2733
+    },
+    {
+      "epoch": 0.07543707770998527,
+      "grad_norm": 0.006381066981703043,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 2734
+    },
+    {
+      "epoch": 0.07546466991104964,
+      "grad_norm": 0.0035177739337086678,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 2735
+    },
+    {
+      "epoch": 0.07549226211211402,
+      "grad_norm": 0.0035792491398751736,
+      "learning_rate": 0.001,
+      "loss": 0.3758,
+      "step": 2736
+    },
+    {
+      "epoch": 0.07551985431317838,
+      "grad_norm": 0.0046767196618020535,
+      "learning_rate": 0.001,
+      "loss": 0.4361,
+      "step": 2737
+    },
+    {
+      "epoch": 0.07554744651424275,
+      "grad_norm": 0.0037277627270668745,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 2738
+    },
+    {
+      "epoch": 0.07557503871530712,
+      "grad_norm": 0.003284999867901206,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 2739
+    },
+    {
+      "epoch": 0.07560263091637148,
+      "grad_norm": 0.0028962334617972374,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 2740
+    },
+    {
+      "epoch": 0.07563022311743586,
+      "grad_norm": 0.003682551207020879,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 2741
+    },
+    {
+      "epoch": 0.07565781531850023,
+      "grad_norm": 0.0028806542977690697,
+      "learning_rate": 0.001,
+      "loss": 0.3745,
+      "step": 2742
+    },
+    {
+      "epoch": 0.0756854075195646,
+      "grad_norm": 0.003922648727893829,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 2743
+    },
+    {
+      "epoch": 0.07571299972062896,
+      "grad_norm": 0.0026945569552481174,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 2744
+    },
+    {
+      "epoch": 0.07574059192169333,
+      "grad_norm": 0.0057632802054286,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 2745
+    },
+    {
+      "epoch": 0.07576818412275771,
+      "grad_norm": 0.006452036090195179,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 2746
+    },
+    {
+      "epoch": 0.07579577632382208,
+      "grad_norm": 0.0030313171446323395,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 2747
+    },
+    {
+      "epoch": 0.07582336852488644,
+      "grad_norm": 0.003243018174543977,
+      "learning_rate": 0.001,
+      "loss": 0.3503,
+      "step": 2748
+    },
+    {
+      "epoch": 0.07585096072595081,
+      "grad_norm": 0.0030355071648955345,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 2749
+    },
+    {
+      "epoch": 0.07587855292701517,
+      "grad_norm": 0.003429468721151352,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 2750
+    },
+    {
+      "epoch": 0.07590614512807956,
+      "grad_norm": 0.0029639608692377806,
+      "learning_rate": 0.001,
+      "loss": 0.4498,
+      "step": 2751
+    },
+    {
+      "epoch": 0.07593373732914392,
+      "grad_norm": 0.004542426206171513,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 2752
+    },
+    {
+      "epoch": 0.07596132953020829,
+      "grad_norm": 0.002826446434482932,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 2753
+    },
+    {
+      "epoch": 0.07598892173127265,
+      "grad_norm": 0.004107923712581396,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 2754
+    },
+    {
+      "epoch": 0.07601651393233702,
+      "grad_norm": 0.0028436153661459684,
+      "learning_rate": 0.001,
+      "loss": 0.4484,
+      "step": 2755
+    },
+    {
+      "epoch": 0.07604410613340139,
+      "grad_norm": 0.004834937863051891,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 2756
+    },
+    {
+      "epoch": 0.07607169833446577,
+      "grad_norm": 0.003390131751075387,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 2757
+    },
+    {
+      "epoch": 0.07609929053553013,
+      "grad_norm": 0.002966254251077771,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 2758
+    },
+    {
+      "epoch": 0.0761268827365945,
+      "grad_norm": 0.00300071039237082,
+      "learning_rate": 0.001,
+      "loss": 0.3683,
+      "step": 2759
+    },
+    {
+      "epoch": 0.07615447493765887,
+      "grad_norm": 0.0030495107639580965,
+      "learning_rate": 0.001,
+      "loss": 0.3625,
+      "step": 2760
+    },
+    {
+      "epoch": 0.07618206713872323,
+      "grad_norm": 0.002420577686280012,
+      "learning_rate": 0.001,
+      "loss": 0.4494,
+      "step": 2761
+    },
+    {
+      "epoch": 0.07620965933978761,
+      "grad_norm": 0.0036301531363278627,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 2762
+    },
+    {
+      "epoch": 0.07623725154085198,
+      "grad_norm": 0.0024638317991048098,
+      "learning_rate": 0.001,
+      "loss": 0.3652,
+      "step": 2763
+    },
+    {
+      "epoch": 0.07626484374191635,
+      "grad_norm": 0.003583789337426424,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 2764
+    },
+    {
+      "epoch": 0.07629243594298071,
+      "grad_norm": 0.003300134791061282,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 2765
+    },
+    {
+      "epoch": 0.07632002814404508,
+      "grad_norm": 0.0033223924692720175,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 2766
+    },
+    {
+      "epoch": 0.07634762034510946,
+      "grad_norm": 0.003998725675046444,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 2767
+    },
+    {
+      "epoch": 0.07637521254617383,
+      "grad_norm": 0.0030690559651702642,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 2768
+    },
+    {
+      "epoch": 0.07640280474723819,
+      "grad_norm": 0.0026303695049136877,
+      "learning_rate": 0.001,
+      "loss": 0.4359,
+      "step": 2769
+    },
+    {
+      "epoch": 0.07643039694830256,
+      "grad_norm": 0.004332841839641333,
+      "learning_rate": 0.001,
+      "loss": 0.3492,
+      "step": 2770
+    },
+    {
+      "epoch": 0.07645798914936693,
+      "grad_norm": 0.0027642296627163887,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 2771
+    },
+    {
+      "epoch": 0.0764855813504313,
+      "grad_norm": 0.0023598058614879847,
+      "learning_rate": 0.001,
+      "loss": 0.453,
+      "step": 2772
+    },
+    {
+      "epoch": 0.07651317355149567,
+      "grad_norm": 0.002782411640509963,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 2773
+    },
+    {
+      "epoch": 0.07654076575256004,
+      "grad_norm": 0.0030131624080240726,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 2774
+    },
+    {
+      "epoch": 0.0765683579536244,
+      "grad_norm": 0.003464979352429509,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 2775
+    },
+    {
+      "epoch": 0.07659595015468877,
+      "grad_norm": 0.002998175797984004,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 2776
+    },
+    {
+      "epoch": 0.07662354235575315,
+      "grad_norm": 0.004081446677446365,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 2777
+    },
+    {
+      "epoch": 0.07665113455681752,
+      "grad_norm": 0.02017812430858612,
+      "learning_rate": 0.001,
+      "loss": 0.4203,
+      "step": 2778
+    },
+    {
+      "epoch": 0.07667872675788188,
+      "grad_norm": 0.003272912697866559,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 2779
+    },
+    {
+      "epoch": 0.07670631895894625,
+      "grad_norm": 0.005462125409394503,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 2780
+    },
+    {
+      "epoch": 0.07673391116001062,
+      "grad_norm": 0.003259886521846056,
+      "learning_rate": 0.001,
+      "loss": 0.3656,
+      "step": 2781
+    },
+    {
+      "epoch": 0.076761503361075,
+      "grad_norm": 0.003059667069464922,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 2782
+    },
+    {
+      "epoch": 0.07678909556213936,
+      "grad_norm": 0.00428088940680027,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 2783
+    },
+    {
+      "epoch": 0.07681668776320373,
+      "grad_norm": 0.002609378658235073,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 2784
+    },
+    {
+      "epoch": 0.0768442799642681,
+      "grad_norm": 0.005715689156204462,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 2785
+    },
+    {
+      "epoch": 0.07687187216533246,
+      "grad_norm": 0.003460571402683854,
+      "learning_rate": 0.001,
+      "loss": 0.4222,
+      "step": 2786
+    },
+    {
+      "epoch": 0.07689946436639684,
+      "grad_norm": 0.002582078566774726,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 2787
+    },
+    {
+      "epoch": 0.07692705656746121,
+      "grad_norm": 0.003169649513438344,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 2788
+    },
+    {
+      "epoch": 0.07695464876852558,
+      "grad_norm": 0.005018650088459253,
+      "learning_rate": 0.001,
+      "loss": 0.4245,
+      "step": 2789
+    },
+    {
+      "epoch": 0.07698224096958994,
+      "grad_norm": 0.0027702595107257366,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 2790
+    },
+    {
+      "epoch": 0.07700983317065431,
+      "grad_norm": 0.0027409393806010485,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 2791
+    },
+    {
+      "epoch": 0.07703742537171869,
+      "grad_norm": 0.0031372166704386473,
+      "learning_rate": 0.001,
+      "loss": 0.438,
+      "step": 2792
+    },
+    {
+      "epoch": 0.07706501757278306,
+      "grad_norm": 0.0028250846080482006,
+      "learning_rate": 0.001,
+      "loss": 0.3607,
+      "step": 2793
+    },
+    {
+      "epoch": 0.07709260977384742,
+      "grad_norm": 0.003316509071737528,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 2794
+    },
+    {
+      "epoch": 0.07712020197491179,
+      "grad_norm": 0.002770907012745738,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 2795
+    },
+    {
+      "epoch": 0.07714779417597616,
+      "grad_norm": 0.002429973566904664,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 2796
+    },
+    {
+      "epoch": 0.07717538637704054,
+      "grad_norm": 0.0032115073408931494,
+      "learning_rate": 0.001,
+      "loss": 0.3559,
+      "step": 2797
+    },
+    {
+      "epoch": 0.0772029785781049,
+      "grad_norm": 0.006297094281762838,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 2798
+    },
+    {
+      "epoch": 0.07723057077916927,
+      "grad_norm": 0.00496833398938179,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 2799
+    },
+    {
+      "epoch": 0.07725816298023364,
+      "grad_norm": 0.002831167308613658,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 2800
+    },
+    {
+      "epoch": 0.077285755181298,
+      "grad_norm": 0.004788990132510662,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 2801
+    },
+    {
+      "epoch": 0.07731334738236237,
+      "grad_norm": 0.0026512015610933304,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 2802
+    },
+    {
+      "epoch": 0.07734093958342675,
+      "grad_norm": 0.003184010973200202,
+      "learning_rate": 0.001,
+      "loss": 0.3668,
+      "step": 2803
+    },
+    {
+      "epoch": 0.07736853178449112,
+      "grad_norm": 0.003882109420374036,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 2804
+    },
+    {
+      "epoch": 0.07739612398555548,
+      "grad_norm": 0.003275086637586355,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 2805
+    },
+    {
+      "epoch": 0.07742371618661985,
+      "grad_norm": 0.0032583875581622124,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 2806
+    },
+    {
+      "epoch": 0.07745130838768421,
+      "grad_norm": 0.0029904379043728113,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 2807
+    },
+    {
+      "epoch": 0.0774789005887486,
+      "grad_norm": 0.0035862699151039124,
+      "learning_rate": 0.001,
+      "loss": 0.3749,
+      "step": 2808
+    },
+    {
+      "epoch": 0.07750649278981296,
+      "grad_norm": 0.003321669064462185,
+      "learning_rate": 0.001,
+      "loss": 0.3679,
+      "step": 2809
+    },
+    {
+      "epoch": 0.07753408499087733,
+      "grad_norm": 0.002322467975318432,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 2810
+    },
+    {
+      "epoch": 0.0775616771919417,
+      "grad_norm": 0.0043731010518968105,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 2811
+    },
+    {
+      "epoch": 0.07758926939300606,
+      "grad_norm": 0.003245170461013913,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 2812
+    },
+    {
+      "epoch": 0.07761686159407044,
+      "grad_norm": 0.0024038052652031183,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 2813
+    },
+    {
+      "epoch": 0.07764445379513481,
+      "grad_norm": 0.0031776116229593754,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 2814
+    },
+    {
+      "epoch": 0.07767204599619917,
+      "grad_norm": 0.0029750748071819544,
+      "learning_rate": 0.001,
+      "loss": 0.4302,
+      "step": 2815
+    },
+    {
+      "epoch": 0.07769963819726354,
+      "grad_norm": 0.0033383795525878668,
+      "learning_rate": 0.001,
+      "loss": 0.4222,
+      "step": 2816
+    },
+    {
+      "epoch": 0.0777272303983279,
+      "grad_norm": 0.00494232214987278,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 2817
+    },
+    {
+      "epoch": 0.07775482259939229,
+      "grad_norm": 0.0031302745919674635,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 2818
+    },
+    {
+      "epoch": 0.07778241480045665,
+      "grad_norm": 0.0038369682151824236,
+      "learning_rate": 0.001,
+      "loss": 0.4356,
+      "step": 2819
+    },
+    {
+      "epoch": 0.07781000700152102,
+      "grad_norm": 0.00343205570243299,
+      "learning_rate": 0.001,
+      "loss": 0.4297,
+      "step": 2820
+    },
+    {
+      "epoch": 0.07783759920258539,
+      "grad_norm": 0.005272636190056801,
+      "learning_rate": 0.001,
+      "loss": 0.4319,
+      "step": 2821
+    },
+    {
+      "epoch": 0.07786519140364975,
+      "grad_norm": 0.006007963325828314,
+      "learning_rate": 0.001,
+      "loss": 0.3682,
+      "step": 2822
+    },
+    {
+      "epoch": 0.07789278360471413,
+      "grad_norm": 0.004388149362057447,
+      "learning_rate": 0.001,
+      "loss": 0.3658,
+      "step": 2823
+    },
+    {
+      "epoch": 0.0779203758057785,
+      "grad_norm": 0.006076582707464695,
+      "learning_rate": 0.001,
+      "loss": 0.3637,
+      "step": 2824
+    },
+    {
+      "epoch": 0.07794796800684287,
+      "grad_norm": 0.0026879182551056147,
+      "learning_rate": 0.001,
+      "loss": 0.4033,
+      "step": 2825
+    },
+    {
+      "epoch": 0.07797556020790723,
+      "grad_norm": 0.003195406636223197,
+      "learning_rate": 0.001,
+      "loss": 0.4289,
+      "step": 2826
+    },
+    {
+      "epoch": 0.0780031524089716,
+      "grad_norm": 0.003109849989414215,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 2827
+    },
+    {
+      "epoch": 0.07803074461003598,
+      "grad_norm": 0.004339446779340506,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 2828
+    },
+    {
+      "epoch": 0.07805833681110035,
+      "grad_norm": 0.003388351993635297,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 2829
+    },
+    {
+      "epoch": 0.07808592901216471,
+      "grad_norm": 0.0037827894557267427,
+      "learning_rate": 0.001,
+      "loss": 0.3721,
+      "step": 2830
+    },
+    {
+      "epoch": 0.07811352121322908,
+      "grad_norm": 0.006149903871119022,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 2831
+    },
+    {
+      "epoch": 0.07814111341429344,
+      "grad_norm": 0.0027368527371436357,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 2832
+    },
+    {
+      "epoch": 0.07816870561535783,
+      "grad_norm": 0.004948263522237539,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 2833
+    },
+    {
+      "epoch": 0.07819629781642219,
+      "grad_norm": 0.017221815884113312,
+      "learning_rate": 0.001,
+      "loss": 0.3695,
+      "step": 2834
+    },
+    {
+      "epoch": 0.07822389001748656,
+      "grad_norm": 0.004491179715842009,
+      "learning_rate": 0.001,
+      "loss": 0.4319,
+      "step": 2835
+    },
+    {
+      "epoch": 0.07825148221855092,
+      "grad_norm": 0.004400896839797497,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 2836
+    },
+    {
+      "epoch": 0.07827907441961529,
+      "grad_norm": 0.0028574098832905293,
+      "learning_rate": 0.001,
+      "loss": 0.4322,
+      "step": 2837
+    },
+    {
+      "epoch": 0.07830666662067967,
+      "grad_norm": 0.005395799409598112,
+      "learning_rate": 0.001,
+      "loss": 0.4221,
+      "step": 2838
+    },
+    {
+      "epoch": 0.07833425882174404,
+      "grad_norm": 0.004169132094830275,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 2839
+    },
+    {
+      "epoch": 0.0783618510228084,
+      "grad_norm": 0.0036933980882167816,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 2840
+    },
+    {
+      "epoch": 0.07838944322387277,
+      "grad_norm": 0.0037820383440703154,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 2841
+    },
+    {
+      "epoch": 0.07841703542493714,
+      "grad_norm": 0.00365295703522861,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 2842
+    },
+    {
+      "epoch": 0.07844462762600152,
+      "grad_norm": 0.0040397047996521,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 2843
+    },
+    {
+      "epoch": 0.07847221982706588,
+      "grad_norm": 0.0027921211440116167,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 2844
+    },
+    {
+      "epoch": 0.07849981202813025,
+      "grad_norm": 0.002542336704209447,
+      "learning_rate": 0.001,
+      "loss": 0.4587,
+      "step": 2845
+    },
+    {
+      "epoch": 0.07852740422919462,
+      "grad_norm": 0.0032813462894409895,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 2846
+    },
+    {
+      "epoch": 0.07855499643025898,
+      "grad_norm": 0.0026641942095011473,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 2847
+    },
+    {
+      "epoch": 0.07858258863132335,
+      "grad_norm": 0.0045136939734220505,
+      "learning_rate": 0.001,
+      "loss": 0.4269,
+      "step": 2848
+    },
+    {
+      "epoch": 0.07861018083238773,
+      "grad_norm": 0.003331273328512907,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 2849
+    },
+    {
+      "epoch": 0.0786377730334521,
+      "grad_norm": 0.0029903652612119913,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 2850
+    },
+    {
+      "epoch": 0.07866536523451646,
+      "grad_norm": 0.003270956454798579,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 2851
+    },
+    {
+      "epoch": 0.07869295743558083,
+      "grad_norm": 0.0025751839857548475,
+      "learning_rate": 0.001,
+      "loss": 0.3856,
+      "step": 2852
+    },
+    {
+      "epoch": 0.0787205496366452,
+      "grad_norm": 0.004237758927047253,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 2853
+    },
+    {
+      "epoch": 0.07874814183770958,
+      "grad_norm": 0.005102077033370733,
+      "learning_rate": 0.001,
+      "loss": 0.4498,
+      "step": 2854
+    },
+    {
+      "epoch": 0.07877573403877394,
+      "grad_norm": 0.003620270872488618,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 2855
+    },
+    {
+      "epoch": 0.07880332623983831,
+      "grad_norm": 0.003328984137624502,
+      "learning_rate": 0.001,
+      "loss": 0.3582,
+      "step": 2856
+    },
+    {
+      "epoch": 0.07883091844090268,
+      "grad_norm": 0.005346687976270914,
+      "learning_rate": 0.001,
+      "loss": 0.4406,
+      "step": 2857
+    },
+    {
+      "epoch": 0.07885851064196704,
+      "grad_norm": 0.002766883932054043,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 2858
+    },
+    {
+      "epoch": 0.07888610284303142,
+      "grad_norm": 0.0039212643168866634,
+      "learning_rate": 0.001,
+      "loss": 0.3732,
+      "step": 2859
+    },
+    {
+      "epoch": 0.07891369504409579,
+      "grad_norm": 0.00360241811722517,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 2860
+    },
+    {
+      "epoch": 0.07894128724516015,
+      "grad_norm": 0.009609011001884937,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 2861
+    },
+    {
+      "epoch": 0.07896887944622452,
+      "grad_norm": 0.004263875540345907,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 2862
+    },
+    {
+      "epoch": 0.07899647164728889,
+      "grad_norm": 0.003934512846171856,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 2863
+    },
+    {
+      "epoch": 0.07902406384835327,
+      "grad_norm": 0.0042722225189208984,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 2864
+    },
+    {
+      "epoch": 0.07905165604941763,
+      "grad_norm": 0.0036077832337468863,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 2865
+    },
+    {
+      "epoch": 0.079079248250482,
+      "grad_norm": 0.0027798449154943228,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 2866
+    },
+    {
+      "epoch": 0.07910684045154637,
+      "grad_norm": 0.0025270867627114058,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 2867
+    },
+    {
+      "epoch": 0.07913443265261073,
+      "grad_norm": 0.0037853806279599667,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 2868
+    },
+    {
+      "epoch": 0.07916202485367511,
+      "grad_norm": 0.004050467163324356,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 2869
+    },
+    {
+      "epoch": 0.07918961705473948,
+      "grad_norm": 0.002820851979777217,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 2870
+    },
+    {
+      "epoch": 0.07921720925580385,
+      "grad_norm": 0.007634916342794895,
+      "learning_rate": 0.001,
+      "loss": 0.376,
+      "step": 2871
+    },
+    {
+      "epoch": 0.07924480145686821,
+      "grad_norm": 0.004927767440676689,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 2872
+    },
+    {
+      "epoch": 0.07927239365793258,
+      "grad_norm": 0.00282577658072114,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 2873
+    },
+    {
+      "epoch": 0.07929998585899696,
+      "grad_norm": 0.0034970357082784176,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 2874
+    },
+    {
+      "epoch": 0.07932757806006133,
+      "grad_norm": 0.00328719150274992,
+      "learning_rate": 0.001,
+      "loss": 0.3795,
+      "step": 2875
+    },
+    {
+      "epoch": 0.0793551702611257,
+      "grad_norm": 0.0030249380506575108,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 2876
+    },
+    {
+      "epoch": 0.07938276246219006,
+      "grad_norm": 0.0026030796580016613,
+      "learning_rate": 0.001,
+      "loss": 0.4465,
+      "step": 2877
+    },
+    {
+      "epoch": 0.07941035466325443,
+      "grad_norm": 0.0026747360825538635,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 2878
+    },
+    {
+      "epoch": 0.0794379468643188,
+      "grad_norm": 0.0031104108784347773,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 2879
+    },
+    {
+      "epoch": 0.07946553906538317,
+      "grad_norm": 0.0029512427281588316,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 2880
+    },
+    {
+      "epoch": 0.07949313126644754,
+      "grad_norm": 0.0036311009898781776,
+      "learning_rate": 0.001,
+      "loss": 0.385,
+      "step": 2881
+    },
+    {
+      "epoch": 0.0795207234675119,
+      "grad_norm": 0.0028290299233049154,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 2882
+    },
+    {
+      "epoch": 0.07954831566857627,
+      "grad_norm": 0.0037825354374945164,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 2883
+    },
+    {
+      "epoch": 0.07957590786964065,
+      "grad_norm": 0.004042259883135557,
+      "learning_rate": 0.001,
+      "loss": 0.4346,
+      "step": 2884
+    },
+    {
+      "epoch": 0.07960350007070502,
+      "grad_norm": 0.0031280801631510258,
+      "learning_rate": 0.001,
+      "loss": 0.3606,
+      "step": 2885
+    },
+    {
+      "epoch": 0.07963109227176939,
+      "grad_norm": 0.003348682075738907,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 2886
+    },
+    {
+      "epoch": 0.07965868447283375,
+      "grad_norm": 0.003773730481043458,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 2887
+    },
+    {
+      "epoch": 0.07968627667389812,
+      "grad_norm": 0.0028589165303856134,
+      "learning_rate": 0.001,
+      "loss": 0.3763,
+      "step": 2888
+    },
+    {
+      "epoch": 0.0797138688749625,
+      "grad_norm": 0.003061666851863265,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 2889
+    },
+    {
+      "epoch": 0.07974146107602686,
+      "grad_norm": 0.002353568095713854,
+      "learning_rate": 0.001,
+      "loss": 0.3691,
+      "step": 2890
+    },
+    {
+      "epoch": 0.07976905327709123,
+      "grad_norm": 0.0026803589425981045,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 2891
+    },
+    {
+      "epoch": 0.0797966454781556,
+      "grad_norm": 0.0032909938599914312,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 2892
+    },
+    {
+      "epoch": 0.07982423767921996,
+      "grad_norm": 0.005034049041569233,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 2893
+    },
+    {
+      "epoch": 0.07985182988028433,
+      "grad_norm": 0.003058070782572031,
+      "learning_rate": 0.001,
+      "loss": 0.3646,
+      "step": 2894
+    },
+    {
+      "epoch": 0.07987942208134871,
+      "grad_norm": 0.0028637642972171307,
+      "learning_rate": 0.001,
+      "loss": 0.3609,
+      "step": 2895
+    },
+    {
+      "epoch": 0.07990701428241308,
+      "grad_norm": 0.004128556232899427,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 2896
+    },
+    {
+      "epoch": 0.07993460648347744,
+      "grad_norm": 0.0060408939607441425,
+      "learning_rate": 0.001,
+      "loss": 0.3351,
+      "step": 2897
+    },
+    {
+      "epoch": 0.07996219868454181,
+      "grad_norm": 0.0026880092918872833,
+      "learning_rate": 0.001,
+      "loss": 0.3596,
+      "step": 2898
+    },
+    {
+      "epoch": 0.07998979088560618,
+      "grad_norm": 0.00707295211032033,
+      "learning_rate": 0.001,
+      "loss": 0.3579,
+      "step": 2899
+    },
+    {
+      "epoch": 0.08001738308667056,
+      "grad_norm": 0.0043478901498019695,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 2900
+    },
+    {
+      "epoch": 0.08004497528773492,
+      "grad_norm": 0.0034000363666564226,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 2901
+    },
+    {
+      "epoch": 0.08007256748879929,
+      "grad_norm": 0.004060294013470411,
+      "learning_rate": 0.001,
+      "loss": 0.3805,
+      "step": 2902
+    },
+    {
+      "epoch": 0.08010015968986366,
+      "grad_norm": 0.0032872636802494526,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 2903
+    },
+    {
+      "epoch": 0.08012775189092802,
+      "grad_norm": 0.006337369792163372,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 2904
+    },
+    {
+      "epoch": 0.0801553440919924,
+      "grad_norm": 0.002969271270558238,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 2905
+    },
+    {
+      "epoch": 0.08018293629305677,
+      "grad_norm": 0.002581764478236437,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 2906
+    },
+    {
+      "epoch": 0.08021052849412114,
+      "grad_norm": 0.0030686580576002598,
+      "learning_rate": 0.001,
+      "loss": 0.4259,
+      "step": 2907
+    },
+    {
+      "epoch": 0.0802381206951855,
+      "grad_norm": 0.0032976726070046425,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 2908
+    },
+    {
+      "epoch": 0.08026571289624987,
+      "grad_norm": 0.0037495982833206654,
+      "learning_rate": 0.001,
+      "loss": 0.3674,
+      "step": 2909
+    },
+    {
+      "epoch": 0.08029330509731425,
+      "grad_norm": 0.0030835745856165886,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 2910
+    },
+    {
+      "epoch": 0.08032089729837862,
+      "grad_norm": 0.006115986034274101,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 2911
+    },
+    {
+      "epoch": 0.08034848949944298,
+      "grad_norm": 0.003554831026121974,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 2912
+    },
+    {
+      "epoch": 0.08037608170050735,
+      "grad_norm": 0.0033155917190015316,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 2913
+    },
+    {
+      "epoch": 0.08040367390157172,
+      "grad_norm": 0.0027983803302049637,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 2914
+    },
+    {
+      "epoch": 0.0804312661026361,
+      "grad_norm": 0.0023535455111414194,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 2915
+    },
+    {
+      "epoch": 0.08045885830370046,
+      "grad_norm": 0.002943321131169796,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 2916
+    },
+    {
+      "epoch": 0.08048645050476483,
+      "grad_norm": 0.00304407742805779,
+      "learning_rate": 0.001,
+      "loss": 0.372,
+      "step": 2917
+    },
+    {
+      "epoch": 0.0805140427058292,
+      "grad_norm": 0.0032863402739167213,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 2918
+    },
+    {
+      "epoch": 0.08054163490689356,
+      "grad_norm": 0.0027361640240997076,
+      "learning_rate": 0.001,
+      "loss": 0.3709,
+      "step": 2919
+    },
+    {
+      "epoch": 0.08056922710795794,
+      "grad_norm": 0.0026063849218189716,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 2920
+    },
+    {
+      "epoch": 0.08059681930902231,
+      "grad_norm": 0.0022036924492567778,
+      "learning_rate": 0.001,
+      "loss": 0.4303,
+      "step": 2921
+    },
+    {
+      "epoch": 0.08062441151008667,
+      "grad_norm": 0.0023288466036319733,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 2922
+    },
+    {
+      "epoch": 0.08065200371115104,
+      "grad_norm": 0.0029277384746819735,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 2923
+    },
+    {
+      "epoch": 0.08067959591221541,
+      "grad_norm": 0.0061630988493561745,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 2924
+    },
+    {
+      "epoch": 0.08070718811327979,
+      "grad_norm": 0.0035532654728740454,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 2925
+    },
+    {
+      "epoch": 0.08073478031434415,
+      "grad_norm": 0.0036336968187242746,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 2926
+    },
+    {
+      "epoch": 0.08076237251540852,
+      "grad_norm": 0.004034141544252634,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 2927
+    },
+    {
+      "epoch": 0.08078996471647289,
+      "grad_norm": 0.0049674310721457005,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 2928
+    },
+    {
+      "epoch": 0.08081755691753725,
+      "grad_norm": 0.00383196328766644,
+      "learning_rate": 0.001,
+      "loss": 0.4193,
+      "step": 2929
+    },
+    {
+      "epoch": 0.08084514911860163,
+      "grad_norm": 0.002637132303789258,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 2930
+    },
+    {
+      "epoch": 0.080872741319666,
+      "grad_norm": 0.004422449506819248,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 2931
+    },
+    {
+      "epoch": 0.08090033352073037,
+      "grad_norm": 0.005644423421472311,
+      "learning_rate": 0.001,
+      "loss": 0.3794,
+      "step": 2932
+    },
+    {
+      "epoch": 0.08092792572179473,
+      "grad_norm": 0.0029548918828368187,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 2933
+    },
+    {
+      "epoch": 0.0809555179228591,
+      "grad_norm": 0.0033653799910098314,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 2934
+    },
+    {
+      "epoch": 0.08098311012392348,
+      "grad_norm": 0.0336541123688221,
+      "learning_rate": 0.001,
+      "loss": 0.4278,
+      "step": 2935
+    },
+    {
+      "epoch": 0.08101070232498785,
+      "grad_norm": 0.005585819482803345,
+      "learning_rate": 0.001,
+      "loss": 0.3806,
+      "step": 2936
+    },
+    {
+      "epoch": 0.08103829452605221,
+      "grad_norm": 0.0023465941194444895,
+      "learning_rate": 0.001,
+      "loss": 0.462,
+      "step": 2937
+    },
+    {
+      "epoch": 0.08106588672711658,
+      "grad_norm": 0.003671723185107112,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 2938
+    },
+    {
+      "epoch": 0.08109347892818095,
+      "grad_norm": 0.002763295080512762,
+      "learning_rate": 0.001,
+      "loss": 0.3752,
+      "step": 2939
+    },
+    {
+      "epoch": 0.08112107112924533,
+      "grad_norm": 0.0028186712879687548,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 2940
+    },
+    {
+      "epoch": 0.08114866333030969,
+      "grad_norm": 0.0023679453879594803,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 2941
+    },
+    {
+      "epoch": 0.08117625553137406,
+      "grad_norm": 0.002650237875059247,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 2942
+    },
+    {
+      "epoch": 0.08120384773243843,
+      "grad_norm": 0.0028579425998032093,
+      "learning_rate": 0.001,
+      "loss": 0.421,
+      "step": 2943
+    },
+    {
+      "epoch": 0.08123143993350279,
+      "grad_norm": 0.003209290560334921,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 2944
+    },
+    {
+      "epoch": 0.08125903213456716,
+      "grad_norm": 0.0023669025395065546,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 2945
+    },
+    {
+      "epoch": 0.08128662433563154,
+      "grad_norm": 0.002830538898706436,
+      "learning_rate": 0.001,
+      "loss": 0.3586,
+      "step": 2946
+    },
+    {
+      "epoch": 0.0813142165366959,
+      "grad_norm": 0.0034998899791389704,
+      "learning_rate": 0.001,
+      "loss": 0.4271,
+      "step": 2947
+    },
+    {
+      "epoch": 0.08134180873776027,
+      "grad_norm": 0.003356503788381815,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 2948
+    },
+    {
+      "epoch": 0.08136940093882464,
+      "grad_norm": 0.0027026510797441006,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 2949
+    },
+    {
+      "epoch": 0.081396993139889,
+      "grad_norm": 0.0025660600513219833,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 2950
+    },
+    {
+      "epoch": 0.08142458534095338,
+      "grad_norm": 0.0029600337147712708,
+      "learning_rate": 0.001,
+      "loss": 0.4324,
+      "step": 2951
+    },
+    {
+      "epoch": 0.08145217754201775,
+      "grad_norm": 0.00301582389511168,
+      "learning_rate": 0.001,
+      "loss": 0.373,
+      "step": 2952
+    },
+    {
+      "epoch": 0.08147976974308212,
+      "grad_norm": 0.0027227008249610662,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 2953
+    },
+    {
+      "epoch": 0.08150736194414648,
+      "grad_norm": 0.004939099308103323,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 2954
+    },
+    {
+      "epoch": 0.08153495414521085,
+      "grad_norm": 0.0038117829244583845,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 2955
+    },
+    {
+      "epoch": 0.08156254634627523,
+      "grad_norm": 0.002671457827091217,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 2956
+    },
+    {
+      "epoch": 0.0815901385473396,
+      "grad_norm": 0.0051042488776147366,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 2957
+    },
+    {
+      "epoch": 0.08161773074840396,
+      "grad_norm": 0.013458254747092724,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 2958
+    },
+    {
+      "epoch": 0.08164532294946833,
+      "grad_norm": 0.004344523418694735,
+      "learning_rate": 0.001,
+      "loss": 0.374,
+      "step": 2959
+    },
+    {
+      "epoch": 0.0816729151505327,
+      "grad_norm": 0.003277859417721629,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 2960
+    },
+    {
+      "epoch": 0.08170050735159708,
+      "grad_norm": 0.007647217717021704,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 2961
+    },
+    {
+      "epoch": 0.08172809955266144,
+      "grad_norm": 0.0031640264205634594,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 2962
+    },
+    {
+      "epoch": 0.08175569175372581,
+      "grad_norm": 0.00352554302662611,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 2963
+    },
+    {
+      "epoch": 0.08178328395479018,
+      "grad_norm": 0.004366365727037191,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 2964
+    },
+    {
+      "epoch": 0.08181087615585454,
+      "grad_norm": 0.0043410733342170715,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 2965
+    },
+    {
+      "epoch": 0.08183846835691892,
+      "grad_norm": 0.0039422293193638325,
+      "learning_rate": 0.001,
+      "loss": 0.4293,
+      "step": 2966
+    },
+    {
+      "epoch": 0.08186606055798329,
+      "grad_norm": 0.0037924828939139843,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 2967
+    },
+    {
+      "epoch": 0.08189365275904766,
+      "grad_norm": 0.007280276622623205,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 2968
+    },
+    {
+      "epoch": 0.08192124496011202,
+      "grad_norm": 0.0051049706526100636,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 2969
+    },
+    {
+      "epoch": 0.08194883716117639,
+      "grad_norm": 0.003454606281593442,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 2970
+    },
+    {
+      "epoch": 0.08197642936224077,
+      "grad_norm": 0.0029868248384445906,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 2971
+    },
+    {
+      "epoch": 0.08200402156330514,
+      "grad_norm": 0.004341921303421259,
+      "learning_rate": 0.001,
+      "loss": 0.4353,
+      "step": 2972
+    },
+    {
+      "epoch": 0.0820316137643695,
+      "grad_norm": 0.003885595127940178,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 2973
+    },
+    {
+      "epoch": 0.08205920596543387,
+      "grad_norm": 0.0032196505926549435,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 2974
+    },
+    {
+      "epoch": 0.08208679816649823,
+      "grad_norm": 0.004508745390921831,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 2975
+    },
+    {
+      "epoch": 0.08211439036756261,
+      "grad_norm": 0.0032329261302948,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 2976
+    },
+    {
+      "epoch": 0.08214198256862698,
+      "grad_norm": 0.004983431659638882,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 2977
+    },
+    {
+      "epoch": 0.08216957476969135,
+      "grad_norm": 0.03812706470489502,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 2978
+    },
+    {
+      "epoch": 0.08219716697075571,
+      "grad_norm": 0.002776419511064887,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 2979
+    },
+    {
+      "epoch": 0.08222475917182008,
+      "grad_norm": 0.0030240516643971205,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 2980
+    },
+    {
+      "epoch": 0.08225235137288446,
+      "grad_norm": 0.0031225322745740414,
+      "learning_rate": 0.001,
+      "loss": 0.3519,
+      "step": 2981
+    },
+    {
+      "epoch": 0.08227994357394883,
+      "grad_norm": 0.002751345979049802,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 2982
+    },
+    {
+      "epoch": 0.0823075357750132,
+      "grad_norm": 0.003123503876850009,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 2983
+    },
+    {
+      "epoch": 0.08233512797607756,
+      "grad_norm": 0.004268140532076359,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 2984
+    },
+    {
+      "epoch": 0.08236272017714193,
+      "grad_norm": 0.002731149084866047,
+      "learning_rate": 0.001,
+      "loss": 0.4103,
+      "step": 2985
+    },
+    {
+      "epoch": 0.0823903123782063,
+      "grad_norm": 0.002880630549043417,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 2986
+    },
+    {
+      "epoch": 0.08241790457927067,
+      "grad_norm": 0.008699797093868256,
+      "learning_rate": 0.001,
+      "loss": 0.4313,
+      "step": 2987
+    },
+    {
+      "epoch": 0.08244549678033504,
+      "grad_norm": 0.0032247393392026424,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 2988
+    },
+    {
+      "epoch": 0.0824730889813994,
+      "grad_norm": 0.003280255477875471,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 2989
+    },
+    {
+      "epoch": 0.08250068118246377,
+      "grad_norm": 0.0024324634578078985,
+      "learning_rate": 0.001,
+      "loss": 0.4188,
+      "step": 2990
+    },
+    {
+      "epoch": 0.08252827338352814,
+      "grad_norm": 0.004013401456177235,
+      "learning_rate": 0.001,
+      "loss": 0.3778,
+      "step": 2991
+    },
+    {
+      "epoch": 0.08255586558459252,
+      "grad_norm": 0.0036929554771631956,
+      "learning_rate": 0.001,
+      "loss": 0.3752,
+      "step": 2992
+    },
+    {
+      "epoch": 0.08258345778565689,
+      "grad_norm": 0.003950448241084814,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 2993
+    },
+    {
+      "epoch": 0.08261104998672125,
+      "grad_norm": 0.004668880719691515,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 2994
+    },
+    {
+      "epoch": 0.08263864218778562,
+      "grad_norm": 0.002737791510298848,
+      "learning_rate": 0.001,
+      "loss": 0.4388,
+      "step": 2995
+    },
+    {
+      "epoch": 0.08266623438884999,
+      "grad_norm": 0.007926344871520996,
+      "learning_rate": 0.001,
+      "loss": 0.3542,
+      "step": 2996
+    },
+    {
+      "epoch": 0.08269382658991437,
+      "grad_norm": 0.003492021933197975,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 2997
+    },
+    {
+      "epoch": 0.08272141879097873,
+      "grad_norm": 0.0026433023158460855,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 2998
+    },
+    {
+      "epoch": 0.0827490109920431,
+      "grad_norm": 0.0029606178868561983,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 2999
+    },
+    {
+      "epoch": 0.08277660319310746,
+      "grad_norm": 0.00282181310467422,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 3000
+    },
+    {
+      "epoch": 0.08277660319310746,
+      "eval_runtime": 24.5847,
+      "eval_samples_per_second": 1.302,
+      "eval_steps_per_second": 0.163,
+      "step": 3000
+    },
+    {
+      "epoch": 0.08280419539417183,
+      "grad_norm": 0.004229205194860697,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 3001
+    },
+    {
+      "epoch": 0.08283178759523621,
+      "grad_norm": 0.002889542607590556,
+      "learning_rate": 0.001,
+      "loss": 0.4226,
+      "step": 3002
+    },
+    {
+      "epoch": 0.08285937979630058,
+      "grad_norm": 0.0026309038512408733,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 3003
+    },
+    {
+      "epoch": 0.08288697199736494,
+      "grad_norm": 0.002682015299797058,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 3004
+    },
+    {
+      "epoch": 0.08291456419842931,
+      "grad_norm": 0.0026974889915436506,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 3005
+    },
+    {
+      "epoch": 0.08294215639949368,
+      "grad_norm": 0.0023957311641424894,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 3006
+    },
+    {
+      "epoch": 0.08296974860055806,
+      "grad_norm": 0.004567582160234451,
+      "learning_rate": 0.001,
+      "loss": 0.4249,
+      "step": 3007
+    },
+    {
+      "epoch": 0.08299734080162242,
+      "grad_norm": 0.004289150703698397,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 3008
+    },
+    {
+      "epoch": 0.08302493300268679,
+      "grad_norm": 0.004455960355699062,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 3009
+    },
+    {
+      "epoch": 0.08305252520375116,
+      "grad_norm": 0.005996396765112877,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 3010
+    },
+    {
+      "epoch": 0.08308011740481552,
+      "grad_norm": 0.004294134210795164,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 3011
+    },
+    {
+      "epoch": 0.0831077096058799,
+      "grad_norm": 0.0030041569843888283,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 3012
+    },
+    {
+      "epoch": 0.08313530180694427,
+      "grad_norm": 0.0038454660680145025,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 3013
+    },
+    {
+      "epoch": 0.08316289400800864,
+      "grad_norm": 0.003388310084119439,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 3014
+    },
+    {
+      "epoch": 0.083190486209073,
+      "grad_norm": 0.003318838309496641,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 3015
+    },
+    {
+      "epoch": 0.08321807841013737,
+      "grad_norm": 0.002537375781685114,
+      "learning_rate": 0.001,
+      "loss": 0.4243,
+      "step": 3016
+    },
+    {
+      "epoch": 0.08324567061120175,
+      "grad_norm": 0.00318319583311677,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 3017
+    },
+    {
+      "epoch": 0.08327326281226612,
+      "grad_norm": 0.004292814992368221,
+      "learning_rate": 0.001,
+      "loss": 0.3454,
+      "step": 3018
+    },
+    {
+      "epoch": 0.08330085501333048,
+      "grad_norm": 0.0027642296627163887,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 3019
+    },
+    {
+      "epoch": 0.08332844721439485,
+      "grad_norm": 0.0034603141248226166,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 3020
+    },
+    {
+      "epoch": 0.08335603941545922,
+      "grad_norm": 0.006106778047978878,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 3021
+    },
+    {
+      "epoch": 0.0833836316165236,
+      "grad_norm": 0.002718998584896326,
+      "learning_rate": 0.001,
+      "loss": 0.4311,
+      "step": 3022
+    },
+    {
+      "epoch": 0.08341122381758796,
+      "grad_norm": 0.002778289606794715,
+      "learning_rate": 0.001,
+      "loss": 0.3634,
+      "step": 3023
+    },
+    {
+      "epoch": 0.08343881601865233,
+      "grad_norm": 0.002709881402552128,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 3024
+    },
+    {
+      "epoch": 0.0834664082197167,
+      "grad_norm": 0.004608054179698229,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 3025
+    },
+    {
+      "epoch": 0.08349400042078106,
+      "grad_norm": 0.0070383017882704735,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 3026
+    },
+    {
+      "epoch": 0.08352159262184544,
+      "grad_norm": 0.0029459171928465366,
+      "learning_rate": 0.001,
+      "loss": 0.3634,
+      "step": 3027
+    },
+    {
+      "epoch": 0.08354918482290981,
+      "grad_norm": 0.002476966939866543,
+      "learning_rate": 0.001,
+      "loss": 0.4168,
+      "step": 3028
+    },
+    {
+      "epoch": 0.08357677702397417,
+      "grad_norm": 0.0041242605075240135,
+      "learning_rate": 0.001,
+      "loss": 0.3643,
+      "step": 3029
+    },
+    {
+      "epoch": 0.08360436922503854,
+      "grad_norm": 0.0027966529596596956,
+      "learning_rate": 0.001,
+      "loss": 0.4453,
+      "step": 3030
+    },
+    {
+      "epoch": 0.08363196142610291,
+      "grad_norm": 0.0032828738912940025,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 3031
+    },
+    {
+      "epoch": 0.08365955362716729,
+      "grad_norm": 0.002733866684138775,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 3032
+    },
+    {
+      "epoch": 0.08368714582823165,
+      "grad_norm": 0.004932538606226444,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 3033
+    },
+    {
+      "epoch": 0.08371473802929602,
+      "grad_norm": 0.0026060608215630054,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 3034
+    },
+    {
+      "epoch": 0.08374233023036039,
+      "grad_norm": 0.0029096321668475866,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 3035
+    },
+    {
+      "epoch": 0.08376992243142475,
+      "grad_norm": 0.0027539224829524755,
+      "learning_rate": 0.001,
+      "loss": 0.4445,
+      "step": 3036
+    },
+    {
+      "epoch": 0.08379751463248912,
+      "grad_norm": 0.00449875695630908,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 3037
+    },
+    {
+      "epoch": 0.0838251068335535,
+      "grad_norm": 0.0045863245613873005,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 3038
+    },
+    {
+      "epoch": 0.08385269903461787,
+      "grad_norm": 0.004102183040231466,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 3039
+    },
+    {
+      "epoch": 0.08388029123568223,
+      "grad_norm": 0.003232220420613885,
+      "learning_rate": 0.001,
+      "loss": 0.3727,
+      "step": 3040
+    },
+    {
+      "epoch": 0.0839078834367466,
+      "grad_norm": 0.0032380821648985147,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 3041
+    },
+    {
+      "epoch": 0.08393547563781097,
+      "grad_norm": 0.0023660236038267612,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 3042
+    },
+    {
+      "epoch": 0.08396306783887535,
+      "grad_norm": 0.006015659775584936,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 3043
+    },
+    {
+      "epoch": 0.08399066003993971,
+      "grad_norm": 0.0027081884909421206,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 3044
+    },
+    {
+      "epoch": 0.08401825224100408,
+      "grad_norm": 0.002505830954760313,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 3045
+    },
+    {
+      "epoch": 0.08404584444206845,
+      "grad_norm": 0.002315351041033864,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 3046
+    },
+    {
+      "epoch": 0.08407343664313281,
+      "grad_norm": 0.003794366493821144,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 3047
+    },
+    {
+      "epoch": 0.08410102884419719,
+      "grad_norm": 0.006150163244456053,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 3048
+    },
+    {
+      "epoch": 0.08412862104526156,
+      "grad_norm": 0.00300444639287889,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 3049
+    },
+    {
+      "epoch": 0.08415621324632593,
+      "grad_norm": 0.0030877876561135054,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 3050
+    },
+    {
+      "epoch": 0.08418380544739029,
+      "grad_norm": 0.004085875116288662,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 3051
+    },
+    {
+      "epoch": 0.08421139764845466,
+      "grad_norm": 0.0042649428360164165,
+      "learning_rate": 0.001,
+      "loss": 0.3727,
+      "step": 3052
+    },
+    {
+      "epoch": 0.08423898984951904,
+      "grad_norm": 0.002339342376217246,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 3053
+    },
+    {
+      "epoch": 0.0842665820505834,
+      "grad_norm": 0.0029005780816078186,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 3054
+    },
+    {
+      "epoch": 0.08429417425164777,
+      "grad_norm": 0.002606838010251522,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 3055
+    },
+    {
+      "epoch": 0.08432176645271214,
+      "grad_norm": 0.002856641774997115,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 3056
+    },
+    {
+      "epoch": 0.0843493586537765,
+      "grad_norm": 0.021601708605885506,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 3057
+    },
+    {
+      "epoch": 0.08437695085484088,
+      "grad_norm": 0.002950535388663411,
+      "learning_rate": 0.001,
+      "loss": 0.4361,
+      "step": 3058
+    },
+    {
+      "epoch": 0.08440454305590525,
+      "grad_norm": 0.0030242663342505693,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 3059
+    },
+    {
+      "epoch": 0.08443213525696962,
+      "grad_norm": 0.002619365695863962,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 3060
+    },
+    {
+      "epoch": 0.08445972745803398,
+      "grad_norm": 0.0032321936450898647,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 3061
+    },
+    {
+      "epoch": 0.08448731965909835,
+      "grad_norm": 0.0033744669053703547,
+      "learning_rate": 0.001,
+      "loss": 0.3652,
+      "step": 3062
+    },
+    {
+      "epoch": 0.08451491186016273,
+      "grad_norm": 0.0027905574534088373,
+      "learning_rate": 0.001,
+      "loss": 0.3748,
+      "step": 3063
+    },
+    {
+      "epoch": 0.0845425040612271,
+      "grad_norm": 0.0028084227815270424,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 3064
+    },
+    {
+      "epoch": 0.08457009626229146,
+      "grad_norm": 0.0034166318364441395,
+      "learning_rate": 0.001,
+      "loss": 0.3733,
+      "step": 3065
+    },
+    {
+      "epoch": 0.08459768846335583,
+      "grad_norm": 0.0029833640437573195,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 3066
+    },
+    {
+      "epoch": 0.0846252806644202,
+      "grad_norm": 0.002973797731101513,
+      "learning_rate": 0.001,
+      "loss": 0.4345,
+      "step": 3067
+    },
+    {
+      "epoch": 0.08465287286548458,
+      "grad_norm": 0.0027056816034018993,
+      "learning_rate": 0.001,
+      "loss": 0.3766,
+      "step": 3068
+    },
+    {
+      "epoch": 0.08468046506654894,
+      "grad_norm": 0.008972891606390476,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 3069
+    },
+    {
+      "epoch": 0.08470805726761331,
+      "grad_norm": 0.003379418281838298,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 3070
+    },
+    {
+      "epoch": 0.08473564946867768,
+      "grad_norm": 0.004911572206765413,
+      "learning_rate": 0.001,
+      "loss": 0.3753,
+      "step": 3071
+    },
+    {
+      "epoch": 0.08476324166974204,
+      "grad_norm": 0.003271787893027067,
+      "learning_rate": 0.001,
+      "loss": 0.4428,
+      "step": 3072
+    },
+    {
+      "epoch": 0.08479083387080642,
+      "grad_norm": 0.0034387686755508184,
+      "learning_rate": 0.001,
+      "loss": 0.3708,
+      "step": 3073
+    },
+    {
+      "epoch": 0.08481842607187079,
+      "grad_norm": 0.004575818777084351,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 3074
+    },
+    {
+      "epoch": 0.08484601827293516,
+      "grad_norm": 0.040848325937986374,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 3075
+    },
+    {
+      "epoch": 0.08487361047399952,
+      "grad_norm": 0.009725205600261688,
+      "learning_rate": 0.001,
+      "loss": 0.3652,
+      "step": 3076
+    },
+    {
+      "epoch": 0.08490120267506389,
+      "grad_norm": 0.008186236955225468,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 3077
+    },
+    {
+      "epoch": 0.08492879487612827,
+      "grad_norm": 0.0035197827965021133,
+      "learning_rate": 0.001,
+      "loss": 0.445,
+      "step": 3078
+    },
+    {
+      "epoch": 0.08495638707719264,
+      "grad_norm": 0.002530729863792658,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 3079
+    },
+    {
+      "epoch": 0.084983979278257,
+      "grad_norm": 0.002124677412211895,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 3080
+    },
+    {
+      "epoch": 0.08501157147932137,
+      "grad_norm": 0.0033621720504015684,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 3081
+    },
+    {
+      "epoch": 0.08503916368038573,
+      "grad_norm": 0.003055825363844633,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 3082
+    },
+    {
+      "epoch": 0.0850667558814501,
+      "grad_norm": 0.002876532031223178,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 3083
+    },
+    {
+      "epoch": 0.08509434808251448,
+      "grad_norm": 0.0029174459632486105,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 3084
+    },
+    {
+      "epoch": 0.08512194028357885,
+      "grad_norm": 0.002031163312494755,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 3085
+    },
+    {
+      "epoch": 0.08514953248464321,
+      "grad_norm": 0.003064326476305723,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 3086
+    },
+    {
+      "epoch": 0.08517712468570758,
+      "grad_norm": 0.003118072636425495,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 3087
+    },
+    {
+      "epoch": 0.08520471688677195,
+      "grad_norm": 0.0032082500401884317,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 3088
+    },
+    {
+      "epoch": 0.08523230908783633,
+      "grad_norm": 0.005306860897690058,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 3089
+    },
+    {
+      "epoch": 0.0852599012889007,
+      "grad_norm": 0.004819758236408234,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 3090
+    },
+    {
+      "epoch": 0.08528749348996506,
+      "grad_norm": 0.003489856142550707,
+      "learning_rate": 0.001,
+      "loss": 0.3821,
+      "step": 3091
+    },
+    {
+      "epoch": 0.08531508569102943,
+      "grad_norm": 0.002650489332154393,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 3092
+    },
+    {
+      "epoch": 0.0853426778920938,
+      "grad_norm": 0.006152056623250246,
+      "learning_rate": 0.001,
+      "loss": 0.3641,
+      "step": 3093
+    },
+    {
+      "epoch": 0.08537027009315817,
+      "grad_norm": 0.002845000009983778,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 3094
+    },
+    {
+      "epoch": 0.08539786229422254,
+      "grad_norm": 0.012226511724293232,
+      "learning_rate": 0.001,
+      "loss": 0.3805,
+      "step": 3095
+    },
+    {
+      "epoch": 0.0854254544952869,
+      "grad_norm": 0.003854371840134263,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 3096
+    },
+    {
+      "epoch": 0.08545304669635127,
+      "grad_norm": 0.0025693075731396675,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 3097
+    },
+    {
+      "epoch": 0.08548063889741564,
+      "grad_norm": 0.006640615873038769,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 3098
+    },
+    {
+      "epoch": 0.08550823109848002,
+      "grad_norm": 0.006392517127096653,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 3099
+    },
+    {
+      "epoch": 0.08553582329954439,
+      "grad_norm": 0.004845835268497467,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 3100
+    },
+    {
+      "epoch": 0.08556341550060875,
+      "grad_norm": 0.005125217605382204,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 3101
+    },
+    {
+      "epoch": 0.08559100770167312,
+      "grad_norm": 0.002894837176427245,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 3102
+    },
+    {
+      "epoch": 0.08561859990273749,
+      "grad_norm": 0.0028156936168670654,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 3103
+    },
+    {
+      "epoch": 0.08564619210380187,
+      "grad_norm": 0.0022546479012817144,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 3104
+    },
+    {
+      "epoch": 0.08567378430486623,
+      "grad_norm": 0.0024128523655235767,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 3105
+    },
+    {
+      "epoch": 0.0857013765059306,
+      "grad_norm": 0.0024307817220687866,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 3106
+    },
+    {
+      "epoch": 0.08572896870699497,
+      "grad_norm": 0.0030128727667033672,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 3107
+    },
+    {
+      "epoch": 0.08575656090805933,
+      "grad_norm": 0.00243613263592124,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 3108
+    },
+    {
+      "epoch": 0.08578415310912371,
+      "grad_norm": 0.0021383436396718025,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 3109
+    },
+    {
+      "epoch": 0.08581174531018808,
+      "grad_norm": 0.003478130092844367,
+      "learning_rate": 0.001,
+      "loss": 0.4161,
+      "step": 3110
+    },
+    {
+      "epoch": 0.08583933751125244,
+      "grad_norm": 0.0026314982678741217,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 3111
+    },
+    {
+      "epoch": 0.08586692971231681,
+      "grad_norm": 0.0033548614010214806,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 3112
+    },
+    {
+      "epoch": 0.08589452191338118,
+      "grad_norm": 0.0054088556207716465,
+      "learning_rate": 0.001,
+      "loss": 0.3519,
+      "step": 3113
+    },
+    {
+      "epoch": 0.08592211411444556,
+      "grad_norm": 0.0038994327187538147,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 3114
+    },
+    {
+      "epoch": 0.08594970631550992,
+      "grad_norm": 0.0026842011138796806,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 3115
+    },
+    {
+      "epoch": 0.08597729851657429,
+      "grad_norm": 0.0026092708576470613,
+      "learning_rate": 0.001,
+      "loss": 0.431,
+      "step": 3116
+    },
+    {
+      "epoch": 0.08600489071763866,
+      "grad_norm": 0.004279262386262417,
+      "learning_rate": 0.001,
+      "loss": 0.3745,
+      "step": 3117
+    },
+    {
+      "epoch": 0.08603248291870302,
+      "grad_norm": 0.0036713224835693836,
+      "learning_rate": 0.001,
+      "loss": 0.3371,
+      "step": 3118
+    },
+    {
+      "epoch": 0.0860600751197674,
+      "grad_norm": 0.004069841001182795,
+      "learning_rate": 0.001,
+      "loss": 0.3678,
+      "step": 3119
+    },
+    {
+      "epoch": 0.08608766732083177,
+      "grad_norm": 0.003785800887271762,
+      "learning_rate": 0.001,
+      "loss": 0.3646,
+      "step": 3120
+    },
+    {
+      "epoch": 0.08611525952189614,
+      "grad_norm": 0.003927123267203569,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 3121
+    },
+    {
+      "epoch": 0.0861428517229605,
+      "grad_norm": 0.004813566338270903,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 3122
+    },
+    {
+      "epoch": 0.08617044392402487,
+      "grad_norm": 0.002674676012247801,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 3123
+    },
+    {
+      "epoch": 0.08619803612508925,
+      "grad_norm": 0.0027380958199501038,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 3124
+    },
+    {
+      "epoch": 0.08622562832615362,
+      "grad_norm": 0.006593447644263506,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 3125
+    },
+    {
+      "epoch": 0.08625322052721798,
+      "grad_norm": 0.005558252800256014,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 3126
+    },
+    {
+      "epoch": 0.08628081272828235,
+      "grad_norm": 0.003584665711969137,
+      "learning_rate": 0.001,
+      "loss": 0.3668,
+      "step": 3127
+    },
+    {
+      "epoch": 0.08630840492934672,
+      "grad_norm": 0.004839813802391291,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 3128
+    },
+    {
+      "epoch": 0.08633599713041108,
+      "grad_norm": 0.0030945283360779285,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 3129
+    },
+    {
+      "epoch": 0.08636358933147546,
+      "grad_norm": 0.005693027749657631,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 3130
+    },
+    {
+      "epoch": 0.08639118153253983,
+      "grad_norm": 0.0034251015167683363,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 3131
+    },
+    {
+      "epoch": 0.0864187737336042,
+      "grad_norm": 0.0030861585400998592,
+      "learning_rate": 0.001,
+      "loss": 0.4367,
+      "step": 3132
+    },
+    {
+      "epoch": 0.08644636593466856,
+      "grad_norm": 0.0027904235757887363,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 3133
+    },
+    {
+      "epoch": 0.08647395813573293,
+      "grad_norm": 0.003176966914907098,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 3134
+    },
+    {
+      "epoch": 0.08650155033679731,
+      "grad_norm": 0.007988881319761276,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 3135
+    },
+    {
+      "epoch": 0.08652914253786168,
+      "grad_norm": 0.0030314160976558924,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 3136
+    },
+    {
+      "epoch": 0.08655673473892604,
+      "grad_norm": 0.0032279801089316607,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 3137
+    },
+    {
+      "epoch": 0.08658432693999041,
+      "grad_norm": 0.0031009484082460403,
+      "learning_rate": 0.001,
+      "loss": 0.3807,
+      "step": 3138
+    },
+    {
+      "epoch": 0.08661191914105477,
+      "grad_norm": 0.0032069345470517874,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 3139
+    },
+    {
+      "epoch": 0.08663951134211915,
+      "grad_norm": 0.005810491740703583,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 3140
+    },
+    {
+      "epoch": 0.08666710354318352,
+      "grad_norm": 0.003494016360491514,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 3141
+    },
+    {
+      "epoch": 0.08669469574424789,
+      "grad_norm": 0.0038405335508286953,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 3142
+    },
+    {
+      "epoch": 0.08672228794531225,
+      "grad_norm": 0.03096974827349186,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 3143
+    },
+    {
+      "epoch": 0.08674988014637662,
+      "grad_norm": 0.007581043988466263,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 3144
+    },
+    {
+      "epoch": 0.086777472347441,
+      "grad_norm": 0.00279928813688457,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 3145
+    },
+    {
+      "epoch": 0.08680506454850537,
+      "grad_norm": 0.003512721508741379,
+      "learning_rate": 0.001,
+      "loss": 0.3581,
+      "step": 3146
+    },
+    {
+      "epoch": 0.08683265674956973,
+      "grad_norm": 0.0022023352794349194,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 3147
+    },
+    {
+      "epoch": 0.0868602489506341,
+      "grad_norm": 0.0022207568399608135,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 3148
+    },
+    {
+      "epoch": 0.08688784115169847,
+      "grad_norm": 0.003400396555662155,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 3149
+    },
+    {
+      "epoch": 0.08691543335276285,
+      "grad_norm": 0.0031312189530581236,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 3150
+    },
+    {
+      "epoch": 0.08694302555382721,
+      "grad_norm": 0.0022443707566708326,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 3151
+    },
+    {
+      "epoch": 0.08697061775489158,
+      "grad_norm": 0.0031315688975155354,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 3152
+    },
+    {
+      "epoch": 0.08699820995595595,
+      "grad_norm": 0.0029569200705736876,
+      "learning_rate": 0.001,
+      "loss": 0.4351,
+      "step": 3153
+    },
+    {
+      "epoch": 0.08702580215702031,
+      "grad_norm": 0.0045159622095525265,
+      "learning_rate": 0.001,
+      "loss": 0.3658,
+      "step": 3154
+    },
+    {
+      "epoch": 0.08705339435808469,
+      "grad_norm": 0.0024900075513869524,
+      "learning_rate": 0.001,
+      "loss": 0.4371,
+      "step": 3155
+    },
+    {
+      "epoch": 0.08708098655914906,
+      "grad_norm": 0.002761411713436246,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 3156
+    },
+    {
+      "epoch": 0.08710857876021343,
+      "grad_norm": 0.005542066879570484,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 3157
+    },
+    {
+      "epoch": 0.08713617096127779,
+      "grad_norm": 0.0027586561627686024,
+      "learning_rate": 0.001,
+      "loss": 0.3794,
+      "step": 3158
+    },
+    {
+      "epoch": 0.08716376316234216,
+      "grad_norm": 0.0025526464451104403,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 3159
+    },
+    {
+      "epoch": 0.08719135536340654,
+      "grad_norm": 0.0028663338162004948,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 3160
+    },
+    {
+      "epoch": 0.0872189475644709,
+      "grad_norm": 0.0021977245341986418,
+      "learning_rate": 0.001,
+      "loss": 0.3703,
+      "step": 3161
+    },
+    {
+      "epoch": 0.08724653976553527,
+      "grad_norm": 0.003095234278589487,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 3162
+    },
+    {
+      "epoch": 0.08727413196659964,
+      "grad_norm": 0.002372263465076685,
+      "learning_rate": 0.001,
+      "loss": 0.4313,
+      "step": 3163
+    },
+    {
+      "epoch": 0.087301724167664,
+      "grad_norm": 0.0028856871649622917,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 3164
+    },
+    {
+      "epoch": 0.08732931636872839,
+      "grad_norm": 0.006369113922119141,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 3165
+    },
+    {
+      "epoch": 0.08735690856979275,
+      "grad_norm": 0.00523237232118845,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 3166
+    },
+    {
+      "epoch": 0.08738450077085712,
+      "grad_norm": 0.0037964817602187395,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 3167
+    },
+    {
+      "epoch": 0.08741209297192148,
+      "grad_norm": 0.004853063262999058,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 3168
+    },
+    {
+      "epoch": 0.08743968517298585,
+      "grad_norm": 0.0025882385671138763,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 3169
+    },
+    {
+      "epoch": 0.08746727737405023,
+      "grad_norm": 0.002228371100500226,
+      "learning_rate": 0.001,
+      "loss": 0.468,
+      "step": 3170
+    },
+    {
+      "epoch": 0.0874948695751146,
+      "grad_norm": 0.003534890478476882,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 3171
+    },
+    {
+      "epoch": 0.08752246177617896,
+      "grad_norm": 0.002975932788103819,
+      "learning_rate": 0.001,
+      "loss": 0.3763,
+      "step": 3172
+    },
+    {
+      "epoch": 0.08755005397724333,
+      "grad_norm": 0.0035217402037233114,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 3173
+    },
+    {
+      "epoch": 0.0875776461783077,
+      "grad_norm": 0.002465012250468135,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 3174
+    },
+    {
+      "epoch": 0.08760523837937208,
+      "grad_norm": 0.003945467062294483,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 3175
+    },
+    {
+      "epoch": 0.08763283058043644,
+      "grad_norm": 0.003953505773097277,
+      "learning_rate": 0.001,
+      "loss": 0.4273,
+      "step": 3176
+    },
+    {
+      "epoch": 0.08766042278150081,
+      "grad_norm": 0.004156408831477165,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 3177
+    },
+    {
+      "epoch": 0.08768801498256518,
+      "grad_norm": 0.0024448104668408632,
+      "learning_rate": 0.001,
+      "loss": 0.4369,
+      "step": 3178
+    },
+    {
+      "epoch": 0.08771560718362954,
+      "grad_norm": 0.0024597158189862967,
+      "learning_rate": 0.001,
+      "loss": 0.425,
+      "step": 3179
+    },
+    {
+      "epoch": 0.08774319938469391,
+      "grad_norm": 0.003507564775645733,
+      "learning_rate": 0.001,
+      "loss": 0.364,
+      "step": 3180
+    },
+    {
+      "epoch": 0.08777079158575829,
+      "grad_norm": 0.002295244485139847,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 3181
+    },
+    {
+      "epoch": 0.08779838378682266,
+      "grad_norm": 0.0022501791827380657,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 3182
+    },
+    {
+      "epoch": 0.08782597598788702,
+      "grad_norm": 0.0027677484322339296,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 3183
+    },
+    {
+      "epoch": 0.08785356818895139,
+      "grad_norm": 0.0029434715397655964,
+      "learning_rate": 0.001,
+      "loss": 0.3447,
+      "step": 3184
+    },
+    {
+      "epoch": 0.08788116039001576,
+      "grad_norm": 0.0024778309743851423,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 3185
+    },
+    {
+      "epoch": 0.08790875259108014,
+      "grad_norm": 0.0023013644386082888,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 3186
+    },
+    {
+      "epoch": 0.0879363447921445,
+      "grad_norm": 0.0019937290344387293,
+      "learning_rate": 0.001,
+      "loss": 0.433,
+      "step": 3187
+    },
+    {
+      "epoch": 0.08796393699320887,
+      "grad_norm": 0.009529836475849152,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 3188
+    },
+    {
+      "epoch": 0.08799152919427324,
+      "grad_norm": 0.0029238457791507244,
+      "learning_rate": 0.001,
+      "loss": 0.3719,
+      "step": 3189
+    },
+    {
+      "epoch": 0.0880191213953376,
+      "grad_norm": 0.0023196239490062,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 3190
+    },
+    {
+      "epoch": 0.08804671359640198,
+      "grad_norm": 0.0024587539955973625,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 3191
+    },
+    {
+      "epoch": 0.08807430579746635,
+      "grad_norm": 0.004239838104695082,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 3192
+    },
+    {
+      "epoch": 0.08810189799853071,
+      "grad_norm": 0.003386555938050151,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 3193
+    },
+    {
+      "epoch": 0.08812949019959508,
+      "grad_norm": 0.0025654241908341646,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 3194
+    },
+    {
+      "epoch": 0.08815708240065945,
+      "grad_norm": 0.002837128471583128,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 3195
+    },
+    {
+      "epoch": 0.08818467460172383,
+      "grad_norm": 0.0034069865942001343,
+      "learning_rate": 0.001,
+      "loss": 0.3413,
+      "step": 3196
+    },
+    {
+      "epoch": 0.0882122668027882,
+      "grad_norm": 0.002375151729211211,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 3197
+    },
+    {
+      "epoch": 0.08823985900385256,
+      "grad_norm": 0.003615601221099496,
+      "learning_rate": 0.001,
+      "loss": 0.4382,
+      "step": 3198
+    },
+    {
+      "epoch": 0.08826745120491693,
+      "grad_norm": 0.0048666223883628845,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 3199
+    },
+    {
+      "epoch": 0.0882950434059813,
+      "grad_norm": 0.0025259742978960276,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 3200
+    },
+    {
+      "epoch": 0.08832263560704567,
+      "grad_norm": 0.006654439959675074,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 3201
+    },
+    {
+      "epoch": 0.08835022780811004,
+      "grad_norm": 0.002633353229612112,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 3202
+    },
+    {
+      "epoch": 0.08837782000917441,
+      "grad_norm": 0.0028525053057819605,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 3203
+    },
+    {
+      "epoch": 0.08840541221023877,
+      "grad_norm": 0.0071784635074436665,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 3204
+    },
+    {
+      "epoch": 0.08843300441130314,
+      "grad_norm": 0.0030664210207760334,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 3205
+    },
+    {
+      "epoch": 0.08846059661236752,
+      "grad_norm": 0.0034981323406100273,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 3206
+    },
+    {
+      "epoch": 0.08848818881343189,
+      "grad_norm": 0.006974066607654095,
+      "learning_rate": 0.001,
+      "loss": 0.458,
+      "step": 3207
+    },
+    {
+      "epoch": 0.08851578101449625,
+      "grad_norm": 0.002912199590355158,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 3208
+    },
+    {
+      "epoch": 0.08854337321556062,
+      "grad_norm": 0.0036544064059853554,
+      "learning_rate": 0.001,
+      "loss": 0.4617,
+      "step": 3209
+    },
+    {
+      "epoch": 0.08857096541662499,
+      "grad_norm": 0.002682635560631752,
+      "learning_rate": 0.001,
+      "loss": 0.3695,
+      "step": 3210
+    },
+    {
+      "epoch": 0.08859855761768937,
+      "grad_norm": 0.0032552520278841257,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 3211
+    },
+    {
+      "epoch": 0.08862614981875373,
+      "grad_norm": 0.003252149559557438,
+      "learning_rate": 0.001,
+      "loss": 0.425,
+      "step": 3212
+    },
+    {
+      "epoch": 0.0886537420198181,
+      "grad_norm": 0.004062923137098551,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 3213
+    },
+    {
+      "epoch": 0.08868133422088247,
+      "grad_norm": 0.0034245557617396116,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 3214
+    },
+    {
+      "epoch": 0.08870892642194683,
+      "grad_norm": 0.00209795287810266,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 3215
+    },
+    {
+      "epoch": 0.08873651862301121,
+      "grad_norm": 0.002493221778422594,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 3216
+    },
+    {
+      "epoch": 0.08876411082407558,
+      "grad_norm": 0.0032907910645008087,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 3217
+    },
+    {
+      "epoch": 0.08879170302513995,
+      "grad_norm": 0.0022613955661654472,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 3218
+    },
+    {
+      "epoch": 0.08881929522620431,
+      "grad_norm": 0.0027464418672025204,
+      "learning_rate": 0.001,
+      "loss": 0.3534,
+      "step": 3219
+    },
+    {
+      "epoch": 0.08884688742726868,
+      "grad_norm": 0.0038859571795910597,
+      "learning_rate": 0.001,
+      "loss": 0.3634,
+      "step": 3220
+    },
+    {
+      "epoch": 0.08887447962833306,
+      "grad_norm": 0.002589226933196187,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 3221
+    },
+    {
+      "epoch": 0.08890207182939742,
+      "grad_norm": 0.003947058692574501,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 3222
+    },
+    {
+      "epoch": 0.08892966403046179,
+      "grad_norm": 0.0034198507200926542,
+      "learning_rate": 0.001,
+      "loss": 0.3794,
+      "step": 3223
+    },
+    {
+      "epoch": 0.08895725623152616,
+      "grad_norm": 0.0028758652042597532,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 3224
+    },
+    {
+      "epoch": 0.08898484843259052,
+      "grad_norm": 0.002941399347037077,
+      "learning_rate": 0.001,
+      "loss": 0.4374,
+      "step": 3225
+    },
+    {
+      "epoch": 0.08901244063365489,
+      "grad_norm": 0.0027777596842497587,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 3226
+    },
+    {
+      "epoch": 0.08904003283471927,
+      "grad_norm": 0.00824811402708292,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 3227
+    },
+    {
+      "epoch": 0.08906762503578364,
+      "grad_norm": 0.002942741382867098,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 3228
+    },
+    {
+      "epoch": 0.089095217236848,
+      "grad_norm": 0.0034886416979134083,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 3229
+    },
+    {
+      "epoch": 0.08912280943791237,
+      "grad_norm": 0.0034925418440252542,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 3230
+    },
+    {
+      "epoch": 0.08915040163897674,
+      "grad_norm": 0.0031590645667165518,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 3231
+    },
+    {
+      "epoch": 0.08917799384004112,
+      "grad_norm": 0.00394233874976635,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 3232
+    },
+    {
+      "epoch": 0.08920558604110548,
+      "grad_norm": 0.002954542636871338,
+      "learning_rate": 0.001,
+      "loss": 0.4271,
+      "step": 3233
+    },
+    {
+      "epoch": 0.08923317824216985,
+      "grad_norm": 0.0020198922138661146,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 3234
+    },
+    {
+      "epoch": 0.08926077044323422,
+      "grad_norm": 0.0032049540895968676,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 3235
+    },
+    {
+      "epoch": 0.08928836264429858,
+      "grad_norm": 0.0024705370888113976,
+      "learning_rate": 0.001,
+      "loss": 0.421,
+      "step": 3236
+    },
+    {
+      "epoch": 0.08931595484536296,
+      "grad_norm": 0.0023976247757673264,
+      "learning_rate": 0.001,
+      "loss": 0.4301,
+      "step": 3237
+    },
+    {
+      "epoch": 0.08934354704642733,
+      "grad_norm": 0.0025141413789242506,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 3238
+    },
+    {
+      "epoch": 0.0893711392474917,
+      "grad_norm": 0.0034452369436621666,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 3239
+    },
+    {
+      "epoch": 0.08939873144855606,
+      "grad_norm": 0.0036900045815855265,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 3240
+    },
+    {
+      "epoch": 0.08942632364962043,
+      "grad_norm": 0.0030278146732598543,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 3241
+    },
+    {
+      "epoch": 0.08945391585068481,
+      "grad_norm": 0.0027707985136657953,
+      "learning_rate": 0.001,
+      "loss": 0.3721,
+      "step": 3242
+    },
+    {
+      "epoch": 0.08948150805174918,
+      "grad_norm": 0.0024415196385234594,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 3243
+    },
+    {
+      "epoch": 0.08950910025281354,
+      "grad_norm": 0.0054499804973602295,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 3244
+    },
+    {
+      "epoch": 0.08953669245387791,
+      "grad_norm": 0.0034079072065651417,
+      "learning_rate": 0.001,
+      "loss": 0.3607,
+      "step": 3245
+    },
+    {
+      "epoch": 0.08956428465494227,
+      "grad_norm": 0.0038858712650835514,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 3246
+    },
+    {
+      "epoch": 0.08959187685600666,
+      "grad_norm": 0.007187449838966131,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 3247
+    },
+    {
+      "epoch": 0.08961946905707102,
+      "grad_norm": 0.0033226367086172104,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 3248
+    },
+    {
+      "epoch": 0.08964706125813539,
+      "grad_norm": 0.0028839895967394114,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 3249
+    },
+    {
+      "epoch": 0.08967465345919975,
+      "grad_norm": 0.0032480424270033836,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 3250
+    },
+    {
+      "epoch": 0.08970224566026412,
+      "grad_norm": 0.004398911260068417,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 3251
+    },
+    {
+      "epoch": 0.0897298378613285,
+      "grad_norm": 0.00363975390791893,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 3252
+    },
+    {
+      "epoch": 0.08975743006239287,
+      "grad_norm": 0.008247625082731247,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 3253
+    },
+    {
+      "epoch": 0.08978502226345723,
+      "grad_norm": 0.003181576495990157,
+      "learning_rate": 0.001,
+      "loss": 0.3846,
+      "step": 3254
+    },
+    {
+      "epoch": 0.0898126144645216,
+      "grad_norm": 0.005014390219002962,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 3255
+    },
+    {
+      "epoch": 0.08984020666558597,
+      "grad_norm": 0.0027002303395420313,
+      "learning_rate": 0.001,
+      "loss": 0.3703,
+      "step": 3256
+    },
+    {
+      "epoch": 0.08986779886665035,
+      "grad_norm": 0.0028025219216942787,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 3257
+    },
+    {
+      "epoch": 0.08989539106771471,
+      "grad_norm": 0.0026610197965055704,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 3258
+    },
+    {
+      "epoch": 0.08992298326877908,
+      "grad_norm": 0.003577177645638585,
+      "learning_rate": 0.001,
+      "loss": 0.3467,
+      "step": 3259
+    },
+    {
+      "epoch": 0.08995057546984345,
+      "grad_norm": 0.0036113469395786524,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 3260
+    },
+    {
+      "epoch": 0.08997816767090781,
+      "grad_norm": 0.0033262385986745358,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 3261
+    },
+    {
+      "epoch": 0.0900057598719722,
+      "grad_norm": 0.0021796945948153734,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 3262
+    },
+    {
+      "epoch": 0.09003335207303656,
+      "grad_norm": 0.002145004691556096,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 3263
+    },
+    {
+      "epoch": 0.09006094427410093,
+      "grad_norm": 0.0034920983016490936,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 3264
+    },
+    {
+      "epoch": 0.09008853647516529,
+      "grad_norm": 0.0032197278924286366,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 3265
+    },
+    {
+      "epoch": 0.09011612867622966,
+      "grad_norm": 0.0023128585889935493,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 3266
+    },
+    {
+      "epoch": 0.09014372087729404,
+      "grad_norm": 0.0028581952210515738,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 3267
+    },
+    {
+      "epoch": 0.0901713130783584,
+      "grad_norm": 0.002962828380987048,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 3268
+    },
+    {
+      "epoch": 0.09019890527942277,
+      "grad_norm": 0.004223112482577562,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 3269
+    },
+    {
+      "epoch": 0.09022649748048714,
+      "grad_norm": 0.02053755894303322,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 3270
+    },
+    {
+      "epoch": 0.0902540896815515,
+      "grad_norm": 0.0043358695693314075,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 3271
+    },
+    {
+      "epoch": 0.09028168188261587,
+      "grad_norm": 0.003768104827031493,
+      "learning_rate": 0.001,
+      "loss": 0.3758,
+      "step": 3272
+    },
+    {
+      "epoch": 0.09030927408368025,
+      "grad_norm": 0.0026477002538740635,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 3273
+    },
+    {
+      "epoch": 0.09033686628474462,
+      "grad_norm": 0.003647193778306246,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 3274
+    },
+    {
+      "epoch": 0.09036445848580898,
+      "grad_norm": 0.002453665481880307,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 3275
+    },
+    {
+      "epoch": 0.09039205068687335,
+      "grad_norm": 0.002882964676246047,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 3276
+    },
+    {
+      "epoch": 0.09041964288793772,
+      "grad_norm": 0.00521261477842927,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 3277
+    },
+    {
+      "epoch": 0.0904472350890021,
+      "grad_norm": 0.002587896538898349,
+      "learning_rate": 0.001,
+      "loss": 0.4291,
+      "step": 3278
+    },
+    {
+      "epoch": 0.09047482729006646,
+      "grad_norm": 0.002533156191930175,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 3279
+    },
+    {
+      "epoch": 0.09050241949113083,
+      "grad_norm": 0.0027842391282320023,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 3280
+    },
+    {
+      "epoch": 0.0905300116921952,
+      "grad_norm": 0.0023463580291718245,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 3281
+    },
+    {
+      "epoch": 0.09055760389325956,
+      "grad_norm": 0.0023778725881129503,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 3282
+    },
+    {
+      "epoch": 0.09058519609432394,
+      "grad_norm": 0.0035691028460860252,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 3283
+    },
+    {
+      "epoch": 0.09061278829538831,
+      "grad_norm": 0.0037309350445866585,
+      "learning_rate": 0.001,
+      "loss": 0.3752,
+      "step": 3284
+    },
+    {
+      "epoch": 0.09064038049645268,
+      "grad_norm": 0.002625760156661272,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 3285
+    },
+    {
+      "epoch": 0.09066797269751704,
+      "grad_norm": 0.003352556610479951,
+      "learning_rate": 0.001,
+      "loss": 0.4268,
+      "step": 3286
+    },
+    {
+      "epoch": 0.09069556489858141,
+      "grad_norm": 0.0050033205188810825,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 3287
+    },
+    {
+      "epoch": 0.09072315709964579,
+      "grad_norm": 0.002488895785063505,
+      "learning_rate": 0.001,
+      "loss": 0.4261,
+      "step": 3288
+    },
+    {
+      "epoch": 0.09075074930071016,
+      "grad_norm": 0.0031474691350013018,
+      "learning_rate": 0.001,
+      "loss": 0.3763,
+      "step": 3289
+    },
+    {
+      "epoch": 0.09077834150177452,
+      "grad_norm": 0.002337649930268526,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 3290
+    },
+    {
+      "epoch": 0.09080593370283889,
+      "grad_norm": 0.0034202388487756252,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 3291
+    },
+    {
+      "epoch": 0.09083352590390326,
+      "grad_norm": 0.004500823561102152,
+      "learning_rate": 0.001,
+      "loss": 0.3663,
+      "step": 3292
+    },
+    {
+      "epoch": 0.09086111810496764,
+      "grad_norm": 0.0026400513015687466,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 3293
+    },
+    {
+      "epoch": 0.090888710306032,
+      "grad_norm": 0.002835595514625311,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 3294
+    },
+    {
+      "epoch": 0.09091630250709637,
+      "grad_norm": 0.0021484890021383762,
+      "learning_rate": 0.001,
+      "loss": 0.3625,
+      "step": 3295
+    },
+    {
+      "epoch": 0.09094389470816074,
+      "grad_norm": 0.002589087001979351,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 3296
+    },
+    {
+      "epoch": 0.0909714869092251,
+      "grad_norm": 0.004091163631528616,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 3297
+    },
+    {
+      "epoch": 0.09099907911028948,
+      "grad_norm": 0.002472213003784418,
+      "learning_rate": 0.001,
+      "loss": 0.4512,
+      "step": 3298
+    },
+    {
+      "epoch": 0.09102667131135385,
+      "grad_norm": 0.0025115078315138817,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 3299
+    },
+    {
+      "epoch": 0.09105426351241822,
+      "grad_norm": 0.002945608925074339,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 3300
+    },
+    {
+      "epoch": 0.09108185571348258,
+      "grad_norm": 0.0024975589476525784,
+      "learning_rate": 0.001,
+      "loss": 0.357,
+      "step": 3301
+    },
+    {
+      "epoch": 0.09110944791454695,
+      "grad_norm": 0.003322755452245474,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 3302
+    },
+    {
+      "epoch": 0.09113704011561133,
+      "grad_norm": 0.003869826439768076,
+      "learning_rate": 0.001,
+      "loss": 0.3477,
+      "step": 3303
+    },
+    {
+      "epoch": 0.0911646323166757,
+      "grad_norm": 0.0036806685384362936,
+      "learning_rate": 0.001,
+      "loss": 0.4237,
+      "step": 3304
+    },
+    {
+      "epoch": 0.09119222451774006,
+      "grad_norm": 0.002534373663365841,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 3305
+    },
+    {
+      "epoch": 0.09121981671880443,
+      "grad_norm": 0.002670099725946784,
+      "learning_rate": 0.001,
+      "loss": 0.4474,
+      "step": 3306
+    },
+    {
+      "epoch": 0.0912474089198688,
+      "grad_norm": 0.0026154613588005304,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 3307
+    },
+    {
+      "epoch": 0.09127500112093317,
+      "grad_norm": 0.002783595584332943,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 3308
+    },
+    {
+      "epoch": 0.09130259332199754,
+      "grad_norm": 0.0024160996545106173,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 3309
+    },
+    {
+      "epoch": 0.09133018552306191,
+      "grad_norm": 0.0040392628870904446,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 3310
+    },
+    {
+      "epoch": 0.09135777772412627,
+      "grad_norm": 0.0036512308288365602,
+      "learning_rate": 0.001,
+      "loss": 0.3794,
+      "step": 3311
+    },
+    {
+      "epoch": 0.09138536992519064,
+      "grad_norm": 0.0026296162977814674,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 3312
+    },
+    {
+      "epoch": 0.09141296212625502,
+      "grad_norm": 0.002523603616282344,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 3313
+    },
+    {
+      "epoch": 0.09144055432731939,
+      "grad_norm": 0.0027449633926153183,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 3314
+    },
+    {
+      "epoch": 0.09146814652838375,
+      "grad_norm": 0.004635502118617296,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 3315
+    },
+    {
+      "epoch": 0.09149573872944812,
+      "grad_norm": 0.0026435446925461292,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 3316
+    },
+    {
+      "epoch": 0.09152333093051249,
+      "grad_norm": 0.012709138914942741,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 3317
+    },
+    {
+      "epoch": 0.09155092313157685,
+      "grad_norm": 0.0032850292045623064,
+      "learning_rate": 0.001,
+      "loss": 0.4388,
+      "step": 3318
+    },
+    {
+      "epoch": 0.09157851533264123,
+      "grad_norm": 0.0038955537602305412,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 3319
+    },
+    {
+      "epoch": 0.0916061075337056,
+      "grad_norm": 0.004457899369299412,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 3320
+    },
+    {
+      "epoch": 0.09163369973476997,
+      "grad_norm": 0.002992943860590458,
+      "learning_rate": 0.001,
+      "loss": 0.4278,
+      "step": 3321
+    },
+    {
+      "epoch": 0.09166129193583433,
+      "grad_norm": 0.004083162639290094,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 3322
+    },
+    {
+      "epoch": 0.0916888841368987,
+      "grad_norm": 0.0032160687260329723,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 3323
+    },
+    {
+      "epoch": 0.09171647633796308,
+      "grad_norm": 0.003969733603298664,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 3324
+    },
+    {
+      "epoch": 0.09174406853902745,
+      "grad_norm": 0.004920699633657932,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 3325
+    },
+    {
+      "epoch": 0.09177166074009181,
+      "grad_norm": 0.005621886812150478,
+      "learning_rate": 0.001,
+      "loss": 0.4114,
+      "step": 3326
+    },
+    {
+      "epoch": 0.09179925294115618,
+      "grad_norm": 0.015430702827870846,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 3327
+    },
+    {
+      "epoch": 0.09182684514222055,
+      "grad_norm": 0.004751747474074364,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 3328
+    },
+    {
+      "epoch": 0.09185443734328493,
+      "grad_norm": 0.0023684531915932894,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 3329
+    },
+    {
+      "epoch": 0.09188202954434929,
+      "grad_norm": 0.004853997845202684,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 3330
+    },
+    {
+      "epoch": 0.09190962174541366,
+      "grad_norm": 0.005576598923653364,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 3331
+    },
+    {
+      "epoch": 0.09193721394647802,
+      "grad_norm": 0.0027397077064961195,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 3332
+    },
+    {
+      "epoch": 0.09196480614754239,
+      "grad_norm": 0.0026846746914088726,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 3333
+    },
+    {
+      "epoch": 0.09199239834860677,
+      "grad_norm": 0.0023833108134567738,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 3334
+    },
+    {
+      "epoch": 0.09201999054967114,
+      "grad_norm": 0.002954500960186124,
+      "learning_rate": 0.001,
+      "loss": 0.4484,
+      "step": 3335
+    },
+    {
+      "epoch": 0.0920475827507355,
+      "grad_norm": 0.0023365288507193327,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 3336
+    },
+    {
+      "epoch": 0.09207517495179987,
+      "grad_norm": 0.0031199888326227665,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 3337
+    },
+    {
+      "epoch": 0.09210276715286424,
+      "grad_norm": 0.00201621581800282,
+      "learning_rate": 0.001,
+      "loss": 0.3766,
+      "step": 3338
+    },
+    {
+      "epoch": 0.09213035935392862,
+      "grad_norm": 0.0036083683371543884,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 3339
+    },
+    {
+      "epoch": 0.09215795155499298,
+      "grad_norm": 0.0037763137370347977,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 3340
+    },
+    {
+      "epoch": 0.09218554375605735,
+      "grad_norm": 0.002573802135884762,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 3341
+    },
+    {
+      "epoch": 0.09221313595712172,
+      "grad_norm": 0.0032212398946285248,
+      "learning_rate": 0.001,
+      "loss": 0.3777,
+      "step": 3342
+    },
+    {
+      "epoch": 0.09224072815818608,
+      "grad_norm": 0.002471911022439599,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 3343
+    },
+    {
+      "epoch": 0.09226832035925046,
+      "grad_norm": 0.0030415072105824947,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 3344
+    },
+    {
+      "epoch": 0.09229591256031483,
+      "grad_norm": 0.0027574030682444572,
+      "learning_rate": 0.001,
+      "loss": 0.3627,
+      "step": 3345
+    },
+    {
+      "epoch": 0.0923235047613792,
+      "grad_norm": 0.0020206805784255266,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 3346
+    },
+    {
+      "epoch": 0.09235109696244356,
+      "grad_norm": 0.002503051655367017,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 3347
+    },
+    {
+      "epoch": 0.09237868916350793,
+      "grad_norm": 0.00392269529402256,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 3348
+    },
+    {
+      "epoch": 0.09240628136457231,
+      "grad_norm": 0.0035916061606258154,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 3349
+    },
+    {
+      "epoch": 0.09243387356563668,
+      "grad_norm": 0.003222295781597495,
+      "learning_rate": 0.001,
+      "loss": 0.4087,
+      "step": 3350
+    },
+    {
+      "epoch": 0.09246146576670104,
+      "grad_norm": 0.003206141758710146,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 3351
+    },
+    {
+      "epoch": 0.09248905796776541,
+      "grad_norm": 0.00480427872389555,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 3352
+    },
+    {
+      "epoch": 0.09251665016882978,
+      "grad_norm": 0.0026392394211143255,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 3353
+    },
+    {
+      "epoch": 0.09254424236989416,
+      "grad_norm": 0.003221785882487893,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 3354
+    },
+    {
+      "epoch": 0.09257183457095852,
+      "grad_norm": 0.0030678475741297007,
+      "learning_rate": 0.001,
+      "loss": 0.4267,
+      "step": 3355
+    },
+    {
+      "epoch": 0.09259942677202289,
+      "grad_norm": 0.0028969766572117805,
+      "learning_rate": 0.001,
+      "loss": 0.4222,
+      "step": 3356
+    },
+    {
+      "epoch": 0.09262701897308726,
+      "grad_norm": 0.00338752125389874,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 3357
+    },
+    {
+      "epoch": 0.09265461117415162,
+      "grad_norm": 0.003165165428072214,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 3358
+    },
+    {
+      "epoch": 0.092682203375216,
+      "grad_norm": 0.002541585825383663,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 3359
+    },
+    {
+      "epoch": 0.09270979557628037,
+      "grad_norm": 0.0037303478457033634,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 3360
+    },
+    {
+      "epoch": 0.09273738777734473,
+      "grad_norm": 0.006375085562467575,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 3361
+    },
+    {
+      "epoch": 0.0927649799784091,
+      "grad_norm": 0.023185908794403076,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 3362
+    },
+    {
+      "epoch": 0.09279257217947347,
+      "grad_norm": 0.003924873657524586,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 3363
+    },
+    {
+      "epoch": 0.09282016438053785,
+      "grad_norm": 0.0037961790803819895,
+      "learning_rate": 0.001,
+      "loss": 0.4434,
+      "step": 3364
+    },
+    {
+      "epoch": 0.09284775658160221,
+      "grad_norm": 0.005324463825672865,
+      "learning_rate": 0.001,
+      "loss": 0.3622,
+      "step": 3365
+    },
+    {
+      "epoch": 0.09287534878266658,
+      "grad_norm": 0.0031803487800061703,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 3366
+    },
+    {
+      "epoch": 0.09290294098373095,
+      "grad_norm": 0.003402640810236335,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 3367
+    },
+    {
+      "epoch": 0.09293053318479531,
+      "grad_norm": 0.003059846581891179,
+      "learning_rate": 0.001,
+      "loss": 0.3749,
+      "step": 3368
+    },
+    {
+      "epoch": 0.09295812538585968,
+      "grad_norm": 0.002190982224419713,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 3369
+    },
+    {
+      "epoch": 0.09298571758692406,
+      "grad_norm": 0.007336355280131102,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 3370
+    },
+    {
+      "epoch": 0.09301330978798843,
+      "grad_norm": 0.0030043197330087423,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 3371
+    },
+    {
+      "epoch": 0.0930409019890528,
+      "grad_norm": 0.00424328725785017,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 3372
+    },
+    {
+      "epoch": 0.09306849419011716,
+      "grad_norm": 0.0036531679797917604,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 3373
+    },
+    {
+      "epoch": 0.09309608639118153,
+      "grad_norm": 0.0029019531793892384,
+      "learning_rate": 0.001,
+      "loss": 0.4291,
+      "step": 3374
+    },
+    {
+      "epoch": 0.0931236785922459,
+      "grad_norm": 0.002536515239626169,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 3375
+    },
+    {
+      "epoch": 0.09315127079331027,
+      "grad_norm": 0.003373499261215329,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 3376
+    },
+    {
+      "epoch": 0.09317886299437464,
+      "grad_norm": 0.0034982135985046625,
+      "learning_rate": 0.001,
+      "loss": 0.429,
+      "step": 3377
+    },
+    {
+      "epoch": 0.093206455195439,
+      "grad_norm": 0.005799585022032261,
+      "learning_rate": 0.001,
+      "loss": 0.4309,
+      "step": 3378
+    },
+    {
+      "epoch": 0.09323404739650337,
+      "grad_norm": 0.002723401878029108,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 3379
+    },
+    {
+      "epoch": 0.09326163959756775,
+      "grad_norm": 0.002761744661256671,
+      "learning_rate": 0.001,
+      "loss": 0.4301,
+      "step": 3380
+    },
+    {
+      "epoch": 0.09328923179863212,
+      "grad_norm": 0.0033628942910581827,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 3381
+    },
+    {
+      "epoch": 0.09331682399969649,
+      "grad_norm": 0.004340017680078745,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 3382
+    },
+    {
+      "epoch": 0.09334441620076085,
+      "grad_norm": 0.00276694493368268,
+      "learning_rate": 0.001,
+      "loss": 0.4233,
+      "step": 3383
+    },
+    {
+      "epoch": 0.09337200840182522,
+      "grad_norm": 0.0040064319036901,
+      "learning_rate": 0.001,
+      "loss": 0.4161,
+      "step": 3384
+    },
+    {
+      "epoch": 0.0933996006028896,
+      "grad_norm": 0.004304267466068268,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 3385
+    },
+    {
+      "epoch": 0.09342719280395397,
+      "grad_norm": 0.0043169171549379826,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 3386
+    },
+    {
+      "epoch": 0.09345478500501833,
+      "grad_norm": 0.0029776948504149914,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 3387
+    },
+    {
+      "epoch": 0.0934823772060827,
+      "grad_norm": 0.0030496844556182623,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 3388
+    },
+    {
+      "epoch": 0.09350996940714706,
+      "grad_norm": 0.002344959881156683,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 3389
+    },
+    {
+      "epoch": 0.09353756160821144,
+      "grad_norm": 0.0035598163958638906,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 3390
+    },
+    {
+      "epoch": 0.09356515380927581,
+      "grad_norm": 0.002589760348200798,
+      "learning_rate": 0.001,
+      "loss": 0.3717,
+      "step": 3391
+    },
+    {
+      "epoch": 0.09359274601034018,
+      "grad_norm": 0.003679790301248431,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 3392
+    },
+    {
+      "epoch": 0.09362033821140454,
+      "grad_norm": 0.0021475160028785467,
+      "learning_rate": 0.001,
+      "loss": 0.4379,
+      "step": 3393
+    },
+    {
+      "epoch": 0.09364793041246891,
+      "grad_norm": 0.0025992344599217176,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 3394
+    },
+    {
+      "epoch": 0.09367552261353329,
+      "grad_norm": 0.003233823226764798,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 3395
+    },
+    {
+      "epoch": 0.09370311481459766,
+      "grad_norm": 0.003669161582365632,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 3396
+    },
+    {
+      "epoch": 0.09373070701566202,
+      "grad_norm": 0.0025238466914743185,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 3397
+    },
+    {
+      "epoch": 0.09375829921672639,
+      "grad_norm": 0.0033630593679845333,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 3398
+    },
+    {
+      "epoch": 0.09378589141779076,
+      "grad_norm": 0.003087608842179179,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 3399
+    },
+    {
+      "epoch": 0.09381348361885514,
+      "grad_norm": 0.003995342645794153,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 3400
+    },
+    {
+      "epoch": 0.0938410758199195,
+      "grad_norm": 0.0032711985986679792,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 3401
+    },
+    {
+      "epoch": 0.09386866802098387,
+      "grad_norm": 0.003242811420932412,
+      "learning_rate": 0.001,
+      "loss": 0.4031,
+      "step": 3402
+    },
+    {
+      "epoch": 0.09389626022204824,
+      "grad_norm": 0.003735556034371257,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 3403
+    },
+    {
+      "epoch": 0.0939238524231126,
+      "grad_norm": 0.005183308385312557,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 3404
+    },
+    {
+      "epoch": 0.09395144462417698,
+      "grad_norm": 0.0034695270005613565,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 3405
+    },
+    {
+      "epoch": 0.09397903682524135,
+      "grad_norm": 0.002555908402428031,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 3406
+    },
+    {
+      "epoch": 0.09400662902630572,
+      "grad_norm": 0.0033101667650043964,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 3407
+    },
+    {
+      "epoch": 0.09403422122737008,
+      "grad_norm": 0.0035219481214880943,
+      "learning_rate": 0.001,
+      "loss": 0.4187,
+      "step": 3408
+    },
+    {
+      "epoch": 0.09406181342843445,
+      "grad_norm": 0.002841345267370343,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 3409
+    },
+    {
+      "epoch": 0.09408940562949883,
+      "grad_norm": 0.0024063391610980034,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 3410
+    },
+    {
+      "epoch": 0.0941169978305632,
+      "grad_norm": 0.004291092045605183,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 3411
+    },
+    {
+      "epoch": 0.09414459003162756,
+      "grad_norm": 0.00434610852971673,
+      "learning_rate": 0.001,
+      "loss": 0.4221,
+      "step": 3412
+    },
+    {
+      "epoch": 0.09417218223269193,
+      "grad_norm": 0.002690440509468317,
+      "learning_rate": 0.001,
+      "loss": 0.461,
+      "step": 3413
+    },
+    {
+      "epoch": 0.0941997744337563,
+      "grad_norm": 0.004084399435669184,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 3414
+    },
+    {
+      "epoch": 0.09422736663482066,
+      "grad_norm": 0.003913892433047295,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 3415
+    },
+    {
+      "epoch": 0.09425495883588504,
+      "grad_norm": 0.002662160899490118,
+      "learning_rate": 0.001,
+      "loss": 0.4368,
+      "step": 3416
+    },
+    {
+      "epoch": 0.09428255103694941,
+      "grad_norm": 0.0030965355690568686,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 3417
+    },
+    {
+      "epoch": 0.09431014323801377,
+      "grad_norm": 0.00348603050224483,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 3418
+    },
+    {
+      "epoch": 0.09433773543907814,
+      "grad_norm": 0.002485208213329315,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 3419
+    },
+    {
+      "epoch": 0.09436532764014251,
+      "grad_norm": 0.0029183162841945887,
+      "learning_rate": 0.001,
+      "loss": 0.3522,
+      "step": 3420
+    },
+    {
+      "epoch": 0.09439291984120689,
+      "grad_norm": 0.0036348486319184303,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 3421
+    },
+    {
+      "epoch": 0.09442051204227125,
+      "grad_norm": 0.03472241014242172,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 3422
+    },
+    {
+      "epoch": 0.09444810424333562,
+      "grad_norm": 0.0032440361101180315,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 3423
+    },
+    {
+      "epoch": 0.09447569644439999,
+      "grad_norm": 0.0023810886777937412,
+      "learning_rate": 0.001,
+      "loss": 0.4417,
+      "step": 3424
+    },
+    {
+      "epoch": 0.09450328864546435,
+      "grad_norm": 0.0022793947719037533,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 3425
+    },
+    {
+      "epoch": 0.09453088084652873,
+      "grad_norm": 0.0028985291719436646,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 3426
+    },
+    {
+      "epoch": 0.0945584730475931,
+      "grad_norm": 0.00440243910998106,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 3427
+    },
+    {
+      "epoch": 0.09458606524865747,
+      "grad_norm": 0.003319642972201109,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 3428
+    },
+    {
+      "epoch": 0.09461365744972183,
+      "grad_norm": 0.016772592440247536,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 3429
+    },
+    {
+      "epoch": 0.0946412496507862,
+      "grad_norm": 0.005076760891824961,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 3430
+    },
+    {
+      "epoch": 0.09466884185185058,
+      "grad_norm": 0.005572374444454908,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 3431
+    },
+    {
+      "epoch": 0.09469643405291495,
+      "grad_norm": 0.004039878491312265,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 3432
+    },
+    {
+      "epoch": 0.09472402625397931,
+      "grad_norm": 0.0031457028817385435,
+      "learning_rate": 0.001,
+      "loss": 0.4291,
+      "step": 3433
+    },
+    {
+      "epoch": 0.09475161845504368,
+      "grad_norm": 0.0031064285431057215,
+      "learning_rate": 0.001,
+      "loss": 0.3624,
+      "step": 3434
+    },
+    {
+      "epoch": 0.09477921065610805,
+      "grad_norm": 0.00337353372015059,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 3435
+    },
+    {
+      "epoch": 0.09480680285717243,
+      "grad_norm": 0.0029832145664840937,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 3436
+    },
+    {
+      "epoch": 0.09483439505823679,
+      "grad_norm": 0.0031751911155879498,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 3437
+    },
+    {
+      "epoch": 0.09486198725930116,
+      "grad_norm": 0.056237224489450455,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 3438
+    },
+    {
+      "epoch": 0.09488957946036553,
+      "grad_norm": 0.010076162405312061,
+      "learning_rate": 0.001,
+      "loss": 0.3568,
+      "step": 3439
+    },
+    {
+      "epoch": 0.09491717166142989,
+      "grad_norm": 0.0027662264183163643,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 3440
+    },
+    {
+      "epoch": 0.09494476386249427,
+      "grad_norm": 0.0028696192894130945,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 3441
+    },
+    {
+      "epoch": 0.09497235606355864,
+      "grad_norm": 0.0029843011870980263,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 3442
+    },
+    {
+      "epoch": 0.094999948264623,
+      "grad_norm": 0.0037254204507917166,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 3443
+    },
+    {
+      "epoch": 0.09502754046568737,
+      "grad_norm": 0.0030714944005012512,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 3444
+    },
+    {
+      "epoch": 0.09505513266675174,
+      "grad_norm": 0.0032339338213205338,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 3445
+    },
+    {
+      "epoch": 0.09508272486781612,
+      "grad_norm": 0.002931388793513179,
+      "learning_rate": 0.001,
+      "loss": 0.3719,
+      "step": 3446
+    },
+    {
+      "epoch": 0.09511031706888048,
+      "grad_norm": 0.0032540869433432817,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 3447
+    },
+    {
+      "epoch": 0.09513790926994485,
+      "grad_norm": 0.003339814953505993,
+      "learning_rate": 0.001,
+      "loss": 0.3725,
+      "step": 3448
+    },
+    {
+      "epoch": 0.09516550147100922,
+      "grad_norm": 0.0028484398499131203,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 3449
+    },
+    {
+      "epoch": 0.09519309367207358,
+      "grad_norm": 0.0028640010859817266,
+      "learning_rate": 0.001,
+      "loss": 0.3643,
+      "step": 3450
+    },
+    {
+      "epoch": 0.09522068587313796,
+      "grad_norm": 0.003085035365074873,
+      "learning_rate": 0.001,
+      "loss": 0.4297,
+      "step": 3451
+    },
+    {
+      "epoch": 0.09524827807420233,
+      "grad_norm": 0.0024918443523347378,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 3452
+    },
+    {
+      "epoch": 0.0952758702752667,
+      "grad_norm": 0.0036890755873173475,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 3453
+    },
+    {
+      "epoch": 0.09530346247633106,
+      "grad_norm": 0.002531822072342038,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 3454
+    },
+    {
+      "epoch": 0.09533105467739543,
+      "grad_norm": 0.00243132165633142,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 3455
+    },
+    {
+      "epoch": 0.09535864687845981,
+      "grad_norm": 0.0061992863193154335,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 3456
+    },
+    {
+      "epoch": 0.09538623907952418,
+      "grad_norm": 0.0030754937324672937,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 3457
+    },
+    {
+      "epoch": 0.09541383128058854,
+      "grad_norm": 0.0024158579763025045,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 3458
+    },
+    {
+      "epoch": 0.09544142348165291,
+      "grad_norm": 0.002696392824873328,
+      "learning_rate": 0.001,
+      "loss": 0.3766,
+      "step": 3459
+    },
+    {
+      "epoch": 0.09546901568271728,
+      "grad_norm": 0.0024891409557312727,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 3460
+    },
+    {
+      "epoch": 0.09549660788378164,
+      "grad_norm": 0.004480718169361353,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 3461
+    },
+    {
+      "epoch": 0.09552420008484602,
+      "grad_norm": 0.0028512163553386927,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 3462
+    },
+    {
+      "epoch": 0.09555179228591039,
+      "grad_norm": 0.003078205045312643,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 3463
+    },
+    {
+      "epoch": 0.09557938448697476,
+      "grad_norm": 0.004663311876356602,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 3464
+    },
+    {
+      "epoch": 0.09560697668803912,
+      "grad_norm": 0.002411546418443322,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 3465
+    },
+    {
+      "epoch": 0.09563456888910349,
+      "grad_norm": 0.0031327796168625355,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 3466
+    },
+    {
+      "epoch": 0.09566216109016787,
+      "grad_norm": 0.0027790337335318327,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 3467
+    },
+    {
+      "epoch": 0.09568975329123224,
+      "grad_norm": 0.0028543812222778797,
+      "learning_rate": 0.001,
+      "loss": 0.4288,
+      "step": 3468
+    },
+    {
+      "epoch": 0.0957173454922966,
+      "grad_norm": 0.003498122561722994,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 3469
+    },
+    {
+      "epoch": 0.09574493769336097,
+      "grad_norm": 0.00663920259103179,
+      "learning_rate": 0.001,
+      "loss": 0.4401,
+      "step": 3470
+    },
+    {
+      "epoch": 0.09577252989442533,
+      "grad_norm": 0.0032984658610075712,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 3471
+    },
+    {
+      "epoch": 0.09580012209548971,
+      "grad_norm": 0.0030672100838273764,
+      "learning_rate": 0.001,
+      "loss": 0.411,
+      "step": 3472
+    },
+    {
+      "epoch": 0.09582771429655408,
+      "grad_norm": 0.003116041421890259,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 3473
+    },
+    {
+      "epoch": 0.09585530649761845,
+      "grad_norm": 0.0065819453448057175,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 3474
+    },
+    {
+      "epoch": 0.09588289869868281,
+      "grad_norm": 0.015010586008429527,
+      "learning_rate": 0.001,
+      "loss": 0.4313,
+      "step": 3475
+    },
+    {
+      "epoch": 0.09591049089974718,
+      "grad_norm": 0.008877400308847427,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 3476
+    },
+    {
+      "epoch": 0.09593808310081156,
+      "grad_norm": 0.004334705416113138,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 3477
+    },
+    {
+      "epoch": 0.09596567530187593,
+      "grad_norm": 0.0028876853175461292,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 3478
+    },
+    {
+      "epoch": 0.0959932675029403,
+      "grad_norm": 0.0028831001836806536,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 3479
+    },
+    {
+      "epoch": 0.09602085970400466,
+      "grad_norm": 0.002438000636175275,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 3480
+    },
+    {
+      "epoch": 0.09604845190506903,
+      "grad_norm": 0.004020646680146456,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 3481
+    },
+    {
+      "epoch": 0.09607604410613341,
+      "grad_norm": 0.0025036863517016172,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 3482
+    },
+    {
+      "epoch": 0.09610363630719777,
+      "grad_norm": 0.0033008300233632326,
+      "learning_rate": 0.001,
+      "loss": 0.4334,
+      "step": 3483
+    },
+    {
+      "epoch": 0.09613122850826214,
+      "grad_norm": 0.002855231985449791,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 3484
+    },
+    {
+      "epoch": 0.0961588207093265,
+      "grad_norm": 0.008589272387325764,
+      "learning_rate": 0.001,
+      "loss": 0.436,
+      "step": 3485
+    },
+    {
+      "epoch": 0.09618641291039087,
+      "grad_norm": 0.0025324004236608744,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 3486
+    },
+    {
+      "epoch": 0.09621400511145525,
+      "grad_norm": 0.003534125629812479,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 3487
+    },
+    {
+      "epoch": 0.09624159731251962,
+      "grad_norm": 0.003250879468396306,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 3488
+    },
+    {
+      "epoch": 0.09626918951358399,
+      "grad_norm": 0.004198684822767973,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 3489
+    },
+    {
+      "epoch": 0.09629678171464835,
+      "grad_norm": 0.0034648876171559095,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 3490
+    },
+    {
+      "epoch": 0.09632437391571272,
+      "grad_norm": 0.003584906691685319,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 3491
+    },
+    {
+      "epoch": 0.0963519661167771,
+      "grad_norm": 0.0033312232699245214,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 3492
+    },
+    {
+      "epoch": 0.09637955831784147,
+      "grad_norm": 0.0032407655380666256,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 3493
+    },
+    {
+      "epoch": 0.09640715051890583,
+      "grad_norm": 0.00250063999556005,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 3494
+    },
+    {
+      "epoch": 0.0964347427199702,
+      "grad_norm": 0.0022465146612375975,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 3495
+    },
+    {
+      "epoch": 0.09646233492103456,
+      "grad_norm": 0.002582980552688241,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 3496
+    },
+    {
+      "epoch": 0.09648992712209895,
+      "grad_norm": 0.003146865637972951,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 3497
+    },
+    {
+      "epoch": 0.09651751932316331,
+      "grad_norm": 0.004826393909752369,
+      "learning_rate": 0.001,
+      "loss": 0.3753,
+      "step": 3498
+    },
+    {
+      "epoch": 0.09654511152422768,
+      "grad_norm": 0.004438623785972595,
+      "learning_rate": 0.001,
+      "loss": 0.4301,
+      "step": 3499
+    },
+    {
+      "epoch": 0.09657270372529204,
+      "grad_norm": 0.002986249513924122,
+      "learning_rate": 0.001,
+      "loss": 0.4145,
+      "step": 3500
+    },
+    {
+      "epoch": 0.09657270372529204,
+      "eval_runtime": 24.1649,
+      "eval_samples_per_second": 1.324,
+      "eval_steps_per_second": 0.166,
+      "step": 3500
+    },
+    {
+      "epoch": 0.09660029592635641,
+      "grad_norm": 0.002994472160935402,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 3501
+    },
+    {
+      "epoch": 0.09662788812742079,
+      "grad_norm": 0.006290449295192957,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 3502
+    },
+    {
+      "epoch": 0.09665548032848516,
+      "grad_norm": 0.0027024163864552975,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 3503
+    },
+    {
+      "epoch": 0.09668307252954952,
+      "grad_norm": 0.0026126017328351736,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 3504
+    },
+    {
+      "epoch": 0.09671066473061389,
+      "grad_norm": 0.002393176080659032,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 3505
+    },
+    {
+      "epoch": 0.09673825693167826,
+      "grad_norm": 0.0025012970436364412,
+      "learning_rate": 0.001,
+      "loss": 0.4309,
+      "step": 3506
+    },
+    {
+      "epoch": 0.09676584913274262,
+      "grad_norm": 0.0026587117463350296,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 3507
+    },
+    {
+      "epoch": 0.096793441333807,
+      "grad_norm": 0.004255054052919149,
+      "learning_rate": 0.001,
+      "loss": 0.3657,
+      "step": 3508
+    },
+    {
+      "epoch": 0.09682103353487137,
+      "grad_norm": 0.0029915212653577328,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 3509
+    },
+    {
+      "epoch": 0.09684862573593574,
+      "grad_norm": 0.002514585852622986,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 3510
+    },
+    {
+      "epoch": 0.0968762179370001,
+      "grad_norm": 0.0031415291596204042,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 3511
+    },
+    {
+      "epoch": 0.09690381013806447,
+      "grad_norm": 0.004052399192005396,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 3512
+    },
+    {
+      "epoch": 0.09693140233912885,
+      "grad_norm": 0.003938235808163881,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 3513
+    },
+    {
+      "epoch": 0.09695899454019322,
+      "grad_norm": 0.0027284801471978426,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 3514
+    },
+    {
+      "epoch": 0.09698658674125758,
+      "grad_norm": 0.0021578462328761816,
+      "learning_rate": 0.001,
+      "loss": 0.4513,
+      "step": 3515
+    },
+    {
+      "epoch": 0.09701417894232195,
+      "grad_norm": 0.0030869885813444853,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 3516
+    },
+    {
+      "epoch": 0.09704177114338632,
+      "grad_norm": 0.0028977058827877045,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 3517
+    },
+    {
+      "epoch": 0.0970693633444507,
+      "grad_norm": 0.0025139932986348867,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 3518
+    },
+    {
+      "epoch": 0.09709695554551506,
+      "grad_norm": 0.0022785612381994724,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 3519
+    },
+    {
+      "epoch": 0.09712454774657943,
+      "grad_norm": 0.002874610014259815,
+      "learning_rate": 0.001,
+      "loss": 0.3645,
+      "step": 3520
+    },
+    {
+      "epoch": 0.0971521399476438,
+      "grad_norm": 0.004266508389264345,
+      "learning_rate": 0.001,
+      "loss": 0.3449,
+      "step": 3521
+    },
+    {
+      "epoch": 0.09717973214870816,
+      "grad_norm": 0.0025132927112281322,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 3522
+    },
+    {
+      "epoch": 0.09720732434977254,
+      "grad_norm": 0.006651229690760374,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 3523
+    },
+    {
+      "epoch": 0.09723491655083691,
+      "grad_norm": 0.0053511932492256165,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 3524
+    },
+    {
+      "epoch": 0.09726250875190127,
+      "grad_norm": 0.0037280949763953686,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 3525
+    },
+    {
+      "epoch": 0.09729010095296564,
+      "grad_norm": 0.0037587578408420086,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 3526
+    },
+    {
+      "epoch": 0.09731769315403001,
+      "grad_norm": 0.0037075839936733246,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 3527
+    },
+    {
+      "epoch": 0.09734528535509439,
+      "grad_norm": 0.004948961082845926,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 3528
+    },
+    {
+      "epoch": 0.09737287755615875,
+      "grad_norm": 0.006435552146285772,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 3529
+    },
+    {
+      "epoch": 0.09740046975722312,
+      "grad_norm": 0.0021128810476511717,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 3530
+    },
+    {
+      "epoch": 0.09742806195828749,
+      "grad_norm": 0.0025616176426410675,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 3531
+    },
+    {
+      "epoch": 0.09745565415935185,
+      "grad_norm": 0.00574469892308116,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 3532
+    },
+    {
+      "epoch": 0.09748324636041623,
+      "grad_norm": 0.0031522875651717186,
+      "learning_rate": 0.001,
+      "loss": 0.3611,
+      "step": 3533
+    },
+    {
+      "epoch": 0.0975108385614806,
+      "grad_norm": 0.0031021784525364637,
+      "learning_rate": 0.001,
+      "loss": 0.3709,
+      "step": 3534
+    },
+    {
+      "epoch": 0.09753843076254497,
+      "grad_norm": 0.005699521396309137,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 3535
+    },
+    {
+      "epoch": 0.09756602296360933,
+      "grad_norm": 0.003169464645907283,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 3536
+    },
+    {
+      "epoch": 0.0975936151646737,
+      "grad_norm": 0.003686879761517048,
+      "learning_rate": 0.001,
+      "loss": 0.4229,
+      "step": 3537
+    },
+    {
+      "epoch": 0.09762120736573808,
+      "grad_norm": 0.0031342299189418554,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 3538
+    },
+    {
+      "epoch": 0.09764879956680245,
+      "grad_norm": 0.0023811724968254566,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 3539
+    },
+    {
+      "epoch": 0.09767639176786681,
+      "grad_norm": 0.003792383009567857,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 3540
+    },
+    {
+      "epoch": 0.09770398396893118,
+      "grad_norm": 0.0024562154430896044,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 3541
+    },
+    {
+      "epoch": 0.09773157616999555,
+      "grad_norm": 0.0034171422012150288,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 3542
+    },
+    {
+      "epoch": 0.09775916837105993,
+      "grad_norm": 0.003149349009618163,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 3543
+    },
+    {
+      "epoch": 0.09778676057212429,
+      "grad_norm": 0.002210955834016204,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 3544
+    },
+    {
+      "epoch": 0.09781435277318866,
+      "grad_norm": 0.003593403846025467,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 3545
+    },
+    {
+      "epoch": 0.09784194497425303,
+      "grad_norm": 0.0022881280165165663,
+      "learning_rate": 0.001,
+      "loss": 0.441,
+      "step": 3546
+    },
+    {
+      "epoch": 0.09786953717531739,
+      "grad_norm": 0.002236244734376669,
+      "learning_rate": 0.001,
+      "loss": 0.4306,
+      "step": 3547
+    },
+    {
+      "epoch": 0.09789712937638177,
+      "grad_norm": 0.004320134408771992,
+      "learning_rate": 0.001,
+      "loss": 0.3519,
+      "step": 3548
+    },
+    {
+      "epoch": 0.09792472157744614,
+      "grad_norm": 0.003616459434852004,
+      "learning_rate": 0.001,
+      "loss": 0.3748,
+      "step": 3549
+    },
+    {
+      "epoch": 0.0979523137785105,
+      "grad_norm": 0.0072233411483466625,
+      "learning_rate": 0.001,
+      "loss": 0.3692,
+      "step": 3550
+    },
+    {
+      "epoch": 0.09797990597957487,
+      "grad_norm": 0.017990432679653168,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 3551
+    },
+    {
+      "epoch": 0.09800749818063924,
+      "grad_norm": 0.0036775225307792425,
+      "learning_rate": 0.001,
+      "loss": 0.3654,
+      "step": 3552
+    },
+    {
+      "epoch": 0.0980350903817036,
+      "grad_norm": 0.0034110434353351593,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 3553
+    },
+    {
+      "epoch": 0.09806268258276798,
+      "grad_norm": 0.005091147031635046,
+      "learning_rate": 0.001,
+      "loss": 0.385,
+      "step": 3554
+    },
+    {
+      "epoch": 0.09809027478383235,
+      "grad_norm": 0.011450120247900486,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 3555
+    },
+    {
+      "epoch": 0.09811786698489672,
+      "grad_norm": 0.004091903567314148,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 3556
+    },
+    {
+      "epoch": 0.09814545918596108,
+      "grad_norm": 0.00835462100803852,
+      "learning_rate": 0.001,
+      "loss": 0.4033,
+      "step": 3557
+    },
+    {
+      "epoch": 0.09817305138702545,
+      "grad_norm": 0.0032325852662324905,
+      "learning_rate": 0.001,
+      "loss": 0.4103,
+      "step": 3558
+    },
+    {
+      "epoch": 0.09820064358808983,
+      "grad_norm": 0.003423454938456416,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 3559
+    },
+    {
+      "epoch": 0.0982282357891542,
+      "grad_norm": 0.0024230678100138903,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 3560
+    },
+    {
+      "epoch": 0.09825582799021856,
+      "grad_norm": 0.003030325984582305,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 3561
+    },
+    {
+      "epoch": 0.09828342019128293,
+      "grad_norm": 0.0031275860965251923,
+      "learning_rate": 0.001,
+      "loss": 0.3591,
+      "step": 3562
+    },
+    {
+      "epoch": 0.0983110123923473,
+      "grad_norm": 0.010123792104423046,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 3563
+    },
+    {
+      "epoch": 0.09833860459341168,
+      "grad_norm": 0.0048133572563529015,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 3564
+    },
+    {
+      "epoch": 0.09836619679447604,
+      "grad_norm": 0.002648024819791317,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 3565
+    },
+    {
+      "epoch": 0.09839378899554041,
+      "grad_norm": 0.006900025065988302,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 3566
+    },
+    {
+      "epoch": 0.09842138119660478,
+      "grad_norm": 0.011950865387916565,
+      "learning_rate": 0.001,
+      "loss": 0.4279,
+      "step": 3567
+    },
+    {
+      "epoch": 0.09844897339766914,
+      "grad_norm": 0.0075233737006783485,
+      "learning_rate": 0.001,
+      "loss": 0.3639,
+      "step": 3568
+    },
+    {
+      "epoch": 0.09847656559873352,
+      "grad_norm": 0.006645913701504469,
+      "learning_rate": 0.001,
+      "loss": 0.4328,
+      "step": 3569
+    },
+    {
+      "epoch": 0.09850415779979789,
+      "grad_norm": 0.005523411091417074,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 3570
+    },
+    {
+      "epoch": 0.09853175000086226,
+      "grad_norm": 0.006219461094588041,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 3571
+    },
+    {
+      "epoch": 0.09855934220192662,
+      "grad_norm": 0.002192398766055703,
+      "learning_rate": 0.001,
+      "loss": 0.3657,
+      "step": 3572
+    },
+    {
+      "epoch": 0.09858693440299099,
+      "grad_norm": 0.002795976819470525,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 3573
+    },
+    {
+      "epoch": 0.09861452660405537,
+      "grad_norm": 0.0030005681328475475,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 3574
+    },
+    {
+      "epoch": 0.09864211880511974,
+      "grad_norm": 0.002711366629227996,
+      "learning_rate": 0.001,
+      "loss": 0.4349,
+      "step": 3575
+    },
+    {
+      "epoch": 0.0986697110061841,
+      "grad_norm": 0.0025707653257995844,
+      "learning_rate": 0.001,
+      "loss": 0.4344,
+      "step": 3576
+    },
+    {
+      "epoch": 0.09869730320724847,
+      "grad_norm": 0.002858672058209777,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 3577
+    },
+    {
+      "epoch": 0.09872489540831283,
+      "grad_norm": 0.003099956549704075,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 3578
+    },
+    {
+      "epoch": 0.09875248760937722,
+      "grad_norm": 0.014343291521072388,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 3579
+    },
+    {
+      "epoch": 0.09878007981044158,
+      "grad_norm": 0.0043120840564370155,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 3580
+    },
+    {
+      "epoch": 0.09880767201150595,
+      "grad_norm": 0.0042751263827085495,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 3581
+    },
+    {
+      "epoch": 0.09883526421257031,
+      "grad_norm": 0.0026994547806680202,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 3582
+    },
+    {
+      "epoch": 0.09886285641363468,
+      "grad_norm": 0.002389197936281562,
+      "learning_rate": 0.001,
+      "loss": 0.419,
+      "step": 3583
+    },
+    {
+      "epoch": 0.09889044861469906,
+      "grad_norm": 0.003209868213161826,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 3584
+    },
+    {
+      "epoch": 0.09891804081576343,
+      "grad_norm": 0.004312742967158556,
+      "learning_rate": 0.001,
+      "loss": 0.3506,
+      "step": 3585
+    },
+    {
+      "epoch": 0.0989456330168278,
+      "grad_norm": 0.003966677933931351,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 3586
+    },
+    {
+      "epoch": 0.09897322521789216,
+      "grad_norm": 0.008509870618581772,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 3587
+    },
+    {
+      "epoch": 0.09900081741895653,
+      "grad_norm": 0.003062241477891803,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 3588
+    },
+    {
+      "epoch": 0.09902840962002091,
+      "grad_norm": 0.0022860439494252205,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 3589
+    },
+    {
+      "epoch": 0.09905600182108527,
+      "grad_norm": 0.004430666100233793,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 3590
+    },
+    {
+      "epoch": 0.09908359402214964,
+      "grad_norm": 0.002458814997226,
+      "learning_rate": 0.001,
+      "loss": 0.417,
+      "step": 3591
+    },
+    {
+      "epoch": 0.099111186223214,
+      "grad_norm": 0.0029090861789882183,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 3592
+    },
+    {
+      "epoch": 0.09913877842427837,
+      "grad_norm": 0.0032796203158795834,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 3593
+    },
+    {
+      "epoch": 0.09916637062534275,
+      "grad_norm": 0.002556765917688608,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 3594
+    },
+    {
+      "epoch": 0.09919396282640712,
+      "grad_norm": 0.0023709458764642477,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 3595
+    },
+    {
+      "epoch": 0.09922155502747149,
+      "grad_norm": 0.003395737847313285,
+      "learning_rate": 0.001,
+      "loss": 0.3762,
+      "step": 3596
+    },
+    {
+      "epoch": 0.09924914722853585,
+      "grad_norm": 0.0026961613912135363,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 3597
+    },
+    {
+      "epoch": 0.09927673942960022,
+      "grad_norm": 0.0020530601032078266,
+      "learning_rate": 0.001,
+      "loss": 0.4465,
+      "step": 3598
+    },
+    {
+      "epoch": 0.0993043316306646,
+      "grad_norm": 0.0026466413401067257,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 3599
+    },
+    {
+      "epoch": 0.09933192383172897,
+      "grad_norm": 0.003969122655689716,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 3600
+    },
+    {
+      "epoch": 0.09935951603279333,
+      "grad_norm": 0.0035336946602910757,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 3601
+    },
+    {
+      "epoch": 0.0993871082338577,
+      "grad_norm": 0.0030988729558885098,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 3602
+    },
+    {
+      "epoch": 0.09941470043492207,
+      "grad_norm": 0.0029431709554046392,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 3603
+    },
+    {
+      "epoch": 0.09944229263598643,
+      "grad_norm": 0.002947601256892085,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 3604
+    },
+    {
+      "epoch": 0.09946988483705081,
+      "grad_norm": 0.002736450405791402,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 3605
+    },
+    {
+      "epoch": 0.09949747703811518,
+      "grad_norm": 0.0027800817042589188,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 3606
+    },
+    {
+      "epoch": 0.09952506923917954,
+      "grad_norm": 0.003060448681935668,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 3607
+    },
+    {
+      "epoch": 0.09955266144024391,
+      "grad_norm": 0.0032676290720701218,
+      "learning_rate": 0.001,
+      "loss": 0.3543,
+      "step": 3608
+    },
+    {
+      "epoch": 0.09958025364130828,
+      "grad_norm": 0.0039128996431827545,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 3609
+    },
+    {
+      "epoch": 0.09960784584237266,
+      "grad_norm": 0.0037549827247858047,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 3610
+    },
+    {
+      "epoch": 0.09963543804343702,
+      "grad_norm": 0.00438270578160882,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 3611
+    },
+    {
+      "epoch": 0.09966303024450139,
+      "grad_norm": 0.003163162851706147,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 3612
+    },
+    {
+      "epoch": 0.09969062244556576,
+      "grad_norm": 0.0033597201108932495,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 3613
+    },
+    {
+      "epoch": 0.09971821464663012,
+      "grad_norm": 0.003219824517145753,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 3614
+    },
+    {
+      "epoch": 0.0997458068476945,
+      "grad_norm": 0.0035223192535340786,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 3615
+    },
+    {
+      "epoch": 0.09977339904875887,
+      "grad_norm": 0.00964295119047165,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 3616
+    },
+    {
+      "epoch": 0.09980099124982324,
+      "grad_norm": 0.002427217550575733,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 3617
+    },
+    {
+      "epoch": 0.0998285834508876,
+      "grad_norm": 0.004480184521526098,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 3618
+    },
+    {
+      "epoch": 0.09985617565195197,
+      "grad_norm": 0.0030322298407554626,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 3619
+    },
+    {
+      "epoch": 0.09988376785301635,
+      "grad_norm": 0.02180991880595684,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 3620
+    },
+    {
+      "epoch": 0.09991136005408072,
+      "grad_norm": 0.0030360252130776644,
+      "learning_rate": 0.001,
+      "loss": 0.433,
+      "step": 3621
+    },
+    {
+      "epoch": 0.09993895225514508,
+      "grad_norm": 0.0025207020808011293,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 3622
+    },
+    {
+      "epoch": 0.09996654445620945,
+      "grad_norm": 0.004299542400985956,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 3623
+    },
+    {
+      "epoch": 0.09999413665727382,
+      "grad_norm": 0.004565478768199682,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 3624
+    },
+    {
+      "epoch": 0.1000217288583382,
+      "grad_norm": 0.0028781460132449865,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 3625
+    },
+    {
+      "epoch": 0.10004932105940256,
+      "grad_norm": 0.0026228884235024452,
+      "learning_rate": 0.001,
+      "loss": 0.372,
+      "step": 3626
+    },
+    {
+      "epoch": 0.10007691326046693,
+      "grad_norm": 0.0029596835374832153,
+      "learning_rate": 0.001,
+      "loss": 0.4371,
+      "step": 3627
+    },
+    {
+      "epoch": 0.1001045054615313,
+      "grad_norm": 0.007199972402304411,
+      "learning_rate": 0.001,
+      "loss": 0.3777,
+      "step": 3628
+    },
+    {
+      "epoch": 0.10013209766259566,
+      "grad_norm": 0.003069596830755472,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 3629
+    },
+    {
+      "epoch": 0.10015968986366004,
+      "grad_norm": 0.002500646049156785,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 3630
+    },
+    {
+      "epoch": 0.10018728206472441,
+      "grad_norm": 0.0024019493721425533,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 3631
+    },
+    {
+      "epoch": 0.10021487426578878,
+      "grad_norm": 0.0032364402431994677,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 3632
+    },
+    {
+      "epoch": 0.10024246646685314,
+      "grad_norm": 0.00393798528239131,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 3633
+    },
+    {
+      "epoch": 0.10027005866791751,
+      "grad_norm": 0.0036344374530017376,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 3634
+    },
+    {
+      "epoch": 0.10029765086898189,
+      "grad_norm": 0.003431879449635744,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 3635
+    },
+    {
+      "epoch": 0.10032524307004625,
+      "grad_norm": 0.003278666641563177,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 3636
+    },
+    {
+      "epoch": 0.10035283527111062,
+      "grad_norm": 0.003398419125005603,
+      "learning_rate": 0.001,
+      "loss": 0.3683,
+      "step": 3637
+    },
+    {
+      "epoch": 0.10038042747217499,
+      "grad_norm": 0.002605091081932187,
+      "learning_rate": 0.001,
+      "loss": 0.4285,
+      "step": 3638
+    },
+    {
+      "epoch": 0.10040801967323935,
+      "grad_norm": 0.0038700527511537075,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 3639
+    },
+    {
+      "epoch": 0.10043561187430373,
+      "grad_norm": 0.002678008284419775,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 3640
+    },
+    {
+      "epoch": 0.1004632040753681,
+      "grad_norm": 0.004668472800403833,
+      "learning_rate": 0.001,
+      "loss": 0.3779,
+      "step": 3641
+    },
+    {
+      "epoch": 0.10049079627643247,
+      "grad_norm": 0.003300663083791733,
+      "learning_rate": 0.001,
+      "loss": 0.4322,
+      "step": 3642
+    },
+    {
+      "epoch": 0.10051838847749683,
+      "grad_norm": 0.0024147978983819485,
+      "learning_rate": 0.001,
+      "loss": 0.3779,
+      "step": 3643
+    },
+    {
+      "epoch": 0.1005459806785612,
+      "grad_norm": 0.005644883494824171,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 3644
+    },
+    {
+      "epoch": 0.10057357287962558,
+      "grad_norm": 0.004027125891298056,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 3645
+    },
+    {
+      "epoch": 0.10060116508068995,
+      "grad_norm": 0.003284280654042959,
+      "learning_rate": 0.001,
+      "loss": 0.3697,
+      "step": 3646
+    },
+    {
+      "epoch": 0.10062875728175431,
+      "grad_norm": 0.0025769018102437258,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 3647
+    },
+    {
+      "epoch": 0.10065634948281868,
+      "grad_norm": 0.002732061082497239,
+      "learning_rate": 0.001,
+      "loss": 0.4222,
+      "step": 3648
+    },
+    {
+      "epoch": 0.10068394168388305,
+      "grad_norm": 0.00284641538746655,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 3649
+    },
+    {
+      "epoch": 0.10071153388494741,
+      "grad_norm": 0.0040368628688156605,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 3650
+    },
+    {
+      "epoch": 0.1007391260860118,
+      "grad_norm": 0.0030586514621973038,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 3651
+    },
+    {
+      "epoch": 0.10076671828707616,
+      "grad_norm": 0.002908149268478155,
+      "learning_rate": 0.001,
+      "loss": 0.4325,
+      "step": 3652
+    },
+    {
+      "epoch": 0.10079431048814053,
+      "grad_norm": 0.0038641381543129683,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 3653
+    },
+    {
+      "epoch": 0.10082190268920489,
+      "grad_norm": 0.0032964826095849276,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 3654
+    },
+    {
+      "epoch": 0.10084949489026926,
+      "grad_norm": 0.0040243249386549,
+      "learning_rate": 0.001,
+      "loss": 0.3653,
+      "step": 3655
+    },
+    {
+      "epoch": 0.10087708709133364,
+      "grad_norm": 0.004164101555943489,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 3656
+    },
+    {
+      "epoch": 0.100904679292398,
+      "grad_norm": 0.003107170108705759,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 3657
+    },
+    {
+      "epoch": 0.10093227149346237,
+      "grad_norm": 0.0029900551307946444,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 3658
+    },
+    {
+      "epoch": 0.10095986369452674,
+      "grad_norm": 0.0030398843809962273,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 3659
+    },
+    {
+      "epoch": 0.1009874558955911,
+      "grad_norm": 0.0038001432549208403,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 3660
+    },
+    {
+      "epoch": 0.10101504809665549,
+      "grad_norm": 0.0026995730586349964,
+      "learning_rate": 0.001,
+      "loss": 0.3667,
+      "step": 3661
+    },
+    {
+      "epoch": 0.10104264029771985,
+      "grad_norm": 0.0022786613553762436,
+      "learning_rate": 0.001,
+      "loss": 0.4441,
+      "step": 3662
+    },
+    {
+      "epoch": 0.10107023249878422,
+      "grad_norm": 0.002397672738879919,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 3663
+    },
+    {
+      "epoch": 0.10109782469984858,
+      "grad_norm": 0.002657962031662464,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 3664
+    },
+    {
+      "epoch": 0.10112541690091295,
+      "grad_norm": 0.004640195053070784,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 3665
+    },
+    {
+      "epoch": 0.10115300910197733,
+      "grad_norm": 0.0031627060379832983,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 3666
+    },
+    {
+      "epoch": 0.1011806013030417,
+      "grad_norm": 0.003310044063255191,
+      "learning_rate": 0.001,
+      "loss": 0.4333,
+      "step": 3667
+    },
+    {
+      "epoch": 0.10120819350410606,
+      "grad_norm": 0.0024094157852232456,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 3668
+    },
+    {
+      "epoch": 0.10123578570517043,
+      "grad_norm": 0.0029870392754673958,
+      "learning_rate": 0.001,
+      "loss": 0.3704,
+      "step": 3669
+    },
+    {
+      "epoch": 0.1012633779062348,
+      "grad_norm": 0.002329483861103654,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 3670
+    },
+    {
+      "epoch": 0.10129097010729918,
+      "grad_norm": 0.003403107402846217,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 3671
+    },
+    {
+      "epoch": 0.10131856230836354,
+      "grad_norm": 0.0027673887088894844,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 3672
+    },
+    {
+      "epoch": 0.10134615450942791,
+      "grad_norm": 0.0028799972496926785,
+      "learning_rate": 0.001,
+      "loss": 0.3715,
+      "step": 3673
+    },
+    {
+      "epoch": 0.10137374671049228,
+      "grad_norm": 0.003228268353268504,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 3674
+    },
+    {
+      "epoch": 0.10140133891155664,
+      "grad_norm": 0.0028087422251701355,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 3675
+    },
+    {
+      "epoch": 0.10142893111262102,
+      "grad_norm": 0.0026430152356624603,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 3676
+    },
+    {
+      "epoch": 0.10145652331368539,
+      "grad_norm": 0.0036075664684176445,
+      "learning_rate": 0.001,
+      "loss": 0.3834,
+      "step": 3677
+    },
+    {
+      "epoch": 0.10148411551474976,
+      "grad_norm": 0.0028451229445636272,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 3678
+    },
+    {
+      "epoch": 0.10151170771581412,
+      "grad_norm": 0.0037802942097187042,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 3679
+    },
+    {
+      "epoch": 0.10153929991687849,
+      "grad_norm": 0.0029138477984815836,
+      "learning_rate": 0.001,
+      "loss": 0.436,
+      "step": 3680
+    },
+    {
+      "epoch": 0.10156689211794287,
+      "grad_norm": 0.003684982191771269,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 3681
+    },
+    {
+      "epoch": 0.10159448431900724,
+      "grad_norm": 0.005630989093333483,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 3682
+    },
+    {
+      "epoch": 0.1016220765200716,
+      "grad_norm": 0.006454580929130316,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 3683
+    },
+    {
+      "epoch": 0.10164966872113597,
+      "grad_norm": 0.007667763624340296,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 3684
+    },
+    {
+      "epoch": 0.10167726092220034,
+      "grad_norm": 0.004302634857594967,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 3685
+    },
+    {
+      "epoch": 0.10170485312326472,
+      "grad_norm": 0.003696159226819873,
+      "learning_rate": 0.001,
+      "loss": 0.4209,
+      "step": 3686
+    },
+    {
+      "epoch": 0.10173244532432908,
+      "grad_norm": 0.004956797696650028,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 3687
+    },
+    {
+      "epoch": 0.10176003752539345,
+      "grad_norm": 0.0026316859293729067,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 3688
+    },
+    {
+      "epoch": 0.10178762972645782,
+      "grad_norm": 0.002009750111028552,
+      "learning_rate": 0.001,
+      "loss": 0.4335,
+      "step": 3689
+    },
+    {
+      "epoch": 0.10181522192752218,
+      "grad_norm": 0.003345980541780591,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 3690
+    },
+    {
+      "epoch": 0.10184281412858656,
+      "grad_norm": 0.002231464022770524,
+      "learning_rate": 0.001,
+      "loss": 0.4316,
+      "step": 3691
+    },
+    {
+      "epoch": 0.10187040632965093,
+      "grad_norm": 0.003336479654535651,
+      "learning_rate": 0.001,
+      "loss": 0.3732,
+      "step": 3692
+    },
+    {
+      "epoch": 0.1018979985307153,
+      "grad_norm": 0.002393354196101427,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 3693
+    },
+    {
+      "epoch": 0.10192559073177966,
+      "grad_norm": 0.0024670169223099947,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 3694
+    },
+    {
+      "epoch": 0.10195318293284403,
+      "grad_norm": 0.0034887620713561773,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 3695
+    },
+    {
+      "epoch": 0.1019807751339084,
+      "grad_norm": 0.003547382541000843,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 3696
+    },
+    {
+      "epoch": 0.10200836733497277,
+      "grad_norm": 0.0034907760564237833,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 3697
+    },
+    {
+      "epoch": 0.10203595953603714,
+      "grad_norm": 0.002545100636780262,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 3698
+    },
+    {
+      "epoch": 0.10206355173710151,
+      "grad_norm": 0.004985075909644365,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 3699
+    },
+    {
+      "epoch": 0.10209114393816587,
+      "grad_norm": 0.004573920741677284,
+      "learning_rate": 0.001,
+      "loss": 0.3743,
+      "step": 3700
+    },
+    {
+      "epoch": 0.10211873613923024,
+      "grad_norm": 0.004074465949088335,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 3701
+    },
+    {
+      "epoch": 0.10214632834029462,
+      "grad_norm": 0.0037853543180972338,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 3702
+    },
+    {
+      "epoch": 0.10217392054135899,
+      "grad_norm": 0.002464262768626213,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 3703
+    },
+    {
+      "epoch": 0.10220151274242335,
+      "grad_norm": 0.0036815868224948645,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 3704
+    },
+    {
+      "epoch": 0.10222910494348772,
+      "grad_norm": 0.0033971264492720366,
+      "learning_rate": 0.001,
+      "loss": 0.4613,
+      "step": 3705
+    },
+    {
+      "epoch": 0.10225669714455209,
+      "grad_norm": 0.0029886479023844004,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 3706
+    },
+    {
+      "epoch": 0.10228428934561647,
+      "grad_norm": 0.0036080891732126474,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 3707
+    },
+    {
+      "epoch": 0.10231188154668083,
+      "grad_norm": 0.0026688736397773027,
+      "learning_rate": 0.001,
+      "loss": 0.432,
+      "step": 3708
+    },
+    {
+      "epoch": 0.1023394737477452,
+      "grad_norm": 0.003568600630387664,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 3709
+    },
+    {
+      "epoch": 0.10236706594880957,
+      "grad_norm": 0.0037499042227864265,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 3710
+    },
+    {
+      "epoch": 0.10239465814987393,
+      "grad_norm": 0.0027967335190624,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 3711
+    },
+    {
+      "epoch": 0.10242225035093831,
+      "grad_norm": 0.002266339026391506,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 3712
+    },
+    {
+      "epoch": 0.10244984255200268,
+      "grad_norm": 0.002678538439795375,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 3713
+    },
+    {
+      "epoch": 0.10247743475306705,
+      "grad_norm": 0.007051249034702778,
+      "learning_rate": 0.001,
+      "loss": 0.3694,
+      "step": 3714
+    },
+    {
+      "epoch": 0.10250502695413141,
+      "grad_norm": 0.002434907481074333,
+      "learning_rate": 0.001,
+      "loss": 0.3636,
+      "step": 3715
+    },
+    {
+      "epoch": 0.10253261915519578,
+      "grad_norm": 0.002865745685994625,
+      "learning_rate": 0.001,
+      "loss": 0.4641,
+      "step": 3716
+    },
+    {
+      "epoch": 0.10256021135626016,
+      "grad_norm": 0.0022143360693007708,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 3717
+    },
+    {
+      "epoch": 0.10258780355732453,
+      "grad_norm": 0.0022539508063346148,
+      "learning_rate": 0.001,
+      "loss": 0.4341,
+      "step": 3718
+    },
+    {
+      "epoch": 0.10261539575838889,
+      "grad_norm": 0.0032584406435489655,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 3719
+    },
+    {
+      "epoch": 0.10264298795945326,
+      "grad_norm": 0.003400850109755993,
+      "learning_rate": 0.001,
+      "loss": 0.3808,
+      "step": 3720
+    },
+    {
+      "epoch": 0.10267058016051762,
+      "grad_norm": 0.004264235496520996,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 3721
+    },
+    {
+      "epoch": 0.102698172361582,
+      "grad_norm": 0.0028461632318794727,
+      "learning_rate": 0.001,
+      "loss": 0.4444,
+      "step": 3722
+    },
+    {
+      "epoch": 0.10272576456264637,
+      "grad_norm": 0.00392636563628912,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 3723
+    },
+    {
+      "epoch": 0.10275335676371074,
+      "grad_norm": 0.002802118891850114,
+      "learning_rate": 0.001,
+      "loss": 0.4593,
+      "step": 3724
+    },
+    {
+      "epoch": 0.1027809489647751,
+      "grad_norm": 0.0052732862532138824,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 3725
+    },
+    {
+      "epoch": 0.10280854116583947,
+      "grad_norm": 0.009514648467302322,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 3726
+    },
+    {
+      "epoch": 0.10283613336690385,
+      "grad_norm": 0.0024788815062493086,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 3727
+    },
+    {
+      "epoch": 0.10286372556796822,
+      "grad_norm": 0.003208071691915393,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 3728
+    },
+    {
+      "epoch": 0.10289131776903258,
+      "grad_norm": 0.002506793709471822,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 3729
+    },
+    {
+      "epoch": 0.10291890997009695,
+      "grad_norm": 0.005780140403658152,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 3730
+    },
+    {
+      "epoch": 0.10294650217116132,
+      "grad_norm": 0.005190226249396801,
+      "learning_rate": 0.001,
+      "loss": 0.4279,
+      "step": 3731
+    },
+    {
+      "epoch": 0.1029740943722257,
+      "grad_norm": 0.005167331546545029,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 3732
+    },
+    {
+      "epoch": 0.10300168657329006,
+      "grad_norm": 0.003800647798925638,
+      "learning_rate": 0.001,
+      "loss": 0.3657,
+      "step": 3733
+    },
+    {
+      "epoch": 0.10302927877435443,
+      "grad_norm": 0.004843884147703648,
+      "learning_rate": 0.001,
+      "loss": 0.4188,
+      "step": 3734
+    },
+    {
+      "epoch": 0.1030568709754188,
+      "grad_norm": 0.0037740804255008698,
+      "learning_rate": 0.001,
+      "loss": 0.3795,
+      "step": 3735
+    },
+    {
+      "epoch": 0.10308446317648316,
+      "grad_norm": 0.004264209885150194,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 3736
+    },
+    {
+      "epoch": 0.10311205537754754,
+      "grad_norm": 0.007586845196783543,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 3737
+    },
+    {
+      "epoch": 0.10313964757861191,
+      "grad_norm": 0.005896701943129301,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 3738
+    },
+    {
+      "epoch": 0.10316723977967628,
+      "grad_norm": 0.004077561665326357,
+      "learning_rate": 0.001,
+      "loss": 0.442,
+      "step": 3739
+    },
+    {
+      "epoch": 0.10319483198074064,
+      "grad_norm": 0.003110091434791684,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 3740
+    },
+    {
+      "epoch": 0.10322242418180501,
+      "grad_norm": 0.0023520744871348143,
+      "learning_rate": 0.001,
+      "loss": 0.428,
+      "step": 3741
+    },
+    {
+      "epoch": 0.10325001638286938,
+      "grad_norm": 0.006090542767196894,
+      "learning_rate": 0.001,
+      "loss": 0.3767,
+      "step": 3742
+    },
+    {
+      "epoch": 0.10327760858393376,
+      "grad_norm": 0.002889704890549183,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 3743
+    },
+    {
+      "epoch": 0.10330520078499812,
+      "grad_norm": 0.0024959116708487272,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 3744
+    },
+    {
+      "epoch": 0.10333279298606249,
+      "grad_norm": 0.002503210911527276,
+      "learning_rate": 0.001,
+      "loss": 0.4306,
+      "step": 3745
+    },
+    {
+      "epoch": 0.10336038518712685,
+      "grad_norm": 0.0028133681043982506,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 3746
+    },
+    {
+      "epoch": 0.10338797738819122,
+      "grad_norm": 0.0032689927611500025,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 3747
+    },
+    {
+      "epoch": 0.1034155695892556,
+      "grad_norm": 0.002301878994330764,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 3748
+    },
+    {
+      "epoch": 0.10344316179031997,
+      "grad_norm": 0.0024407797027379274,
+      "learning_rate": 0.001,
+      "loss": 0.376,
+      "step": 3749
+    },
+    {
+      "epoch": 0.10347075399138433,
+      "grad_norm": 0.0035565400030463934,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 3750
+    },
+    {
+      "epoch": 0.1034983461924487,
+      "grad_norm": 0.002976832212880254,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 3751
+    },
+    {
+      "epoch": 0.10352593839351307,
+      "grad_norm": 0.0027037430554628372,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 3752
+    },
+    {
+      "epoch": 0.10355353059457745,
+      "grad_norm": 0.004547620192170143,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 3753
+    },
+    {
+      "epoch": 0.10358112279564181,
+      "grad_norm": 0.0025993280578404665,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 3754
+    },
+    {
+      "epoch": 0.10360871499670618,
+      "grad_norm": 0.0020117738749831915,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 3755
+    },
+    {
+      "epoch": 0.10363630719777055,
+      "grad_norm": 0.003054060973227024,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 3756
+    },
+    {
+      "epoch": 0.10366389939883491,
+      "grad_norm": 0.0028975980821996927,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 3757
+    },
+    {
+      "epoch": 0.1036914915998993,
+      "grad_norm": 0.004843092989176512,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 3758
+    },
+    {
+      "epoch": 0.10371908380096366,
+      "grad_norm": 0.003735753009095788,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 3759
+    },
+    {
+      "epoch": 0.10374667600202803,
+      "grad_norm": 0.0024528366047888994,
+      "learning_rate": 0.001,
+      "loss": 0.4391,
+      "step": 3760
+    },
+    {
+      "epoch": 0.10377426820309239,
+      "grad_norm": 0.003306907368823886,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 3761
+    },
+    {
+      "epoch": 0.10380186040415676,
+      "grad_norm": 0.0029531391337513924,
+      "learning_rate": 0.001,
+      "loss": 0.3595,
+      "step": 3762
+    },
+    {
+      "epoch": 0.10382945260522114,
+      "grad_norm": 0.004337473772466183,
+      "learning_rate": 0.001,
+      "loss": 0.4374,
+      "step": 3763
+    },
+    {
+      "epoch": 0.1038570448062855,
+      "grad_norm": 0.0033757942728698254,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 3764
+    },
+    {
+      "epoch": 0.10388463700734987,
+      "grad_norm": 0.004451198037713766,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 3765
+    },
+    {
+      "epoch": 0.10391222920841424,
+      "grad_norm": 0.0029722759500145912,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 3766
+    },
+    {
+      "epoch": 0.1039398214094786,
+      "grad_norm": 0.003191061783581972,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 3767
+    },
+    {
+      "epoch": 0.10396741361054299,
+      "grad_norm": 0.0037221303209662437,
+      "learning_rate": 0.001,
+      "loss": 0.3659,
+      "step": 3768
+    },
+    {
+      "epoch": 0.10399500581160735,
+      "grad_norm": 0.0024105177726596594,
+      "learning_rate": 0.001,
+      "loss": 0.421,
+      "step": 3769
+    },
+    {
+      "epoch": 0.10402259801267172,
+      "grad_norm": 0.002493768697604537,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 3770
+    },
+    {
+      "epoch": 0.10405019021373609,
+      "grad_norm": 0.0025503532961010933,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 3771
+    },
+    {
+      "epoch": 0.10407778241480045,
+      "grad_norm": 0.0032149364706128836,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 3772
+    },
+    {
+      "epoch": 0.10410537461586483,
+      "grad_norm": 0.004015072248876095,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 3773
+    },
+    {
+      "epoch": 0.1041329668169292,
+      "grad_norm": 0.008095541037619114,
+      "learning_rate": 0.001,
+      "loss": 0.37,
+      "step": 3774
+    },
+    {
+      "epoch": 0.10416055901799356,
+      "grad_norm": 0.0025133301969617605,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 3775
+    },
+    {
+      "epoch": 0.10418815121905793,
+      "grad_norm": 0.0035531746689230204,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 3776
+    },
+    {
+      "epoch": 0.1042157434201223,
+      "grad_norm": 0.003656880697235465,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 3777
+    },
+    {
+      "epoch": 0.10424333562118668,
+      "grad_norm": 0.005002745892852545,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 3778
+    },
+    {
+      "epoch": 0.10427092782225104,
+      "grad_norm": 0.0031288950704038143,
+      "learning_rate": 0.001,
+      "loss": 0.4222,
+      "step": 3779
+    },
+    {
+      "epoch": 0.10429852002331541,
+      "grad_norm": 0.0030303194653242826,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 3780
+    },
+    {
+      "epoch": 0.10432611222437978,
+      "grad_norm": 0.004291849210858345,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 3781
+    },
+    {
+      "epoch": 0.10435370442544414,
+      "grad_norm": 0.003425056580454111,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 3782
+    },
+    {
+      "epoch": 0.10438129662650852,
+      "grad_norm": 0.0025096056051552296,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 3783
+    },
+    {
+      "epoch": 0.10440888882757289,
+      "grad_norm": 0.002500693080946803,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 3784
+    },
+    {
+      "epoch": 0.10443648102863726,
+      "grad_norm": 0.003590058069676161,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 3785
+    },
+    {
+      "epoch": 0.10446407322970162,
+      "grad_norm": 0.004308347124606371,
+      "learning_rate": 0.001,
+      "loss": 0.4492,
+      "step": 3786
+    },
+    {
+      "epoch": 0.10449166543076599,
+      "grad_norm": 0.002307620132341981,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 3787
+    },
+    {
+      "epoch": 0.10451925763183036,
+      "grad_norm": 0.0036616444122046232,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 3788
+    },
+    {
+      "epoch": 0.10454684983289474,
+      "grad_norm": 0.0035604690201580524,
+      "learning_rate": 0.001,
+      "loss": 0.3597,
+      "step": 3789
+    },
+    {
+      "epoch": 0.1045744420339591,
+      "grad_norm": 0.00397746916860342,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 3790
+    },
+    {
+      "epoch": 0.10460203423502347,
+      "grad_norm": 0.009449174627661705,
+      "learning_rate": 0.001,
+      "loss": 0.3521,
+      "step": 3791
+    },
+    {
+      "epoch": 0.10462962643608784,
+      "grad_norm": 0.0032818394247442484,
+      "learning_rate": 0.001,
+      "loss": 0.3655,
+      "step": 3792
+    },
+    {
+      "epoch": 0.1046572186371522,
+      "grad_norm": 0.0032766228541731834,
+      "learning_rate": 0.001,
+      "loss": 0.4209,
+      "step": 3793
+    },
+    {
+      "epoch": 0.10468481083821658,
+      "grad_norm": 0.002585778711363673,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 3794
+    },
+    {
+      "epoch": 0.10471240303928095,
+      "grad_norm": 0.0029392503201961517,
+      "learning_rate": 0.001,
+      "loss": 0.4474,
+      "step": 3795
+    },
+    {
+      "epoch": 0.10473999524034532,
+      "grad_norm": 0.0027161298785358667,
+      "learning_rate": 0.001,
+      "loss": 0.4161,
+      "step": 3796
+    },
+    {
+      "epoch": 0.10476758744140968,
+      "grad_norm": 0.0034348920453339815,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 3797
+    },
+    {
+      "epoch": 0.10479517964247405,
+      "grad_norm": 0.007731628604233265,
+      "learning_rate": 0.001,
+      "loss": 0.3592,
+      "step": 3798
+    },
+    {
+      "epoch": 0.10482277184353843,
+      "grad_norm": 0.0029055611230432987,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 3799
+    },
+    {
+      "epoch": 0.1048503640446028,
+      "grad_norm": 0.0027976164128631353,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 3800
+    },
+    {
+      "epoch": 0.10487795624566716,
+      "grad_norm": 0.003006401937454939,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 3801
+    },
+    {
+      "epoch": 0.10490554844673153,
+      "grad_norm": 0.002237136010080576,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 3802
+    },
+    {
+      "epoch": 0.1049331406477959,
+      "grad_norm": 0.003247616346925497,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 3803
+    },
+    {
+      "epoch": 0.10496073284886027,
+      "grad_norm": 0.002951403148472309,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 3804
+    },
+    {
+      "epoch": 0.10498832504992464,
+      "grad_norm": 0.002603907370939851,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 3805
+    },
+    {
+      "epoch": 0.10501591725098901,
+      "grad_norm": 0.0022911475971341133,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 3806
+    },
+    {
+      "epoch": 0.10504350945205337,
+      "grad_norm": 0.002795920707285404,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 3807
+    },
+    {
+      "epoch": 0.10507110165311774,
+      "grad_norm": 0.0031260910909622908,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 3808
+    },
+    {
+      "epoch": 0.10509869385418212,
+      "grad_norm": 0.003506281180307269,
+      "learning_rate": 0.001,
+      "loss": 0.3794,
+      "step": 3809
+    },
+    {
+      "epoch": 0.10512628605524649,
+      "grad_norm": 0.0027451482601463795,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 3810
+    },
+    {
+      "epoch": 0.10515387825631085,
+      "grad_norm": 0.0057808831334114075,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 3811
+    },
+    {
+      "epoch": 0.10518147045737522,
+      "grad_norm": 0.003006263403221965,
+      "learning_rate": 0.001,
+      "loss": 0.3698,
+      "step": 3812
+    },
+    {
+      "epoch": 0.10520906265843959,
+      "grad_norm": 0.004194669425487518,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 3813
+    },
+    {
+      "epoch": 0.10523665485950397,
+      "grad_norm": 0.004824482370167971,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 3814
+    },
+    {
+      "epoch": 0.10526424706056833,
+      "grad_norm": 0.0029831102583557367,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 3815
+    },
+    {
+      "epoch": 0.1052918392616327,
+      "grad_norm": 0.004361843690276146,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 3816
+    },
+    {
+      "epoch": 0.10531943146269707,
+      "grad_norm": 0.002336485544219613,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 3817
+    },
+    {
+      "epoch": 0.10534702366376143,
+      "grad_norm": 0.0023848165292292833,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 3818
+    },
+    {
+      "epoch": 0.10537461586482581,
+      "grad_norm": 0.0029371667187660933,
+      "learning_rate": 0.001,
+      "loss": 0.361,
+      "step": 3819
+    },
+    {
+      "epoch": 0.10540220806589018,
+      "grad_norm": 0.003010603366419673,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 3820
+    },
+    {
+      "epoch": 0.10542980026695455,
+      "grad_norm": 0.007170096971094608,
+      "learning_rate": 0.001,
+      "loss": 0.4616,
+      "step": 3821
+    },
+    {
+      "epoch": 0.10545739246801891,
+      "grad_norm": 0.002445077523589134,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 3822
+    },
+    {
+      "epoch": 0.10548498466908328,
+      "grad_norm": 0.0034536407329142094,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 3823
+    },
+    {
+      "epoch": 0.10551257687014766,
+      "grad_norm": 0.0021885402966290712,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 3824
+    },
+    {
+      "epoch": 0.10554016907121203,
+      "grad_norm": 0.0027080499567091465,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 3825
+    },
+    {
+      "epoch": 0.10556776127227639,
+      "grad_norm": 0.0036047815810889006,
+      "learning_rate": 0.001,
+      "loss": 0.3697,
+      "step": 3826
+    },
+    {
+      "epoch": 0.10559535347334076,
+      "grad_norm": 0.0027917807456105947,
+      "learning_rate": 0.001,
+      "loss": 0.3715,
+      "step": 3827
+    },
+    {
+      "epoch": 0.10562294567440512,
+      "grad_norm": 0.0034859776496887207,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 3828
+    },
+    {
+      "epoch": 0.1056505378754695,
+      "grad_norm": 0.0031901709735393524,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 3829
+    },
+    {
+      "epoch": 0.10567813007653387,
+      "grad_norm": 0.004165272694081068,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 3830
+    },
+    {
+      "epoch": 0.10570572227759824,
+      "grad_norm": 0.0031863113399595022,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 3831
+    },
+    {
+      "epoch": 0.1057333144786626,
+      "grad_norm": 0.0035512226168066263,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 3832
+    },
+    {
+      "epoch": 0.10576090667972697,
+      "grad_norm": 0.0030755288898944855,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 3833
+    },
+    {
+      "epoch": 0.10578849888079135,
+      "grad_norm": 0.0031162879895418882,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 3834
+    },
+    {
+      "epoch": 0.10581609108185572,
+      "grad_norm": 0.0038108036387711763,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 3835
+    },
+    {
+      "epoch": 0.10584368328292008,
+      "grad_norm": 0.0033550935331732035,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 3836
+    },
+    {
+      "epoch": 0.10587127548398445,
+      "grad_norm": 0.0031523280777037144,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 3837
+    },
+    {
+      "epoch": 0.10589886768504882,
+      "grad_norm": 0.0038961879909038544,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 3838
+    },
+    {
+      "epoch": 0.10592645988611318,
+      "grad_norm": 0.005496688652783632,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 3839
+    },
+    {
+      "epoch": 0.10595405208717756,
+      "grad_norm": 0.0032198880799114704,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 3840
+    },
+    {
+      "epoch": 0.10598164428824193,
+      "grad_norm": 0.003234037896618247,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 3841
+    },
+    {
+      "epoch": 0.1060092364893063,
+      "grad_norm": 0.002870423486456275,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 3842
+    },
+    {
+      "epoch": 0.10603682869037066,
+      "grad_norm": 0.004519653040915728,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 3843
+    },
+    {
+      "epoch": 0.10606442089143503,
+      "grad_norm": 0.003621830837801099,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 3844
+    },
+    {
+      "epoch": 0.10609201309249941,
+      "grad_norm": 0.0029260909650474787,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 3845
+    },
+    {
+      "epoch": 0.10611960529356378,
+      "grad_norm": 0.0031509913969784975,
+      "learning_rate": 0.001,
+      "loss": 0.3623,
+      "step": 3846
+    },
+    {
+      "epoch": 0.10614719749462814,
+      "grad_norm": 0.006669745780527592,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 3847
+    },
+    {
+      "epoch": 0.10617478969569251,
+      "grad_norm": 0.003406877163797617,
+      "learning_rate": 0.001,
+      "loss": 0.4188,
+      "step": 3848
+    },
+    {
+      "epoch": 0.10620238189675688,
+      "grad_norm": 0.008724176324903965,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 3849
+    },
+    {
+      "epoch": 0.10622997409782126,
+      "grad_norm": 0.0026642687153071165,
+      "learning_rate": 0.001,
+      "loss": 0.4441,
+      "step": 3850
+    },
+    {
+      "epoch": 0.10625756629888562,
+      "grad_norm": 0.003902031574398279,
+      "learning_rate": 0.001,
+      "loss": 0.4033,
+      "step": 3851
+    },
+    {
+      "epoch": 0.10628515849994999,
+      "grad_norm": 0.0034857727587223053,
+      "learning_rate": 0.001,
+      "loss": 0.3652,
+      "step": 3852
+    },
+    {
+      "epoch": 0.10631275070101436,
+      "grad_norm": 0.0022453153505921364,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 3853
+    },
+    {
+      "epoch": 0.10634034290207872,
+      "grad_norm": 0.002694975584745407,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 3854
+    },
+    {
+      "epoch": 0.1063679351031431,
+      "grad_norm": 0.005093062296509743,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 3855
+    },
+    {
+      "epoch": 0.10639552730420747,
+      "grad_norm": 0.004576206207275391,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 3856
+    },
+    {
+      "epoch": 0.10642311950527183,
+      "grad_norm": 0.0031380197033286095,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 3857
+    },
+    {
+      "epoch": 0.1064507117063362,
+      "grad_norm": 0.003493053140118718,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 3858
+    },
+    {
+      "epoch": 0.10647830390740057,
+      "grad_norm": 0.0024905947502702475,
+      "learning_rate": 0.001,
+      "loss": 0.4554,
+      "step": 3859
+    },
+    {
+      "epoch": 0.10650589610846495,
+      "grad_norm": 0.00544704170897603,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 3860
+    },
+    {
+      "epoch": 0.10653348830952931,
+      "grad_norm": 0.0029771511908620596,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 3861
+    },
+    {
+      "epoch": 0.10656108051059368,
+      "grad_norm": 0.0027052282821387053,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 3862
+    },
+    {
+      "epoch": 0.10658867271165805,
+      "grad_norm": 0.003082839772105217,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 3863
+    },
+    {
+      "epoch": 0.10661626491272241,
+      "grad_norm": 0.003736154641956091,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 3864
+    },
+    {
+      "epoch": 0.1066438571137868,
+      "grad_norm": 0.0027385384310036898,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 3865
+    },
+    {
+      "epoch": 0.10667144931485116,
+      "grad_norm": 0.002778218826279044,
+      "learning_rate": 0.001,
+      "loss": 0.4041,
+      "step": 3866
+    },
+    {
+      "epoch": 0.10669904151591553,
+      "grad_norm": 0.01646745204925537,
+      "learning_rate": 0.001,
+      "loss": 0.3694,
+      "step": 3867
+    },
+    {
+      "epoch": 0.1067266337169799,
+      "grad_norm": 0.0036807360593229532,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 3868
+    },
+    {
+      "epoch": 0.10675422591804426,
+      "grad_norm": 0.006385852582752705,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 3869
+    },
+    {
+      "epoch": 0.10678181811910864,
+      "grad_norm": 0.0027478185947984457,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 3870
+    },
+    {
+      "epoch": 0.106809410320173,
+      "grad_norm": 0.00484267994761467,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 3871
+    },
+    {
+      "epoch": 0.10683700252123737,
+      "grad_norm": 0.0029064714908599854,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 3872
+    },
+    {
+      "epoch": 0.10686459472230174,
+      "grad_norm": 0.004293619655072689,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 3873
+    },
+    {
+      "epoch": 0.1068921869233661,
+      "grad_norm": 0.009066743776202202,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 3874
+    },
+    {
+      "epoch": 0.10691977912443049,
+      "grad_norm": 0.004455687943845987,
+      "learning_rate": 0.001,
+      "loss": 0.3816,
+      "step": 3875
+    },
+    {
+      "epoch": 0.10694737132549485,
+      "grad_norm": 0.008621391840279102,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 3876
+    },
+    {
+      "epoch": 0.10697496352655922,
+      "grad_norm": 0.0032543812412768602,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 3877
+    },
+    {
+      "epoch": 0.10700255572762359,
+      "grad_norm": 0.003056267276406288,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 3878
+    },
+    {
+      "epoch": 0.10703014792868795,
+      "grad_norm": 0.003457016311585903,
+      "learning_rate": 0.001,
+      "loss": 0.3706,
+      "step": 3879
+    },
+    {
+      "epoch": 0.10705774012975233,
+      "grad_norm": 0.0029117148369550705,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 3880
+    },
+    {
+      "epoch": 0.1070853323308167,
+      "grad_norm": 0.003112402046099305,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 3881
+    },
+    {
+      "epoch": 0.10711292453188107,
+      "grad_norm": 0.003459386760368943,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 3882
+    },
+    {
+      "epoch": 0.10714051673294543,
+      "grad_norm": 0.007629405707120895,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 3883
+    },
+    {
+      "epoch": 0.1071681089340098,
+      "grad_norm": 0.007017344702035189,
+      "learning_rate": 0.001,
+      "loss": 0.3503,
+      "step": 3884
+    },
+    {
+      "epoch": 0.10719570113507416,
+      "grad_norm": 0.006143512669950724,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 3885
+    },
+    {
+      "epoch": 0.10722329333613854,
+      "grad_norm": 0.007819131016731262,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 3886
+    },
+    {
+      "epoch": 0.10725088553720291,
+      "grad_norm": 0.0033061886206269264,
+      "learning_rate": 0.001,
+      "loss": 0.44,
+      "step": 3887
+    },
+    {
+      "epoch": 0.10727847773826728,
+      "grad_norm": 0.0026390962302684784,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 3888
+    },
+    {
+      "epoch": 0.10730606993933164,
+      "grad_norm": 0.003349416656419635,
+      "learning_rate": 0.001,
+      "loss": 0.4298,
+      "step": 3889
+    },
+    {
+      "epoch": 0.10733366214039601,
+      "grad_norm": 0.004052475094795227,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 3890
+    },
+    {
+      "epoch": 0.10736125434146039,
+      "grad_norm": 0.008710183203220367,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 3891
+    },
+    {
+      "epoch": 0.10738884654252476,
+      "grad_norm": 0.0028958211187273264,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 3892
+    },
+    {
+      "epoch": 0.10741643874358912,
+      "grad_norm": 0.002961238846182823,
+      "learning_rate": 0.001,
+      "loss": 0.3816,
+      "step": 3893
+    },
+    {
+      "epoch": 0.10744403094465349,
+      "grad_norm": 0.0025948896072804928,
+      "learning_rate": 0.001,
+      "loss": 0.4133,
+      "step": 3894
+    },
+    {
+      "epoch": 0.10747162314571786,
+      "grad_norm": 0.0037905005738139153,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 3895
+    },
+    {
+      "epoch": 0.10749921534678224,
+      "grad_norm": 0.003213467774912715,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 3896
+    },
+    {
+      "epoch": 0.1075268075478466,
+      "grad_norm": 0.002972138812765479,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 3897
+    },
+    {
+      "epoch": 0.10755439974891097,
+      "grad_norm": 0.0030399637762457132,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 3898
+    },
+    {
+      "epoch": 0.10758199194997534,
+      "grad_norm": 0.002291029319167137,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 3899
+    },
+    {
+      "epoch": 0.1076095841510397,
+      "grad_norm": 0.002236071042716503,
+      "learning_rate": 0.001,
+      "loss": 0.4434,
+      "step": 3900
+    },
+    {
+      "epoch": 0.10763717635210408,
+      "grad_norm": 0.0024943517055362463,
+      "learning_rate": 0.001,
+      "loss": 0.3686,
+      "step": 3901
+    },
+    {
+      "epoch": 0.10766476855316845,
+      "grad_norm": 0.002602664055302739,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 3902
+    },
+    {
+      "epoch": 0.10769236075423282,
+      "grad_norm": 0.0031983822118490934,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 3903
+    },
+    {
+      "epoch": 0.10771995295529718,
+      "grad_norm": 0.0027117652352899313,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 3904
+    },
+    {
+      "epoch": 0.10774754515636155,
+      "grad_norm": 0.002372644143179059,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 3905
+    },
+    {
+      "epoch": 0.10777513735742593,
+      "grad_norm": 0.0038946103304624557,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 3906
+    },
+    {
+      "epoch": 0.1078027295584903,
+      "grad_norm": 0.00335517106577754,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 3907
+    },
+    {
+      "epoch": 0.10783032175955466,
+      "grad_norm": 0.015401921235024929,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 3908
+    },
+    {
+      "epoch": 0.10785791396061903,
+      "grad_norm": 0.003028671722859144,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 3909
+    },
+    {
+      "epoch": 0.1078855061616834,
+      "grad_norm": 0.003672859398648143,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 3910
+    },
+    {
+      "epoch": 0.10791309836274778,
+      "grad_norm": 0.0038496828638017178,
+      "learning_rate": 0.001,
+      "loss": 0.429,
+      "step": 3911
+    },
+    {
+      "epoch": 0.10794069056381214,
+      "grad_norm": 0.0068849422968924046,
+      "learning_rate": 0.001,
+      "loss": 0.4226,
+      "step": 3912
+    },
+    {
+      "epoch": 0.10796828276487651,
+      "grad_norm": 0.0074515510350465775,
+      "learning_rate": 0.001,
+      "loss": 0.3765,
+      "step": 3913
+    },
+    {
+      "epoch": 0.10799587496594087,
+      "grad_norm": 0.0070841130800545216,
+      "learning_rate": 0.001,
+      "loss": 0.4285,
+      "step": 3914
+    },
+    {
+      "epoch": 0.10802346716700524,
+      "grad_norm": 0.0054721771739423275,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 3915
+    },
+    {
+      "epoch": 0.10805105936806962,
+      "grad_norm": 0.004237642977386713,
+      "learning_rate": 0.001,
+      "loss": 0.4349,
+      "step": 3916
+    },
+    {
+      "epoch": 0.10807865156913399,
+      "grad_norm": 0.004252060316503048,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 3917
+    },
+    {
+      "epoch": 0.10810624377019835,
+      "grad_norm": 0.0034951562993228436,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 3918
+    },
+    {
+      "epoch": 0.10813383597126272,
+      "grad_norm": 0.004239407833665609,
+      "learning_rate": 0.001,
+      "loss": 0.4368,
+      "step": 3919
+    },
+    {
+      "epoch": 0.10816142817232709,
+      "grad_norm": 0.0040299855172634125,
+      "learning_rate": 0.001,
+      "loss": 0.36,
+      "step": 3920
+    },
+    {
+      "epoch": 0.10818902037339147,
+      "grad_norm": 0.0028297097887843847,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 3921
+    },
+    {
+      "epoch": 0.10821661257445583,
+      "grad_norm": 0.0039049547631293535,
+      "learning_rate": 0.001,
+      "loss": 0.4299,
+      "step": 3922
+    },
+    {
+      "epoch": 0.1082442047755202,
+      "grad_norm": 0.0031370699871331453,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 3923
+    },
+    {
+      "epoch": 0.10827179697658457,
+      "grad_norm": 0.004705764818936586,
+      "learning_rate": 0.001,
+      "loss": 0.3727,
+      "step": 3924
+    },
+    {
+      "epoch": 0.10829938917764893,
+      "grad_norm": 0.0024270943831652403,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 3925
+    },
+    {
+      "epoch": 0.10832698137871331,
+      "grad_norm": 0.003461951157078147,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 3926
+    },
+    {
+      "epoch": 0.10835457357977768,
+      "grad_norm": 0.008057190105319023,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 3927
+    },
+    {
+      "epoch": 0.10838216578084205,
+      "grad_norm": 0.0031221345998346806,
+      "learning_rate": 0.001,
+      "loss": 0.3631,
+      "step": 3928
+    },
+    {
+      "epoch": 0.10840975798190641,
+      "grad_norm": 0.0029621014837175608,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 3929
+    },
+    {
+      "epoch": 0.10843735018297078,
+      "grad_norm": 0.002558749634772539,
+      "learning_rate": 0.001,
+      "loss": 0.3698,
+      "step": 3930
+    },
+    {
+      "epoch": 0.10846494238403515,
+      "grad_norm": 0.002620436018332839,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 3931
+    },
+    {
+      "epoch": 0.10849253458509953,
+      "grad_norm": 0.0027069852221757174,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 3932
+    },
+    {
+      "epoch": 0.10852012678616389,
+      "grad_norm": 0.003972397185862064,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 3933
+    },
+    {
+      "epoch": 0.10854771898722826,
+      "grad_norm": 0.0021818478126078844,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 3934
+    },
+    {
+      "epoch": 0.10857531118829263,
+      "grad_norm": 0.0026342314667999744,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 3935
+    },
+    {
+      "epoch": 0.10860290338935699,
+      "grad_norm": 0.002321448177099228,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 3936
+    },
+    {
+      "epoch": 0.10863049559042137,
+      "grad_norm": 0.002941095968708396,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 3937
+    },
+    {
+      "epoch": 0.10865808779148574,
+      "grad_norm": 0.0027615423314273357,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 3938
+    },
+    {
+      "epoch": 0.1086856799925501,
+      "grad_norm": 0.0035208980552852154,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 3939
+    },
+    {
+      "epoch": 0.10871327219361447,
+      "grad_norm": 0.0035143953282386065,
+      "learning_rate": 0.001,
+      "loss": 0.3768,
+      "step": 3940
+    },
+    {
+      "epoch": 0.10874086439467884,
+      "grad_norm": 0.003013983368873596,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 3941
+    },
+    {
+      "epoch": 0.10876845659574322,
+      "grad_norm": 0.003047554986551404,
+      "learning_rate": 0.001,
+      "loss": 0.4309,
+      "step": 3942
+    },
+    {
+      "epoch": 0.10879604879680758,
+      "grad_norm": 0.0038127705920487642,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 3943
+    },
+    {
+      "epoch": 0.10882364099787195,
+      "grad_norm": 0.003960581962019205,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 3944
+    },
+    {
+      "epoch": 0.10885123319893632,
+      "grad_norm": 0.0032766389194875956,
+      "learning_rate": 0.001,
+      "loss": 0.4133,
+      "step": 3945
+    },
+    {
+      "epoch": 0.10887882540000068,
+      "grad_norm": 0.005258220247924328,
+      "learning_rate": 0.001,
+      "loss": 0.3556,
+      "step": 3946
+    },
+    {
+      "epoch": 0.10890641760106506,
+      "grad_norm": 0.0038592154160141945,
+      "learning_rate": 0.001,
+      "loss": 0.3538,
+      "step": 3947
+    },
+    {
+      "epoch": 0.10893400980212943,
+      "grad_norm": 0.004304266069084406,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 3948
+    },
+    {
+      "epoch": 0.1089616020031938,
+      "grad_norm": 0.0032214997336268425,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 3949
+    },
+    {
+      "epoch": 0.10898919420425816,
+      "grad_norm": 0.0059113227762281895,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 3950
+    },
+    {
+      "epoch": 0.10901678640532253,
+      "grad_norm": 0.0029449171852320433,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 3951
+    },
+    {
+      "epoch": 0.10904437860638691,
+      "grad_norm": 0.0030160732567310333,
+      "learning_rate": 0.001,
+      "loss": 0.3834,
+      "step": 3952
+    },
+    {
+      "epoch": 0.10907197080745128,
+      "grad_norm": 0.012647976167500019,
+      "learning_rate": 0.001,
+      "loss": 0.3665,
+      "step": 3953
+    },
+    {
+      "epoch": 0.10909956300851564,
+      "grad_norm": 0.0026820586062967777,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 3954
+    },
+    {
+      "epoch": 0.10912715520958001,
+      "grad_norm": 0.006842675618827343,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 3955
+    },
+    {
+      "epoch": 0.10915474741064438,
+      "grad_norm": 0.0030738625209778547,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 3956
+    },
+    {
+      "epoch": 0.10918233961170876,
+      "grad_norm": 0.0023346368689090014,
+      "learning_rate": 0.001,
+      "loss": 0.3768,
+      "step": 3957
+    },
+    {
+      "epoch": 0.10920993181277312,
+      "grad_norm": 0.005200542975217104,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 3958
+    },
+    {
+      "epoch": 0.10923752401383749,
+      "grad_norm": 0.0057869781740009785,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 3959
+    },
+    {
+      "epoch": 0.10926511621490186,
+      "grad_norm": 0.0025842657778412104,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 3960
+    },
+    {
+      "epoch": 0.10929270841596622,
+      "grad_norm": 0.002922437386587262,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 3961
+    },
+    {
+      "epoch": 0.1093203006170306,
+      "grad_norm": 0.0038535622879862785,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 3962
+    },
+    {
+      "epoch": 0.10934789281809497,
+      "grad_norm": 0.0034645821433514357,
+      "learning_rate": 0.001,
+      "loss": 0.364,
+      "step": 3963
+    },
+    {
+      "epoch": 0.10937548501915934,
+      "grad_norm": 0.0022498066537082195,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 3964
+    },
+    {
+      "epoch": 0.1094030772202237,
+      "grad_norm": 0.0035552133340388536,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 3965
+    },
+    {
+      "epoch": 0.10943066942128807,
+      "grad_norm": 0.0029964253772050142,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 3966
+    },
+    {
+      "epoch": 0.10945826162235245,
+      "grad_norm": 0.002939821919426322,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 3967
+    },
+    {
+      "epoch": 0.10948585382341681,
+      "grad_norm": 0.0021805204451084137,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 3968
+    },
+    {
+      "epoch": 0.10951344602448118,
+      "grad_norm": 0.010238613933324814,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 3969
+    },
+    {
+      "epoch": 0.10954103822554555,
+      "grad_norm": 0.00320064858533442,
+      "learning_rate": 0.001,
+      "loss": 0.4245,
+      "step": 3970
+    },
+    {
+      "epoch": 0.10956863042660991,
+      "grad_norm": 0.0038515296764671803,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 3971
+    },
+    {
+      "epoch": 0.1095962226276743,
+      "grad_norm": 0.0033414731733500957,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 3972
+    },
+    {
+      "epoch": 0.10962381482873866,
+      "grad_norm": 0.002315077930688858,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 3973
+    },
+    {
+      "epoch": 0.10965140702980303,
+      "grad_norm": 0.0025413348339498043,
+      "learning_rate": 0.001,
+      "loss": 0.4289,
+      "step": 3974
+    },
+    {
+      "epoch": 0.1096789992308674,
+      "grad_norm": 0.010324854403734207,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 3975
+    },
+    {
+      "epoch": 0.10970659143193176,
+      "grad_norm": 0.009162775240838528,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 3976
+    },
+    {
+      "epoch": 0.10973418363299613,
+      "grad_norm": 0.00687329052016139,
+      "learning_rate": 0.001,
+      "loss": 0.4187,
+      "step": 3977
+    },
+    {
+      "epoch": 0.10976177583406051,
+      "grad_norm": 0.009686012752354145,
+      "learning_rate": 0.001,
+      "loss": 0.3732,
+      "step": 3978
+    },
+    {
+      "epoch": 0.10978936803512487,
+      "grad_norm": 0.003329911269247532,
+      "learning_rate": 0.001,
+      "loss": 0.3573,
+      "step": 3979
+    },
+    {
+      "epoch": 0.10981696023618924,
+      "grad_norm": 0.003336430061608553,
+      "learning_rate": 0.001,
+      "loss": 0.4589,
+      "step": 3980
+    },
+    {
+      "epoch": 0.1098445524372536,
+      "grad_norm": 0.0021596092265099287,
+      "learning_rate": 0.001,
+      "loss": 0.4357,
+      "step": 3981
+    },
+    {
+      "epoch": 0.10987214463831797,
+      "grad_norm": 0.003178425133228302,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 3982
+    },
+    {
+      "epoch": 0.10989973683938235,
+      "grad_norm": 0.0019768651109188795,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 3983
+    },
+    {
+      "epoch": 0.10992732904044672,
+      "grad_norm": 0.0026141603011637926,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 3984
+    },
+    {
+      "epoch": 0.10995492124151109,
+      "grad_norm": 0.00246097962372005,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 3985
+    },
+    {
+      "epoch": 0.10998251344257545,
+      "grad_norm": 0.002583438763394952,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 3986
+    },
+    {
+      "epoch": 0.11001010564363982,
+      "grad_norm": 0.003509828122332692,
+      "learning_rate": 0.001,
+      "loss": 0.437,
+      "step": 3987
+    },
+    {
+      "epoch": 0.1100376978447042,
+      "grad_norm": 0.002548534655943513,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 3988
+    },
+    {
+      "epoch": 0.11006529004576857,
+      "grad_norm": 0.002522020833566785,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 3989
+    },
+    {
+      "epoch": 0.11009288224683293,
+      "grad_norm": 0.0027163205668330193,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 3990
+    },
+    {
+      "epoch": 0.1101204744478973,
+      "grad_norm": 0.0020943363197147846,
+      "learning_rate": 0.001,
+      "loss": 0.3713,
+      "step": 3991
+    },
+    {
+      "epoch": 0.11014806664896166,
+      "grad_norm": 0.002914323704317212,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 3992
+    },
+    {
+      "epoch": 0.11017565885002605,
+      "grad_norm": 0.0023269762750715017,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 3993
+    },
+    {
+      "epoch": 0.11020325105109041,
+      "grad_norm": 0.002514853375032544,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 3994
+    },
+    {
+      "epoch": 0.11023084325215478,
+      "grad_norm": 0.002852360252290964,
+      "learning_rate": 0.001,
+      "loss": 0.3768,
+      "step": 3995
+    },
+    {
+      "epoch": 0.11025843545321914,
+      "grad_norm": 0.00278305122628808,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 3996
+    },
+    {
+      "epoch": 0.11028602765428351,
+      "grad_norm": 0.0031666734721511602,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 3997
+    },
+    {
+      "epoch": 0.11031361985534789,
+      "grad_norm": 0.006654566153883934,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 3998
+    },
+    {
+      "epoch": 0.11034121205641226,
+      "grad_norm": 0.002697068266570568,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 3999
+    },
+    {
+      "epoch": 0.11036880425747662,
+      "grad_norm": 0.003131921635940671,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 4000
+    },
+    {
+      "epoch": 0.11036880425747662,
+      "eval_runtime": 24.9249,
+      "eval_samples_per_second": 1.284,
+      "eval_steps_per_second": 0.16,
+      "step": 4000
+    },
+    {
+      "epoch": 0.11039639645854099,
+      "grad_norm": 0.009388426318764687,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 4001
+    },
+    {
+      "epoch": 0.11042398865960536,
+      "grad_norm": 0.002704363316297531,
+      "learning_rate": 0.001,
+      "loss": 0.3666,
+      "step": 4002
+    },
+    {
+      "epoch": 0.11045158086066974,
+      "grad_norm": 0.0023160416167229414,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 4003
+    },
+    {
+      "epoch": 0.1104791730617341,
+      "grad_norm": 0.0025515344459563494,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 4004
+    },
+    {
+      "epoch": 0.11050676526279847,
+      "grad_norm": 0.0036002611741423607,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 4005
+    },
+    {
+      "epoch": 0.11053435746386284,
+      "grad_norm": 0.003040984272956848,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 4006
+    },
+    {
+      "epoch": 0.1105619496649272,
+      "grad_norm": 0.00276200077496469,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 4007
+    },
+    {
+      "epoch": 0.11058954186599158,
+      "grad_norm": 0.0028444197960197926,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 4008
+    },
+    {
+      "epoch": 0.11061713406705595,
+      "grad_norm": 0.0025242117699235678,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 4009
+    },
+    {
+      "epoch": 0.11064472626812032,
+      "grad_norm": 0.004357707686722279,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 4010
+    },
+    {
+      "epoch": 0.11067231846918468,
+      "grad_norm": 0.01803600788116455,
+      "learning_rate": 0.001,
+      "loss": 0.4133,
+      "step": 4011
+    },
+    {
+      "epoch": 0.11069991067024905,
+      "grad_norm": 0.0028698742389678955,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 4012
+    },
+    {
+      "epoch": 0.11072750287131343,
+      "grad_norm": 0.0033235198352485895,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 4013
+    },
+    {
+      "epoch": 0.1107550950723778,
+      "grad_norm": 0.0038872750010341406,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 4014
+    },
+    {
+      "epoch": 0.11078268727344216,
+      "grad_norm": 0.002596453996375203,
+      "learning_rate": 0.001,
+      "loss": 0.4487,
+      "step": 4015
+    },
+    {
+      "epoch": 0.11081027947450653,
+      "grad_norm": 0.004450778476893902,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 4016
+    },
+    {
+      "epoch": 0.1108378716755709,
+      "grad_norm": 0.0031362990848720074,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 4017
+    },
+    {
+      "epoch": 0.11086546387663528,
+      "grad_norm": 0.0024727729614824057,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 4018
+    },
+    {
+      "epoch": 0.11089305607769964,
+      "grad_norm": 0.003262293990701437,
+      "learning_rate": 0.001,
+      "loss": 0.3506,
+      "step": 4019
+    },
+    {
+      "epoch": 0.11092064827876401,
+      "grad_norm": 0.002469596453011036,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 4020
+    },
+    {
+      "epoch": 0.11094824047982838,
+      "grad_norm": 0.0029696666169911623,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 4021
+    },
+    {
+      "epoch": 0.11097583268089274,
+      "grad_norm": 0.0031070455443114042,
+      "learning_rate": 0.001,
+      "loss": 0.3808,
+      "step": 4022
+    },
+    {
+      "epoch": 0.11100342488195711,
+      "grad_norm": 0.002378135221078992,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 4023
+    },
+    {
+      "epoch": 0.11103101708302149,
+      "grad_norm": 0.0020418402273207903,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 4024
+    },
+    {
+      "epoch": 0.11105860928408585,
+      "grad_norm": 0.00241975043900311,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 4025
+    },
+    {
+      "epoch": 0.11108620148515022,
+      "grad_norm": 0.0036054837983101606,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 4026
+    },
+    {
+      "epoch": 0.11111379368621459,
+      "grad_norm": 0.002273577032610774,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 4027
+    },
+    {
+      "epoch": 0.11114138588727895,
+      "grad_norm": 0.008908730000257492,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 4028
+    },
+    {
+      "epoch": 0.11116897808834333,
+      "grad_norm": 0.0027021903079003096,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 4029
+    },
+    {
+      "epoch": 0.1111965702894077,
+      "grad_norm": 0.002809008816257119,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 4030
+    },
+    {
+      "epoch": 0.11122416249047207,
+      "grad_norm": 0.004108759108930826,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 4031
+    },
+    {
+      "epoch": 0.11125175469153643,
+      "grad_norm": 0.004110720008611679,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 4032
+    },
+    {
+      "epoch": 0.1112793468926008,
+      "grad_norm": 0.003450944786891341,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 4033
+    },
+    {
+      "epoch": 0.11130693909366518,
+      "grad_norm": 0.0027327281422913074,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 4034
+    },
+    {
+      "epoch": 0.11133453129472955,
+      "grad_norm": 0.003815416479483247,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 4035
+    },
+    {
+      "epoch": 0.11136212349579391,
+      "grad_norm": 0.0020829702261835337,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 4036
+    },
+    {
+      "epoch": 0.11138971569685828,
+      "grad_norm": 0.0024780267849564552,
+      "learning_rate": 0.001,
+      "loss": 0.4031,
+      "step": 4037
+    },
+    {
+      "epoch": 0.11141730789792265,
+      "grad_norm": 0.004010303877294064,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 4038
+    },
+    {
+      "epoch": 0.11144490009898703,
+      "grad_norm": 0.0030929851345717907,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 4039
+    },
+    {
+      "epoch": 0.11147249230005139,
+      "grad_norm": 0.0041397190652787685,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 4040
+    },
+    {
+      "epoch": 0.11150008450111576,
+      "grad_norm": 0.0039605977945029736,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 4041
+    },
+    {
+      "epoch": 0.11152767670218013,
+      "grad_norm": 0.003413001075387001,
+      "learning_rate": 0.001,
+      "loss": 0.3726,
+      "step": 4042
+    },
+    {
+      "epoch": 0.11155526890324449,
+      "grad_norm": 0.01140381395816803,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 4043
+    },
+    {
+      "epoch": 0.11158286110430887,
+      "grad_norm": 0.005510971415787935,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 4044
+    },
+    {
+      "epoch": 0.11161045330537324,
+      "grad_norm": 0.007506036199629307,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 4045
+    },
+    {
+      "epoch": 0.1116380455064376,
+      "grad_norm": 0.007963884621858597,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 4046
+    },
+    {
+      "epoch": 0.11166563770750197,
+      "grad_norm": 0.007050946820527315,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 4047
+    },
+    {
+      "epoch": 0.11169322990856634,
+      "grad_norm": 0.00468503637239337,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 4048
+    },
+    {
+      "epoch": 0.11172082210963072,
+      "grad_norm": 0.003186695510521531,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 4049
+    },
+    {
+      "epoch": 0.11174841431069509,
+      "grad_norm": 0.03650260344147682,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 4050
+    },
+    {
+      "epoch": 0.11177600651175945,
+      "grad_norm": 0.006556427571922541,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 4051
+    },
+    {
+      "epoch": 0.11180359871282382,
+      "grad_norm": 0.003500849474221468,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 4052
+    },
+    {
+      "epoch": 0.11183119091388818,
+      "grad_norm": 0.005811590701341629,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 4053
+    },
+    {
+      "epoch": 0.11185878311495256,
+      "grad_norm": 0.004504525102674961,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 4054
+    },
+    {
+      "epoch": 0.11188637531601693,
+      "grad_norm": 0.00391266867518425,
+      "learning_rate": 0.001,
+      "loss": 0.3716,
+      "step": 4055
+    },
+    {
+      "epoch": 0.1119139675170813,
+      "grad_norm": 0.002217537024989724,
+      "learning_rate": 0.001,
+      "loss": 0.4385,
+      "step": 4056
+    },
+    {
+      "epoch": 0.11194155971814566,
+      "grad_norm": 0.0025836548302322626,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 4057
+    },
+    {
+      "epoch": 0.11196915191921003,
+      "grad_norm": 0.0025196904316544533,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 4058
+    },
+    {
+      "epoch": 0.11199674412027441,
+      "grad_norm": 0.0020293437410146,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 4059
+    },
+    {
+      "epoch": 0.11202433632133878,
+      "grad_norm": 0.0022750168573111296,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 4060
+    },
+    {
+      "epoch": 0.11205192852240314,
+      "grad_norm": 0.0026435197796672583,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 4061
+    },
+    {
+      "epoch": 0.11207952072346751,
+      "grad_norm": 0.0026955234352499247,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 4062
+    },
+    {
+      "epoch": 0.11210711292453188,
+      "grad_norm": 0.0022352158557623625,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 4063
+    },
+    {
+      "epoch": 0.11213470512559626,
+      "grad_norm": 0.00261326739564538,
+      "learning_rate": 0.001,
+      "loss": 0.431,
+      "step": 4064
+    },
+    {
+      "epoch": 0.11216229732666062,
+      "grad_norm": 0.0022083420772105455,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 4065
+    },
+    {
+      "epoch": 0.11218988952772499,
+      "grad_norm": 0.002633447526022792,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 4066
+    },
+    {
+      "epoch": 0.11221748172878936,
+      "grad_norm": 0.00373605964705348,
+      "learning_rate": 0.001,
+      "loss": 0.4227,
+      "step": 4067
+    },
+    {
+      "epoch": 0.11224507392985372,
+      "grad_norm": 0.00297834281809628,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 4068
+    },
+    {
+      "epoch": 0.1122726661309181,
+      "grad_norm": 0.015468292869627476,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 4069
+    },
+    {
+      "epoch": 0.11230025833198247,
+      "grad_norm": 0.004757543094456196,
+      "learning_rate": 0.001,
+      "loss": 0.4245,
+      "step": 4070
+    },
+    {
+      "epoch": 0.11232785053304684,
+      "grad_norm": 0.004110514651983976,
+      "learning_rate": 0.001,
+      "loss": 0.4556,
+      "step": 4071
+    },
+    {
+      "epoch": 0.1123554427341112,
+      "grad_norm": 0.0029956744983792305,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 4072
+    },
+    {
+      "epoch": 0.11238303493517557,
+      "grad_norm": 0.006121024955064058,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 4073
+    },
+    {
+      "epoch": 0.11241062713623994,
+      "grad_norm": 0.011045140214264393,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 4074
+    },
+    {
+      "epoch": 0.11243821933730432,
+      "grad_norm": 0.002652381081134081,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 4075
+    },
+    {
+      "epoch": 0.11246581153836868,
+      "grad_norm": 0.005784259643405676,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 4076
+    },
+    {
+      "epoch": 0.11249340373943305,
+      "grad_norm": 0.004484557081013918,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 4077
+    },
+    {
+      "epoch": 0.11252099594049741,
+      "grad_norm": 0.004235697444528341,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 4078
+    },
+    {
+      "epoch": 0.11254858814156178,
+      "grad_norm": 0.002368953777477145,
+      "learning_rate": 0.001,
+      "loss": 0.4364,
+      "step": 4079
+    },
+    {
+      "epoch": 0.11257618034262616,
+      "grad_norm": 0.002926155459135771,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 4080
+    },
+    {
+      "epoch": 0.11260377254369053,
+      "grad_norm": 0.0026046691928058863,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 4081
+    },
+    {
+      "epoch": 0.1126313647447549,
+      "grad_norm": 0.0025377613492310047,
+      "learning_rate": 0.001,
+      "loss": 0.3658,
+      "step": 4082
+    },
+    {
+      "epoch": 0.11265895694581926,
+      "grad_norm": 0.0027987242210656404,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 4083
+    },
+    {
+      "epoch": 0.11268654914688363,
+      "grad_norm": 0.002613126765936613,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 4084
+    },
+    {
+      "epoch": 0.11271414134794801,
+      "grad_norm": 0.002449605381116271,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 4085
+    },
+    {
+      "epoch": 0.11274173354901237,
+      "grad_norm": 0.0033523484598845243,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 4086
+    },
+    {
+      "epoch": 0.11276932575007674,
+      "grad_norm": 0.003658158238977194,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 4087
+    },
+    {
+      "epoch": 0.1127969179511411,
+      "grad_norm": 0.0031146903056651354,
+      "learning_rate": 0.001,
+      "loss": 0.4306,
+      "step": 4088
+    },
+    {
+      "epoch": 0.11282451015220547,
+      "grad_norm": 0.004147498402744532,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 4089
+    },
+    {
+      "epoch": 0.11285210235326985,
+      "grad_norm": 0.0027466421015560627,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 4090
+    },
+    {
+      "epoch": 0.11287969455433422,
+      "grad_norm": 0.00301600550301373,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 4091
+    },
+    {
+      "epoch": 0.11290728675539859,
+      "grad_norm": 0.002960977843031287,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 4092
+    },
+    {
+      "epoch": 0.11293487895646295,
+      "grad_norm": 0.0030630468390882015,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 4093
+    },
+    {
+      "epoch": 0.11296247115752732,
+      "grad_norm": 0.002365349093452096,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 4094
+    },
+    {
+      "epoch": 0.1129900633585917,
+      "grad_norm": 0.005711345002055168,
+      "learning_rate": 0.001,
+      "loss": 0.3701,
+      "step": 4095
+    },
+    {
+      "epoch": 0.11301765555965607,
+      "grad_norm": 0.003600204363465309,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 4096
+    },
+    {
+      "epoch": 0.11304524776072043,
+      "grad_norm": 0.003578650299459696,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 4097
+    },
+    {
+      "epoch": 0.1130728399617848,
+      "grad_norm": 0.003018326824530959,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 4098
+    },
+    {
+      "epoch": 0.11310043216284917,
+      "grad_norm": 0.0023286626674234867,
+      "learning_rate": 0.001,
+      "loss": 0.4477,
+      "step": 4099
+    },
+    {
+      "epoch": 0.11312802436391355,
+      "grad_norm": 0.0055235689505934715,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 4100
+    },
+    {
+      "epoch": 0.11315561656497791,
+      "grad_norm": 0.003630418796092272,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 4101
+    },
+    {
+      "epoch": 0.11318320876604228,
+      "grad_norm": 0.0026847817935049534,
+      "learning_rate": 0.001,
+      "loss": 0.3564,
+      "step": 4102
+    },
+    {
+      "epoch": 0.11321080096710665,
+      "grad_norm": 0.0022519559133797884,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 4103
+    },
+    {
+      "epoch": 0.11323839316817101,
+      "grad_norm": 0.002188954036682844,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 4104
+    },
+    {
+      "epoch": 0.11326598536923539,
+      "grad_norm": 0.006972792092710733,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 4105
+    },
+    {
+      "epoch": 0.11329357757029976,
+      "grad_norm": 0.0025407804641872644,
+      "learning_rate": 0.001,
+      "loss": 0.4251,
+      "step": 4106
+    },
+    {
+      "epoch": 0.11332116977136412,
+      "grad_norm": 0.006640659179538488,
+      "learning_rate": 0.001,
+      "loss": 0.3743,
+      "step": 4107
+    },
+    {
+      "epoch": 0.11334876197242849,
+      "grad_norm": 0.0022770599462091923,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 4108
+    },
+    {
+      "epoch": 0.11337635417349286,
+      "grad_norm": 0.0027237252797931433,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 4109
+    },
+    {
+      "epoch": 0.11340394637455724,
+      "grad_norm": 0.0026092720218002796,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 4110
+    },
+    {
+      "epoch": 0.1134315385756216,
+      "grad_norm": 0.0025469979736953974,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 4111
+    },
+    {
+      "epoch": 0.11345913077668597,
+      "grad_norm": 0.004103371873497963,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 4112
+    },
+    {
+      "epoch": 0.11348672297775034,
+      "grad_norm": 0.002415394876152277,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 4113
+    },
+    {
+      "epoch": 0.1135143151788147,
+      "grad_norm": 0.0024486854672431946,
+      "learning_rate": 0.001,
+      "loss": 0.3623,
+      "step": 4114
+    },
+    {
+      "epoch": 0.11354190737987908,
+      "grad_norm": 0.003065606579184532,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 4115
+    },
+    {
+      "epoch": 0.11356949958094345,
+      "grad_norm": 0.0029552297201007605,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 4116
+    },
+    {
+      "epoch": 0.11359709178200782,
+      "grad_norm": 0.0028423569165170193,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 4117
+    },
+    {
+      "epoch": 0.11362468398307218,
+      "grad_norm": 0.002471365500241518,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 4118
+    },
+    {
+      "epoch": 0.11365227618413655,
+      "grad_norm": 0.0026159638073295355,
+      "learning_rate": 0.001,
+      "loss": 0.4352,
+      "step": 4119
+    },
+    {
+      "epoch": 0.11367986838520092,
+      "grad_norm": 0.00349643686786294,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 4120
+    },
+    {
+      "epoch": 0.1137074605862653,
+      "grad_norm": 0.003111085621640086,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 4121
+    },
+    {
+      "epoch": 0.11373505278732966,
+      "grad_norm": 0.002907720860093832,
+      "learning_rate": 0.001,
+      "loss": 0.4288,
+      "step": 4122
+    },
+    {
+      "epoch": 0.11376264498839403,
+      "grad_norm": 0.004095634911209345,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 4123
+    },
+    {
+      "epoch": 0.1137902371894584,
+      "grad_norm": 0.0028071727138012648,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 4124
+    },
+    {
+      "epoch": 0.11381782939052276,
+      "grad_norm": 0.0032708507496863604,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 4125
+    },
+    {
+      "epoch": 0.11384542159158714,
+      "grad_norm": 0.002617691410705447,
+      "learning_rate": 0.001,
+      "loss": 0.4297,
+      "step": 4126
+    },
+    {
+      "epoch": 0.11387301379265151,
+      "grad_norm": 0.0025609673466533422,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 4127
+    },
+    {
+      "epoch": 0.11390060599371588,
+      "grad_norm": 0.003452820936217904,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 4128
+    },
+    {
+      "epoch": 0.11392819819478024,
+      "grad_norm": 0.0029548651073127985,
+      "learning_rate": 0.001,
+      "loss": 0.4409,
+      "step": 4129
+    },
+    {
+      "epoch": 0.11395579039584461,
+      "grad_norm": 0.002860912587493658,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 4130
+    },
+    {
+      "epoch": 0.11398338259690899,
+      "grad_norm": 0.0028807439375668764,
+      "learning_rate": 0.001,
+      "loss": 0.4236,
+      "step": 4131
+    },
+    {
+      "epoch": 0.11401097479797336,
+      "grad_norm": 0.0028092057909816504,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 4132
+    },
+    {
+      "epoch": 0.11403856699903772,
+      "grad_norm": 0.003919110633432865,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 4133
+    },
+    {
+      "epoch": 0.11406615920010209,
+      "grad_norm": 0.0034238446969538927,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 4134
+    },
+    {
+      "epoch": 0.11409375140116645,
+      "grad_norm": 0.0033501333091408014,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 4135
+    },
+    {
+      "epoch": 0.11412134360223083,
+      "grad_norm": 0.002704891376197338,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 4136
+    },
+    {
+      "epoch": 0.1141489358032952,
+      "grad_norm": 0.003555488074198365,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 4137
+    },
+    {
+      "epoch": 0.11417652800435957,
+      "grad_norm": 0.0075040231458842754,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 4138
+    },
+    {
+      "epoch": 0.11420412020542393,
+      "grad_norm": 0.008047969080507755,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 4139
+    },
+    {
+      "epoch": 0.1142317124064883,
+      "grad_norm": 0.003022789489477873,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 4140
+    },
+    {
+      "epoch": 0.11425930460755268,
+      "grad_norm": 0.0042315032333135605,
+      "learning_rate": 0.001,
+      "loss": 0.4537,
+      "step": 4141
+    },
+    {
+      "epoch": 0.11428689680861705,
+      "grad_norm": 0.0028275889344513416,
+      "learning_rate": 0.001,
+      "loss": 0.449,
+      "step": 4142
+    },
+    {
+      "epoch": 0.11431448900968141,
+      "grad_norm": 0.002496670465916395,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 4143
+    },
+    {
+      "epoch": 0.11434208121074578,
+      "grad_norm": 0.004964656662195921,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 4144
+    },
+    {
+      "epoch": 0.11436967341181015,
+      "grad_norm": 0.0024401163682341576,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 4145
+    },
+    {
+      "epoch": 0.11439726561287453,
+      "grad_norm": 0.004458567593246698,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 4146
+    },
+    {
+      "epoch": 0.1144248578139389,
+      "grad_norm": 0.002872915705665946,
+      "learning_rate": 0.001,
+      "loss": 0.4262,
+      "step": 4147
+    },
+    {
+      "epoch": 0.11445245001500326,
+      "grad_norm": 0.004791326820850372,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 4148
+    },
+    {
+      "epoch": 0.11448004221606763,
+      "grad_norm": 0.0037970049306750298,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 4149
+    },
+    {
+      "epoch": 0.11450763441713199,
+      "grad_norm": 0.0025677571538835764,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 4150
+    },
+    {
+      "epoch": 0.11453522661819637,
+      "grad_norm": 0.0031208605505526066,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 4151
+    },
+    {
+      "epoch": 0.11456281881926074,
+      "grad_norm": 0.0031444996129721403,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 4152
+    },
+    {
+      "epoch": 0.1145904110203251,
+      "grad_norm": 0.0035791201516985893,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 4153
+    },
+    {
+      "epoch": 0.11461800322138947,
+      "grad_norm": 0.0027255616150796413,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 4154
+    },
+    {
+      "epoch": 0.11464559542245384,
+      "grad_norm": 0.003225408960133791,
+      "learning_rate": 0.001,
+      "loss": 0.374,
+      "step": 4155
+    },
+    {
+      "epoch": 0.11467318762351822,
+      "grad_norm": 0.0035840212367475033,
+      "learning_rate": 0.001,
+      "loss": 0.3518,
+      "step": 4156
+    },
+    {
+      "epoch": 0.11470077982458259,
+      "grad_norm": 0.0035705710761249065,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 4157
+    },
+    {
+      "epoch": 0.11472837202564695,
+      "grad_norm": 0.0029657899867743254,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 4158
+    },
+    {
+      "epoch": 0.11475596422671132,
+      "grad_norm": 0.003734529484063387,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 4159
+    },
+    {
+      "epoch": 0.11478355642777568,
+      "grad_norm": 0.00309072551317513,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 4160
+    },
+    {
+      "epoch": 0.11481114862884007,
+      "grad_norm": 0.004012387245893478,
+      "learning_rate": 0.001,
+      "loss": 0.3629,
+      "step": 4161
+    },
+    {
+      "epoch": 0.11483874082990443,
+      "grad_norm": 0.002934156684204936,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 4162
+    },
+    {
+      "epoch": 0.1148663330309688,
+      "grad_norm": 0.004066895227879286,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 4163
+    },
+    {
+      "epoch": 0.11489392523203316,
+      "grad_norm": 0.004248685669153929,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 4164
+    },
+    {
+      "epoch": 0.11492151743309753,
+      "grad_norm": 0.0024275535251945257,
+      "learning_rate": 0.001,
+      "loss": 0.4446,
+      "step": 4165
+    },
+    {
+      "epoch": 0.1149491096341619,
+      "grad_norm": 0.0025079327169805765,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 4166
+    },
+    {
+      "epoch": 0.11497670183522628,
+      "grad_norm": 0.0024487797636538744,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 4167
+    },
+    {
+      "epoch": 0.11500429403629064,
+      "grad_norm": 0.002970887813717127,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 4168
+    },
+    {
+      "epoch": 0.11503188623735501,
+      "grad_norm": 0.0025243964046239853,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 4169
+    },
+    {
+      "epoch": 0.11505947843841938,
+      "grad_norm": 0.004984840750694275,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 4170
+    },
+    {
+      "epoch": 0.11508707063948374,
+      "grad_norm": 0.0044680144637823105,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 4171
+    },
+    {
+      "epoch": 0.11511466284054812,
+      "grad_norm": 0.0035056746564805508,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 4172
+    },
+    {
+      "epoch": 0.11514225504161249,
+      "grad_norm": 0.0023045954294502735,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 4173
+    },
+    {
+      "epoch": 0.11516984724267686,
+      "grad_norm": 0.0029870544094592333,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 4174
+    },
+    {
+      "epoch": 0.11519743944374122,
+      "grad_norm": 0.002851466415449977,
+      "learning_rate": 0.001,
+      "loss": 0.4221,
+      "step": 4175
+    },
+    {
+      "epoch": 0.11522503164480559,
+      "grad_norm": 0.0037020803429186344,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 4176
+    },
+    {
+      "epoch": 0.11525262384586997,
+      "grad_norm": 0.0025792547967284918,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 4177
+    },
+    {
+      "epoch": 0.11528021604693434,
+      "grad_norm": 0.005641425959765911,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 4178
+    },
+    {
+      "epoch": 0.1153078082479987,
+      "grad_norm": 0.006405304651707411,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 4179
+    },
+    {
+      "epoch": 0.11533540044906307,
+      "grad_norm": 0.003738192841410637,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 4180
+    },
+    {
+      "epoch": 0.11536299265012744,
+      "grad_norm": 0.002722861710935831,
+      "learning_rate": 0.001,
+      "loss": 0.3712,
+      "step": 4181
+    },
+    {
+      "epoch": 0.11539058485119182,
+      "grad_norm": 0.003077869303524494,
+      "learning_rate": 0.001,
+      "loss": 0.3665,
+      "step": 4182
+    },
+    {
+      "epoch": 0.11541817705225618,
+      "grad_norm": 0.0023466874845325947,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 4183
+    },
+    {
+      "epoch": 0.11544576925332055,
+      "grad_norm": 0.0029894413892179728,
+      "learning_rate": 0.001,
+      "loss": 0.4229,
+      "step": 4184
+    },
+    {
+      "epoch": 0.11547336145438492,
+      "grad_norm": 0.005931466352194548,
+      "learning_rate": 0.001,
+      "loss": 0.4229,
+      "step": 4185
+    },
+    {
+      "epoch": 0.11550095365544928,
+      "grad_norm": 0.002602703869342804,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 4186
+    },
+    {
+      "epoch": 0.11552854585651366,
+      "grad_norm": 0.00304593937471509,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 4187
+    },
+    {
+      "epoch": 0.11555613805757803,
+      "grad_norm": 0.002939543453976512,
+      "learning_rate": 0.001,
+      "loss": 0.3698,
+      "step": 4188
+    },
+    {
+      "epoch": 0.1155837302586424,
+      "grad_norm": 0.0026717365253716707,
+      "learning_rate": 0.001,
+      "loss": 0.4273,
+      "step": 4189
+    },
+    {
+      "epoch": 0.11561132245970676,
+      "grad_norm": 0.0039357393980026245,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 4190
+    },
+    {
+      "epoch": 0.11563891466077113,
+      "grad_norm": 0.0033047099132090807,
+      "learning_rate": 0.001,
+      "loss": 0.3634,
+      "step": 4191
+    },
+    {
+      "epoch": 0.11566650686183551,
+      "grad_norm": 0.00248891394585371,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 4192
+    },
+    {
+      "epoch": 0.11569409906289987,
+      "grad_norm": 0.003228937741369009,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 4193
+    },
+    {
+      "epoch": 0.11572169126396424,
+      "grad_norm": 0.004180791787803173,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 4194
+    },
+    {
+      "epoch": 0.11574928346502861,
+      "grad_norm": 0.0026013990864157677,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 4195
+    },
+    {
+      "epoch": 0.11577687566609297,
+      "grad_norm": 0.003035642672330141,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 4196
+    },
+    {
+      "epoch": 0.11580446786715735,
+      "grad_norm": 0.004497555084526539,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 4197
+    },
+    {
+      "epoch": 0.11583206006822172,
+      "grad_norm": 0.0024611612316221,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 4198
+    },
+    {
+      "epoch": 0.11585965226928609,
+      "grad_norm": 0.002211876679211855,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 4199
+    },
+    {
+      "epoch": 0.11588724447035045,
+      "grad_norm": 0.0038244640454649925,
+      "learning_rate": 0.001,
+      "loss": 0.367,
+      "step": 4200
+    },
+    {
+      "epoch": 0.11591483667141482,
+      "grad_norm": 0.002824941882863641,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 4201
+    },
+    {
+      "epoch": 0.1159424288724792,
+      "grad_norm": 0.00297146150842309,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 4202
+    },
+    {
+      "epoch": 0.11597002107354357,
+      "grad_norm": 0.003329311031848192,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 4203
+    },
+    {
+      "epoch": 0.11599761327460793,
+      "grad_norm": 0.0036552376113831997,
+      "learning_rate": 0.001,
+      "loss": 0.3593,
+      "step": 4204
+    },
+    {
+      "epoch": 0.1160252054756723,
+      "grad_norm": 0.003157126484438777,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 4205
+    },
+    {
+      "epoch": 0.11605279767673667,
+      "grad_norm": 0.0030856141820549965,
+      "learning_rate": 0.001,
+      "loss": 0.3612,
+      "step": 4206
+    },
+    {
+      "epoch": 0.11608038987780105,
+      "grad_norm": 0.0030666659586131573,
+      "learning_rate": 0.001,
+      "loss": 0.4336,
+      "step": 4207
+    },
+    {
+      "epoch": 0.11610798207886541,
+      "grad_norm": 0.00340241938829422,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 4208
+    },
+    {
+      "epoch": 0.11613557427992978,
+      "grad_norm": 0.005646420642733574,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 4209
+    },
+    {
+      "epoch": 0.11616316648099415,
+      "grad_norm": 0.002865233225747943,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 4210
+    },
+    {
+      "epoch": 0.11619075868205851,
+      "grad_norm": 0.006409808062016964,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 4211
+    },
+    {
+      "epoch": 0.11621835088312288,
+      "grad_norm": 0.0022089029662311077,
+      "learning_rate": 0.001,
+      "loss": 0.4485,
+      "step": 4212
+    },
+    {
+      "epoch": 0.11624594308418726,
+      "grad_norm": 0.002593854209408164,
+      "learning_rate": 0.001,
+      "loss": 0.4342,
+      "step": 4213
+    },
+    {
+      "epoch": 0.11627353528525163,
+      "grad_norm": 0.004724476020783186,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 4214
+    },
+    {
+      "epoch": 0.11630112748631599,
+      "grad_norm": 0.0066048745065927505,
+      "learning_rate": 0.001,
+      "loss": 0.3659,
+      "step": 4215
+    },
+    {
+      "epoch": 0.11632871968738036,
+      "grad_norm": 0.004728097002953291,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 4216
+    },
+    {
+      "epoch": 0.11635631188844472,
+      "grad_norm": 0.003980184905230999,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 4217
+    },
+    {
+      "epoch": 0.1163839040895091,
+      "grad_norm": 0.002691940637305379,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 4218
+    },
+    {
+      "epoch": 0.11641149629057347,
+      "grad_norm": 0.004341395106166601,
+      "learning_rate": 0.001,
+      "loss": 0.3608,
+      "step": 4219
+    },
+    {
+      "epoch": 0.11643908849163784,
+      "grad_norm": 0.0023949614260345697,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 4220
+    },
+    {
+      "epoch": 0.1164666806927022,
+      "grad_norm": 0.0021929224021732807,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 4221
+    },
+    {
+      "epoch": 0.11649427289376657,
+      "grad_norm": 0.002848616801202297,
+      "learning_rate": 0.001,
+      "loss": 0.4129,
+      "step": 4222
+    },
+    {
+      "epoch": 0.11652186509483095,
+      "grad_norm": 0.0023948801681399345,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 4223
+    },
+    {
+      "epoch": 0.11654945729589532,
+      "grad_norm": 0.0029203318990767,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 4224
+    },
+    {
+      "epoch": 0.11657704949695968,
+      "grad_norm": 0.0027072993107140064,
+      "learning_rate": 0.001,
+      "loss": 0.4326,
+      "step": 4225
+    },
+    {
+      "epoch": 0.11660464169802405,
+      "grad_norm": 0.007629503961652517,
+      "learning_rate": 0.001,
+      "loss": 0.4366,
+      "step": 4226
+    },
+    {
+      "epoch": 0.11663223389908842,
+      "grad_norm": 0.004317782819271088,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 4227
+    },
+    {
+      "epoch": 0.1166598261001528,
+      "grad_norm": 0.004796279594302177,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 4228
+    },
+    {
+      "epoch": 0.11668741830121716,
+      "grad_norm": 0.0048171901144087315,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 4229
+    },
+    {
+      "epoch": 0.11671501050228153,
+      "grad_norm": 0.005509024020284414,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 4230
+    },
+    {
+      "epoch": 0.1167426027033459,
+      "grad_norm": 0.002434986876323819,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 4231
+    },
+    {
+      "epoch": 0.11677019490441026,
+      "grad_norm": 0.0024681081995368004,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 4232
+    },
+    {
+      "epoch": 0.11679778710547464,
+      "grad_norm": 0.0029520918615162373,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 4233
+    },
+    {
+      "epoch": 0.11682537930653901,
+      "grad_norm": 0.0039764223620295525,
+      "learning_rate": 0.001,
+      "loss": 0.4279,
+      "step": 4234
+    },
+    {
+      "epoch": 0.11685297150760338,
+      "grad_norm": 0.0033561012241989374,
+      "learning_rate": 0.001,
+      "loss": 0.3801,
+      "step": 4235
+    },
+    {
+      "epoch": 0.11688056370866774,
+      "grad_norm": 0.002082569058984518,
+      "learning_rate": 0.001,
+      "loss": 0.4333,
+      "step": 4236
+    },
+    {
+      "epoch": 0.11690815590973211,
+      "grad_norm": 0.002162642776966095,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 4237
+    },
+    {
+      "epoch": 0.11693574811079649,
+      "grad_norm": 0.004019891377538443,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 4238
+    },
+    {
+      "epoch": 0.11696334031186086,
+      "grad_norm": 0.0039974479004740715,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 4239
+    },
+    {
+      "epoch": 0.11699093251292522,
+      "grad_norm": 0.0032143592834472656,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 4240
+    },
+    {
+      "epoch": 0.11701852471398959,
+      "grad_norm": 0.0021860511042177677,
+      "learning_rate": 0.001,
+      "loss": 0.4301,
+      "step": 4241
+    },
+    {
+      "epoch": 0.11704611691505395,
+      "grad_norm": 0.0034913863055408,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 4242
+    },
+    {
+      "epoch": 0.11707370911611834,
+      "grad_norm": 0.002544558374211192,
+      "learning_rate": 0.001,
+      "loss": 0.4308,
+      "step": 4243
+    },
+    {
+      "epoch": 0.1171013013171827,
+      "grad_norm": 0.004531691782176495,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 4244
+    },
+    {
+      "epoch": 0.11712889351824707,
+      "grad_norm": 0.002380553400143981,
+      "learning_rate": 0.001,
+      "loss": 0.3713,
+      "step": 4245
+    },
+    {
+      "epoch": 0.11715648571931143,
+      "grad_norm": 0.003138060914352536,
+      "learning_rate": 0.001,
+      "loss": 0.4387,
+      "step": 4246
+    },
+    {
+      "epoch": 0.1171840779203758,
+      "grad_norm": 0.0038936040364205837,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 4247
+    },
+    {
+      "epoch": 0.11721167012144018,
+      "grad_norm": 0.0034198390785604715,
+      "learning_rate": 0.001,
+      "loss": 0.4041,
+      "step": 4248
+    },
+    {
+      "epoch": 0.11723926232250455,
+      "grad_norm": 0.004554110579192638,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 4249
+    },
+    {
+      "epoch": 0.11726685452356891,
+      "grad_norm": 0.002692369045689702,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 4250
+    },
+    {
+      "epoch": 0.11729444672463328,
+      "grad_norm": 0.0031687277369201183,
+      "learning_rate": 0.001,
+      "loss": 0.3685,
+      "step": 4251
+    },
+    {
+      "epoch": 0.11732203892569765,
+      "grad_norm": 0.0030465414747595787,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 4252
+    },
+    {
+      "epoch": 0.11734963112676203,
+      "grad_norm": 0.0028786477632820606,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 4253
+    },
+    {
+      "epoch": 0.1173772233278264,
+      "grad_norm": 0.0026728706434369087,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 4254
+    },
+    {
+      "epoch": 0.11740481552889076,
+      "grad_norm": 0.0030022338032722473,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 4255
+    },
+    {
+      "epoch": 0.11743240772995513,
+      "grad_norm": 0.004039818421006203,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 4256
+    },
+    {
+      "epoch": 0.1174599999310195,
+      "grad_norm": 0.0031344417948275805,
+      "learning_rate": 0.001,
+      "loss": 0.4295,
+      "step": 4257
+    },
+    {
+      "epoch": 0.11748759213208387,
+      "grad_norm": 0.0020141347777098417,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 4258
+    },
+    {
+      "epoch": 0.11751518433314824,
+      "grad_norm": 0.003466543275862932,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 4259
+    },
+    {
+      "epoch": 0.1175427765342126,
+      "grad_norm": 0.006833492312580347,
+      "learning_rate": 0.001,
+      "loss": 0.4395,
+      "step": 4260
+    },
+    {
+      "epoch": 0.11757036873527697,
+      "grad_norm": 0.002628098940476775,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 4261
+    },
+    {
+      "epoch": 0.11759796093634134,
+      "grad_norm": 0.0029284025076776743,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 4262
+    },
+    {
+      "epoch": 0.1176255531374057,
+      "grad_norm": 0.002273330232128501,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 4263
+    },
+    {
+      "epoch": 0.11765314533847009,
+      "grad_norm": 0.004239256959408522,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 4264
+    },
+    {
+      "epoch": 0.11768073753953445,
+      "grad_norm": 0.003706045914441347,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 4265
+    },
+    {
+      "epoch": 0.11770832974059882,
+      "grad_norm": 0.004043275490403175,
+      "learning_rate": 0.001,
+      "loss": 0.3808,
+      "step": 4266
+    },
+    {
+      "epoch": 0.11773592194166319,
+      "grad_norm": 0.00288589159026742,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 4267
+    },
+    {
+      "epoch": 0.11776351414272755,
+      "grad_norm": 0.002828913275152445,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 4268
+    },
+    {
+      "epoch": 0.11779110634379193,
+      "grad_norm": 0.0026987362653017044,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 4269
+    },
+    {
+      "epoch": 0.1178186985448563,
+      "grad_norm": 0.004176660440862179,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 4270
+    },
+    {
+      "epoch": 0.11784629074592066,
+      "grad_norm": 0.0025856473948806524,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 4271
+    },
+    {
+      "epoch": 0.11787388294698503,
+      "grad_norm": 0.002688355278223753,
+      "learning_rate": 0.001,
+      "loss": 0.4427,
+      "step": 4272
+    },
+    {
+      "epoch": 0.1179014751480494,
+      "grad_norm": 0.0035168598406016827,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 4273
+    },
+    {
+      "epoch": 0.11792906734911378,
+      "grad_norm": 0.00464267935603857,
+      "learning_rate": 0.001,
+      "loss": 0.3594,
+      "step": 4274
+    },
+    {
+      "epoch": 0.11795665955017814,
+      "grad_norm": 0.002608151640743017,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 4275
+    },
+    {
+      "epoch": 0.11798425175124251,
+      "grad_norm": 0.003174230456352234,
+      "learning_rate": 0.001,
+      "loss": 0.3583,
+      "step": 4276
+    },
+    {
+      "epoch": 0.11801184395230688,
+      "grad_norm": 0.005926152691245079,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 4277
+    },
+    {
+      "epoch": 0.11803943615337124,
+      "grad_norm": 0.0021453346125781536,
+      "learning_rate": 0.001,
+      "loss": 0.3648,
+      "step": 4278
+    },
+    {
+      "epoch": 0.11806702835443562,
+      "grad_norm": 0.003014147747308016,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 4279
+    },
+    {
+      "epoch": 0.11809462055549999,
+      "grad_norm": 0.007064585108309984,
+      "learning_rate": 0.001,
+      "loss": 0.3441,
+      "step": 4280
+    },
+    {
+      "epoch": 0.11812221275656436,
+      "grad_norm": 0.004232319537550211,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 4281
+    },
+    {
+      "epoch": 0.11814980495762872,
+      "grad_norm": 0.0022875110153108835,
+      "learning_rate": 0.001,
+      "loss": 0.3712,
+      "step": 4282
+    },
+    {
+      "epoch": 0.11817739715869309,
+      "grad_norm": 0.0034797275438904762,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 4283
+    },
+    {
+      "epoch": 0.11820498935975747,
+      "grad_norm": 0.0038444402161985636,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 4284
+    },
+    {
+      "epoch": 0.11823258156082184,
+      "grad_norm": 0.004799429327249527,
+      "learning_rate": 0.001,
+      "loss": 0.4482,
+      "step": 4285
+    },
+    {
+      "epoch": 0.1182601737618862,
+      "grad_norm": 0.0031683624256402254,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 4286
+    },
+    {
+      "epoch": 0.11828776596295057,
+      "grad_norm": 0.002779352478682995,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 4287
+    },
+    {
+      "epoch": 0.11831535816401494,
+      "grad_norm": 0.00301496684551239,
+      "learning_rate": 0.001,
+      "loss": 0.452,
+      "step": 4288
+    },
+    {
+      "epoch": 0.11834295036507932,
+      "grad_norm": 0.0021808287128806114,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 4289
+    },
+    {
+      "epoch": 0.11837054256614368,
+      "grad_norm": 0.002374732168391347,
+      "learning_rate": 0.001,
+      "loss": 0.4414,
+      "step": 4290
+    },
+    {
+      "epoch": 0.11839813476720805,
+      "grad_norm": 0.0025609827134758234,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 4291
+    },
+    {
+      "epoch": 0.11842572696827242,
+      "grad_norm": 0.0029708382207900286,
+      "learning_rate": 0.001,
+      "loss": 0.3762,
+      "step": 4292
+    },
+    {
+      "epoch": 0.11845331916933678,
+      "grad_norm": 0.002453922526910901,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 4293
+    },
+    {
+      "epoch": 0.11848091137040116,
+      "grad_norm": 0.003155443584546447,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 4294
+    },
+    {
+      "epoch": 0.11850850357146553,
+      "grad_norm": 0.0028496377635747194,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 4295
+    },
+    {
+      "epoch": 0.1185360957725299,
+      "grad_norm": 0.0024521294981241226,
+      "learning_rate": 0.001,
+      "loss": 0.4255,
+      "step": 4296
+    },
+    {
+      "epoch": 0.11856368797359426,
+      "grad_norm": 0.002795828739181161,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 4297
+    },
+    {
+      "epoch": 0.11859128017465863,
+      "grad_norm": 0.0030860670376569033,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 4298
+    },
+    {
+      "epoch": 0.11861887237572301,
+      "grad_norm": 0.005245378706604242,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 4299
+    },
+    {
+      "epoch": 0.11864646457678737,
+      "grad_norm": 0.003146842820569873,
+      "learning_rate": 0.001,
+      "loss": 0.3404,
+      "step": 4300
+    },
+    {
+      "epoch": 0.11867405677785174,
+      "grad_norm": 0.0026854875031858683,
+      "learning_rate": 0.001,
+      "loss": 0.3727,
+      "step": 4301
+    },
+    {
+      "epoch": 0.11870164897891611,
+      "grad_norm": 0.0026074268389493227,
+      "learning_rate": 0.001,
+      "loss": 0.3586,
+      "step": 4302
+    },
+    {
+      "epoch": 0.11872924117998047,
+      "grad_norm": 0.0035085766576230526,
+      "learning_rate": 0.001,
+      "loss": 0.4453,
+      "step": 4303
+    },
+    {
+      "epoch": 0.11875683338104485,
+      "grad_norm": 0.0026303378399461508,
+      "learning_rate": 0.001,
+      "loss": 0.3808,
+      "step": 4304
+    },
+    {
+      "epoch": 0.11878442558210922,
+      "grad_norm": 0.002744373632594943,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 4305
+    },
+    {
+      "epoch": 0.11881201778317359,
+      "grad_norm": 0.002459439681842923,
+      "learning_rate": 0.001,
+      "loss": 0.424,
+      "step": 4306
+    },
+    {
+      "epoch": 0.11883960998423795,
+      "grad_norm": 0.004186084493994713,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 4307
+    },
+    {
+      "epoch": 0.11886720218530232,
+      "grad_norm": 0.002484044060111046,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 4308
+    },
+    {
+      "epoch": 0.11889479438636669,
+      "grad_norm": 0.002908499911427498,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 4309
+    },
+    {
+      "epoch": 0.11892238658743107,
+      "grad_norm": 0.0024023684673011303,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 4310
+    },
+    {
+      "epoch": 0.11894997878849543,
+      "grad_norm": 0.0033678016625344753,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 4311
+    },
+    {
+      "epoch": 0.1189775709895598,
+      "grad_norm": 0.0029683737084269524,
+      "learning_rate": 0.001,
+      "loss": 0.3552,
+      "step": 4312
+    },
+    {
+      "epoch": 0.11900516319062417,
+      "grad_norm": 0.0027305546682327986,
+      "learning_rate": 0.001,
+      "loss": 0.422,
+      "step": 4313
+    },
+    {
+      "epoch": 0.11903275539168853,
+      "grad_norm": 0.002915582386776805,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 4314
+    },
+    {
+      "epoch": 0.11906034759275291,
+      "grad_norm": 0.002866593888029456,
+      "learning_rate": 0.001,
+      "loss": 0.3765,
+      "step": 4315
+    },
+    {
+      "epoch": 0.11908793979381728,
+      "grad_norm": 0.0024439122062176466,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 4316
+    },
+    {
+      "epoch": 0.11911553199488165,
+      "grad_norm": 0.003114825114607811,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 4317
+    },
+    {
+      "epoch": 0.11914312419594601,
+      "grad_norm": 0.005244606640189886,
+      "learning_rate": 0.001,
+      "loss": 0.3832,
+      "step": 4318
+    },
+    {
+      "epoch": 0.11917071639701038,
+      "grad_norm": 0.0027887518517673016,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 4319
+    },
+    {
+      "epoch": 0.11919830859807476,
+      "grad_norm": 0.0033839023672044277,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 4320
+    },
+    {
+      "epoch": 0.11922590079913913,
+      "grad_norm": 0.00488735968247056,
+      "learning_rate": 0.001,
+      "loss": 0.4666,
+      "step": 4321
+    },
+    {
+      "epoch": 0.11925349300020349,
+      "grad_norm": 0.00358974770642817,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 4322
+    },
+    {
+      "epoch": 0.11928108520126786,
+      "grad_norm": 0.004664520733058453,
+      "learning_rate": 0.001,
+      "loss": 0.4276,
+      "step": 4323
+    },
+    {
+      "epoch": 0.11930867740233222,
+      "grad_norm": 0.0029046619310975075,
+      "learning_rate": 0.001,
+      "loss": 0.4279,
+      "step": 4324
+    },
+    {
+      "epoch": 0.1193362696033966,
+      "grad_norm": 0.005748322233557701,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 4325
+    },
+    {
+      "epoch": 0.11936386180446097,
+      "grad_norm": 0.004792370367795229,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 4326
+    },
+    {
+      "epoch": 0.11939145400552534,
+      "grad_norm": 0.002347557572647929,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 4327
+    },
+    {
+      "epoch": 0.1194190462065897,
+      "grad_norm": 0.009171836078166962,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 4328
+    },
+    {
+      "epoch": 0.11944663840765407,
+      "grad_norm": 0.0038018315099179745,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 4329
+    },
+    {
+      "epoch": 0.11947423060871845,
+      "grad_norm": 0.010487204417586327,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 4330
+    },
+    {
+      "epoch": 0.11950182280978282,
+      "grad_norm": 0.0031547516118735075,
+      "learning_rate": 0.001,
+      "loss": 0.4237,
+      "step": 4331
+    },
+    {
+      "epoch": 0.11952941501084718,
+      "grad_norm": 0.008287927135825157,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 4332
+    },
+    {
+      "epoch": 0.11955700721191155,
+      "grad_norm": 0.0035811339039355516,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 4333
+    },
+    {
+      "epoch": 0.11958459941297592,
+      "grad_norm": 0.0023166469763964415,
+      "learning_rate": 0.001,
+      "loss": 0.4415,
+      "step": 4334
+    },
+    {
+      "epoch": 0.1196121916140403,
+      "grad_norm": 0.005073962267488241,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 4335
+    },
+    {
+      "epoch": 0.11963978381510466,
+      "grad_norm": 0.003093453822657466,
+      "learning_rate": 0.001,
+      "loss": 0.3567,
+      "step": 4336
+    },
+    {
+      "epoch": 0.11966737601616903,
+      "grad_norm": 0.002334992168471217,
+      "learning_rate": 0.001,
+      "loss": 0.4305,
+      "step": 4337
+    },
+    {
+      "epoch": 0.1196949682172334,
+      "grad_norm": 0.0023138297256082296,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 4338
+    },
+    {
+      "epoch": 0.11972256041829776,
+      "grad_norm": 0.002155238064005971,
+      "learning_rate": 0.001,
+      "loss": 0.4446,
+      "step": 4339
+    },
+    {
+      "epoch": 0.11975015261936214,
+      "grad_norm": 0.004164060112088919,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 4340
+    },
+    {
+      "epoch": 0.11977774482042651,
+      "grad_norm": 0.0026546409353613853,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 4341
+    },
+    {
+      "epoch": 0.11980533702149088,
+      "grad_norm": 0.004223043564707041,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 4342
+    },
+    {
+      "epoch": 0.11983292922255524,
+      "grad_norm": 0.002268544165417552,
+      "learning_rate": 0.001,
+      "loss": 0.4368,
+      "step": 4343
+    },
+    {
+      "epoch": 0.11986052142361961,
+      "grad_norm": 0.0023410958237946033,
+      "learning_rate": 0.001,
+      "loss": 0.439,
+      "step": 4344
+    },
+    {
+      "epoch": 0.11988811362468399,
+      "grad_norm": 0.003867893014103174,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 4345
+    },
+    {
+      "epoch": 0.11991570582574836,
+      "grad_norm": 0.0023783824872225523,
+      "learning_rate": 0.001,
+      "loss": 0.4041,
+      "step": 4346
+    },
+    {
+      "epoch": 0.11994329802681272,
+      "grad_norm": 0.002548053627833724,
+      "learning_rate": 0.001,
+      "loss": 0.4261,
+      "step": 4347
+    },
+    {
+      "epoch": 0.11997089022787709,
+      "grad_norm": 0.0028831157833337784,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 4348
+    },
+    {
+      "epoch": 0.11999848242894146,
+      "grad_norm": 0.0029863922391086817,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 4349
+    },
+    {
+      "epoch": 0.12002607463000584,
+      "grad_norm": 0.002305314177647233,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 4350
+    },
+    {
+      "epoch": 0.1200536668310702,
+      "grad_norm": 0.006781649775803089,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 4351
+    },
+    {
+      "epoch": 0.12008125903213457,
+      "grad_norm": 0.0042722891084849834,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 4352
+    },
+    {
+      "epoch": 0.12010885123319893,
+      "grad_norm": 0.0033779507502913475,
+      "learning_rate": 0.001,
+      "loss": 0.3611,
+      "step": 4353
+    },
+    {
+      "epoch": 0.1201364434342633,
+      "grad_norm": 0.002541282679885626,
+      "learning_rate": 0.001,
+      "loss": 0.4365,
+      "step": 4354
+    },
+    {
+      "epoch": 0.12016403563532767,
+      "grad_norm": 0.002650998765602708,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 4355
+    },
+    {
+      "epoch": 0.12019162783639205,
+      "grad_norm": 0.002635399578139186,
+      "learning_rate": 0.001,
+      "loss": 0.4553,
+      "step": 4356
+    },
+    {
+      "epoch": 0.12021922003745641,
+      "grad_norm": 0.0034205662086606026,
+      "learning_rate": 0.001,
+      "loss": 0.3805,
+      "step": 4357
+    },
+    {
+      "epoch": 0.12024681223852078,
+      "grad_norm": 0.0041115800850093365,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 4358
+    },
+    {
+      "epoch": 0.12027440443958515,
+      "grad_norm": 0.003525017062202096,
+      "learning_rate": 0.001,
+      "loss": 0.4103,
+      "step": 4359
+    },
+    {
+      "epoch": 0.12030199664064951,
+      "grad_norm": 0.003658512607216835,
+      "learning_rate": 0.001,
+      "loss": 0.4322,
+      "step": 4360
+    },
+    {
+      "epoch": 0.1203295888417139,
+      "grad_norm": 0.00296131893992424,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 4361
+    },
+    {
+      "epoch": 0.12035718104277826,
+      "grad_norm": 0.0035319009330123663,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 4362
+    },
+    {
+      "epoch": 0.12038477324384263,
+      "grad_norm": 0.0023319576866924763,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 4363
+    },
+    {
+      "epoch": 0.120412365444907,
+      "grad_norm": 0.002195805311203003,
+      "learning_rate": 0.001,
+      "loss": 0.4338,
+      "step": 4364
+    },
+    {
+      "epoch": 0.12043995764597136,
+      "grad_norm": 0.002451231935992837,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 4365
+    },
+    {
+      "epoch": 0.12046754984703574,
+      "grad_norm": 0.00870354101061821,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 4366
+    },
+    {
+      "epoch": 0.1204951420481001,
+      "grad_norm": 0.002587871393188834,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 4367
+    },
+    {
+      "epoch": 0.12052273424916447,
+      "grad_norm": 0.004466624464839697,
+      "learning_rate": 0.001,
+      "loss": 0.3526,
+      "step": 4368
+    },
+    {
+      "epoch": 0.12055032645022884,
+      "grad_norm": 0.004007890820503235,
+      "learning_rate": 0.001,
+      "loss": 0.3694,
+      "step": 4369
+    },
+    {
+      "epoch": 0.1205779186512932,
+      "grad_norm": 0.015531973913311958,
+      "learning_rate": 0.001,
+      "loss": 0.4283,
+      "step": 4370
+    },
+    {
+      "epoch": 0.12060551085235759,
+      "grad_norm": 0.008623013272881508,
+      "learning_rate": 0.001,
+      "loss": 0.4336,
+      "step": 4371
+    },
+    {
+      "epoch": 0.12063310305342195,
+      "grad_norm": 0.006909455172717571,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 4372
+    },
+    {
+      "epoch": 0.12066069525448632,
+      "grad_norm": 0.009513488039374352,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 4373
+    },
+    {
+      "epoch": 0.12068828745555069,
+      "grad_norm": 0.003246552310883999,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 4374
+    },
+    {
+      "epoch": 0.12071587965661505,
+      "grad_norm": 0.00548326363787055,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 4375
+    },
+    {
+      "epoch": 0.12074347185767943,
+      "grad_norm": 0.004080353304743767,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 4376
+    },
+    {
+      "epoch": 0.1207710640587438,
+      "grad_norm": 0.004176739137619734,
+      "learning_rate": 0.001,
+      "loss": 0.3706,
+      "step": 4377
+    },
+    {
+      "epoch": 0.12079865625980817,
+      "grad_norm": 0.002543453825637698,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 4378
+    },
+    {
+      "epoch": 0.12082624846087253,
+      "grad_norm": 0.0027144979685544968,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 4379
+    },
+    {
+      "epoch": 0.1208538406619369,
+      "grad_norm": 0.0035318818408995867,
+      "learning_rate": 0.001,
+      "loss": 0.3599,
+      "step": 4380
+    },
+    {
+      "epoch": 0.12088143286300128,
+      "grad_norm": 0.005954810418188572,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 4381
+    },
+    {
+      "epoch": 0.12090902506406564,
+      "grad_norm": 0.00457935081794858,
+      "learning_rate": 0.001,
+      "loss": 0.4323,
+      "step": 4382
+    },
+    {
+      "epoch": 0.12093661726513001,
+      "grad_norm": 0.004335826262831688,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 4383
+    },
+    {
+      "epoch": 0.12096420946619438,
+      "grad_norm": 0.0032937126234173775,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 4384
+    },
+    {
+      "epoch": 0.12099180166725874,
+      "grad_norm": 0.0024808261077851057,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 4385
+    },
+    {
+      "epoch": 0.12101939386832312,
+      "grad_norm": 0.002847994677722454,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 4386
+    },
+    {
+      "epoch": 0.12104698606938749,
+      "grad_norm": 0.002378343604505062,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 4387
+    },
+    {
+      "epoch": 0.12107457827045186,
+      "grad_norm": 0.002491983585059643,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 4388
+    },
+    {
+      "epoch": 0.12110217047151622,
+      "grad_norm": 0.002309272298589349,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 4389
+    },
+    {
+      "epoch": 0.12112976267258059,
+      "grad_norm": 0.0028091350104659796,
+      "learning_rate": 0.001,
+      "loss": 0.3731,
+      "step": 4390
+    },
+    {
+      "epoch": 0.12115735487364497,
+      "grad_norm": 0.00367850624024868,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 4391
+    },
+    {
+      "epoch": 0.12118494707470934,
+      "grad_norm": 0.003562155645340681,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 4392
+    },
+    {
+      "epoch": 0.1212125392757737,
+      "grad_norm": 0.0025711546186357737,
+      "learning_rate": 0.001,
+      "loss": 0.3541,
+      "step": 4393
+    },
+    {
+      "epoch": 0.12124013147683807,
+      "grad_norm": 0.003351216670125723,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 4394
+    },
+    {
+      "epoch": 0.12126772367790244,
+      "grad_norm": 0.002373376628383994,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 4395
+    },
+    {
+      "epoch": 0.12129531587896682,
+      "grad_norm": 0.0035600061528384686,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 4396
+    },
+    {
+      "epoch": 0.12132290808003118,
+      "grad_norm": 0.0033355422783643007,
+      "learning_rate": 0.001,
+      "loss": 0.3438,
+      "step": 4397
+    },
+    {
+      "epoch": 0.12135050028109555,
+      "grad_norm": 0.004041003528982401,
+      "learning_rate": 0.001,
+      "loss": 0.3776,
+      "step": 4398
+    },
+    {
+      "epoch": 0.12137809248215992,
+      "grad_norm": 0.002868924057111144,
+      "learning_rate": 0.001,
+      "loss": 0.3742,
+      "step": 4399
+    },
+    {
+      "epoch": 0.12140568468322428,
+      "grad_norm": 0.003549615852534771,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 4400
+    },
+    {
+      "epoch": 0.12143327688428865,
+      "grad_norm": 0.0034831890370696783,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 4401
+    },
+    {
+      "epoch": 0.12146086908535303,
+      "grad_norm": 0.0029312497936189175,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 4402
+    },
+    {
+      "epoch": 0.1214884612864174,
+      "grad_norm": 0.0037916668225079775,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 4403
+    },
+    {
+      "epoch": 0.12151605348748176,
+      "grad_norm": 0.0031889586243778467,
+      "learning_rate": 0.001,
+      "loss": 0.4114,
+      "step": 4404
+    },
+    {
+      "epoch": 0.12154364568854613,
+      "grad_norm": 0.003701084526255727,
+      "learning_rate": 0.001,
+      "loss": 0.363,
+      "step": 4405
+    },
+    {
+      "epoch": 0.1215712378896105,
+      "grad_norm": 0.00297721428796649,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 4406
+    },
+    {
+      "epoch": 0.12159883009067488,
+      "grad_norm": 0.0032696991693228483,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 4407
+    },
+    {
+      "epoch": 0.12162642229173924,
+      "grad_norm": 0.0023237040732055902,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 4408
+    },
+    {
+      "epoch": 0.12165401449280361,
+      "grad_norm": 0.0023669737856835127,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 4409
+    },
+    {
+      "epoch": 0.12168160669386797,
+      "grad_norm": 0.0029125306755304337,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 4410
+    },
+    {
+      "epoch": 0.12170919889493234,
+      "grad_norm": 0.003643943928182125,
+      "learning_rate": 0.001,
+      "loss": 0.4312,
+      "step": 4411
+    },
+    {
+      "epoch": 0.12173679109599672,
+      "grad_norm": 0.002598782768473029,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 4412
+    },
+    {
+      "epoch": 0.12176438329706109,
+      "grad_norm": 0.00362451933324337,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 4413
+    },
+    {
+      "epoch": 0.12179197549812545,
+      "grad_norm": 0.0032412779983133078,
+      "learning_rate": 0.001,
+      "loss": 0.371,
+      "step": 4414
+    },
+    {
+      "epoch": 0.12181956769918982,
+      "grad_norm": 0.0031498463358730078,
+      "learning_rate": 0.001,
+      "loss": 0.3834,
+      "step": 4415
+    },
+    {
+      "epoch": 0.12184715990025419,
+      "grad_norm": 0.0026793426368385553,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 4416
+    },
+    {
+      "epoch": 0.12187475210131857,
+      "grad_norm": 0.002742303302511573,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 4417
+    },
+    {
+      "epoch": 0.12190234430238293,
+      "grad_norm": 0.013603371568024158,
+      "learning_rate": 0.001,
+      "loss": 0.3664,
+      "step": 4418
+    },
+    {
+      "epoch": 0.1219299365034473,
+      "grad_norm": 0.002509272890165448,
+      "learning_rate": 0.001,
+      "loss": 0.3634,
+      "step": 4419
+    },
+    {
+      "epoch": 0.12195752870451167,
+      "grad_norm": 0.0029235216788947582,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 4420
+    },
+    {
+      "epoch": 0.12198512090557603,
+      "grad_norm": 0.003002134384587407,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 4421
+    },
+    {
+      "epoch": 0.12201271310664041,
+      "grad_norm": 0.0032670635264366865,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 4422
+    },
+    {
+      "epoch": 0.12204030530770478,
+      "grad_norm": 0.0023868351709097624,
+      "learning_rate": 0.001,
+      "loss": 0.3716,
+      "step": 4423
+    },
+    {
+      "epoch": 0.12206789750876915,
+      "grad_norm": 0.004251338541507721,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 4424
+    },
+    {
+      "epoch": 0.12209548970983351,
+      "grad_norm": 0.011102253571152687,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 4425
+    },
+    {
+      "epoch": 0.12212308191089788,
+      "grad_norm": 0.0027349465526640415,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 4426
+    },
+    {
+      "epoch": 0.12215067411196226,
+      "grad_norm": 0.0032276634592562914,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 4427
+    },
+    {
+      "epoch": 0.12217826631302663,
+      "grad_norm": 0.0035047761630266905,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 4428
+    },
+    {
+      "epoch": 0.12220585851409099,
+      "grad_norm": 0.0022022626362740993,
+      "learning_rate": 0.001,
+      "loss": 0.4513,
+      "step": 4429
+    },
+    {
+      "epoch": 0.12223345071515536,
+      "grad_norm": 0.0028917898889631033,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 4430
+    },
+    {
+      "epoch": 0.12226104291621973,
+      "grad_norm": 0.0040835002437233925,
+      "learning_rate": 0.001,
+      "loss": 0.3589,
+      "step": 4431
+    },
+    {
+      "epoch": 0.1222886351172841,
+      "grad_norm": 0.0042761219665408134,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 4432
+    },
+    {
+      "epoch": 0.12231622731834847,
+      "grad_norm": 0.0031644178088754416,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 4433
+    },
+    {
+      "epoch": 0.12234381951941284,
+      "grad_norm": 0.005158254411071539,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 4434
+    },
+    {
+      "epoch": 0.1223714117204772,
+      "grad_norm": 0.00329927378334105,
+      "learning_rate": 0.001,
+      "loss": 0.3385,
+      "step": 4435
+    },
+    {
+      "epoch": 0.12239900392154157,
+      "grad_norm": 0.0028820266015827656,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 4436
+    },
+    {
+      "epoch": 0.12242659612260595,
+      "grad_norm": 0.0035580925177782774,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 4437
+    },
+    {
+      "epoch": 0.12245418832367032,
+      "grad_norm": 0.0028019326273351908,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 4438
+    },
+    {
+      "epoch": 0.12248178052473468,
+      "grad_norm": 0.002797899767756462,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 4439
+    },
+    {
+      "epoch": 0.12250937272579905,
+      "grad_norm": 0.002947175409644842,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 4440
+    },
+    {
+      "epoch": 0.12253696492686342,
+      "grad_norm": 0.003999659325927496,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 4441
+    },
+    {
+      "epoch": 0.1225645571279278,
+      "grad_norm": 0.0022317490074783564,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 4442
+    },
+    {
+      "epoch": 0.12259214932899216,
+      "grad_norm": 0.01325872354209423,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 4443
+    },
+    {
+      "epoch": 0.12261974153005653,
+      "grad_norm": 0.0043718283995985985,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 4444
+    },
+    {
+      "epoch": 0.1226473337311209,
+      "grad_norm": 0.003364040283486247,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 4445
+    },
+    {
+      "epoch": 0.12267492593218526,
+      "grad_norm": 0.0032444903627038,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 4446
+    },
+    {
+      "epoch": 0.12270251813324963,
+      "grad_norm": 0.004509296268224716,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 4447
+    },
+    {
+      "epoch": 0.12273011033431401,
+      "grad_norm": 0.004130146466195583,
+      "learning_rate": 0.001,
+      "loss": 0.3735,
+      "step": 4448
+    },
+    {
+      "epoch": 0.12275770253537838,
+      "grad_norm": 0.003644294338300824,
+      "learning_rate": 0.001,
+      "loss": 0.4376,
+      "step": 4449
+    },
+    {
+      "epoch": 0.12278529473644274,
+      "grad_norm": 0.003993290476500988,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 4450
+    },
+    {
+      "epoch": 0.12281288693750711,
+      "grad_norm": 0.0026422853115946054,
+      "learning_rate": 0.001,
+      "loss": 0.4278,
+      "step": 4451
+    },
+    {
+      "epoch": 0.12284047913857148,
+      "grad_norm": 0.0023565879091620445,
+      "learning_rate": 0.001,
+      "loss": 0.4267,
+      "step": 4452
+    },
+    {
+      "epoch": 0.12286807133963586,
+      "grad_norm": 0.002863645553588867,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 4453
+    },
+    {
+      "epoch": 0.12289566354070022,
+      "grad_norm": 0.0022721088025718927,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 4454
+    },
+    {
+      "epoch": 0.12292325574176459,
+      "grad_norm": 0.0033876546658575535,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 4455
+    },
+    {
+      "epoch": 0.12295084794282896,
+      "grad_norm": 0.0030466553289443254,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 4456
+    },
+    {
+      "epoch": 0.12297844014389332,
+      "grad_norm": 0.005601429846137762,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 4457
+    },
+    {
+      "epoch": 0.1230060323449577,
+      "grad_norm": 0.0028402013704180717,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 4458
+    },
+    {
+      "epoch": 0.12303362454602207,
+      "grad_norm": 0.004646173678338528,
+      "learning_rate": 0.001,
+      "loss": 0.3447,
+      "step": 4459
+    },
+    {
+      "epoch": 0.12306121674708644,
+      "grad_norm": 0.002111830050125718,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 4460
+    },
+    {
+      "epoch": 0.1230888089481508,
+      "grad_norm": 0.0025454754941165447,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 4461
+    },
+    {
+      "epoch": 0.12311640114921517,
+      "grad_norm": 0.002491764025762677,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 4462
+    },
+    {
+      "epoch": 0.12314399335027955,
+      "grad_norm": 0.005627894774079323,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 4463
+    },
+    {
+      "epoch": 0.12317158555134392,
+      "grad_norm": 0.004091967828571796,
+      "learning_rate": 0.001,
+      "loss": 0.4273,
+      "step": 4464
+    },
+    {
+      "epoch": 0.12319917775240828,
+      "grad_norm": 0.0029696535784751177,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 4465
+    },
+    {
+      "epoch": 0.12322676995347265,
+      "grad_norm": 0.004290423821657896,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 4466
+    },
+    {
+      "epoch": 0.12325436215453701,
+      "grad_norm": 0.002825783099979162,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 4467
+    },
+    {
+      "epoch": 0.1232819543556014,
+      "grad_norm": 0.002694531576707959,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 4468
+    },
+    {
+      "epoch": 0.12330954655666576,
+      "grad_norm": 0.002853821264579892,
+      "learning_rate": 0.001,
+      "loss": 0.3638,
+      "step": 4469
+    },
+    {
+      "epoch": 0.12333713875773013,
+      "grad_norm": 0.0025199069641530514,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 4470
+    },
+    {
+      "epoch": 0.1233647309587945,
+      "grad_norm": 0.003955396823585033,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 4471
+    },
+    {
+      "epoch": 0.12339232315985886,
+      "grad_norm": 0.0027513199020177126,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 4472
+    },
+    {
+      "epoch": 0.12341991536092324,
+      "grad_norm": 0.004522261209785938,
+      "learning_rate": 0.001,
+      "loss": 0.4623,
+      "step": 4473
+    },
+    {
+      "epoch": 0.12344750756198761,
+      "grad_norm": 0.003415697254240513,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 4474
+    },
+    {
+      "epoch": 0.12347509976305197,
+      "grad_norm": 0.0028595994226634502,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 4475
+    },
+    {
+      "epoch": 0.12350269196411634,
+      "grad_norm": 0.0041776299476623535,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 4476
+    },
+    {
+      "epoch": 0.1235302841651807,
+      "grad_norm": 0.002562503796070814,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 4477
+    },
+    {
+      "epoch": 0.12355787636624509,
+      "grad_norm": 0.0029480233788490295,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 4478
+    },
+    {
+      "epoch": 0.12358546856730945,
+      "grad_norm": 0.003219869453459978,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 4479
+    },
+    {
+      "epoch": 0.12361306076837382,
+      "grad_norm": 0.002801501424983144,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 4480
+    },
+    {
+      "epoch": 0.12364065296943819,
+      "grad_norm": 0.004376853816211224,
+      "learning_rate": 0.001,
+      "loss": 0.3591,
+      "step": 4481
+    },
+    {
+      "epoch": 0.12366824517050255,
+      "grad_norm": 0.0027625143993645906,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 4482
+    },
+    {
+      "epoch": 0.12369583737156693,
+      "grad_norm": 0.0024436269886791706,
+      "learning_rate": 0.001,
+      "loss": 0.4358,
+      "step": 4483
+    },
+    {
+      "epoch": 0.1237234295726313,
+      "grad_norm": 0.0035639768466353416,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 4484
+    },
+    {
+      "epoch": 0.12375102177369567,
+      "grad_norm": 0.0029148710891604424,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 4485
+    },
+    {
+      "epoch": 0.12377861397476003,
+      "grad_norm": 0.0025241354014724493,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 4486
+    },
+    {
+      "epoch": 0.1238062061758244,
+      "grad_norm": 0.003036417765542865,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 4487
+    },
+    {
+      "epoch": 0.12383379837688878,
+      "grad_norm": 0.002688635839149356,
+      "learning_rate": 0.001,
+      "loss": 0.3572,
+      "step": 4488
+    },
+    {
+      "epoch": 0.12386139057795315,
+      "grad_norm": 0.0027762525714933872,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 4489
+    },
+    {
+      "epoch": 0.12388898277901751,
+      "grad_norm": 0.0030563059262931347,
+      "learning_rate": 0.001,
+      "loss": 0.3528,
+      "step": 4490
+    },
+    {
+      "epoch": 0.12391657498008188,
+      "grad_norm": 0.006547185592353344,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 4491
+    },
+    {
+      "epoch": 0.12394416718114624,
+      "grad_norm": 0.0036082088481634855,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 4492
+    },
+    {
+      "epoch": 0.12397175938221063,
+      "grad_norm": 0.007952166721224785,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 4493
+    },
+    {
+      "epoch": 0.12399935158327499,
+      "grad_norm": 0.0032326721120625734,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 4494
+    },
+    {
+      "epoch": 0.12402694378433936,
+      "grad_norm": 0.003287110012024641,
+      "learning_rate": 0.001,
+      "loss": 0.3341,
+      "step": 4495
+    },
+    {
+      "epoch": 0.12405453598540372,
+      "grad_norm": 0.0022940190974622965,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 4496
+    },
+    {
+      "epoch": 0.12408212818646809,
+      "grad_norm": 0.007416036911308765,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 4497
+    },
+    {
+      "epoch": 0.12410972038753246,
+      "grad_norm": 0.002889924682676792,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 4498
+    },
+    {
+      "epoch": 0.12413731258859684,
+      "grad_norm": 0.002148964209482074,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 4499
+    },
+    {
+      "epoch": 0.1241649047896612,
+      "grad_norm": 0.004982593934983015,
+      "learning_rate": 0.001,
+      "loss": 0.4168,
+      "step": 4500
+    },
+    {
+      "epoch": 0.1241649047896612,
+      "eval_runtime": 24.5339,
+      "eval_samples_per_second": 1.304,
+      "eval_steps_per_second": 0.163,
+      "step": 4500
+    },
+    {
+      "epoch": 0.12419249699072557,
+      "grad_norm": 0.004382569808512926,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 4501
+    },
+    {
+      "epoch": 0.12422008919178994,
+      "grad_norm": 0.003418430220335722,
+      "learning_rate": 0.001,
+      "loss": 0.4415,
+      "step": 4502
+    },
+    {
+      "epoch": 0.1242476813928543,
+      "grad_norm": 0.003198147751390934,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 4503
+    },
+    {
+      "epoch": 0.12427527359391868,
+      "grad_norm": 0.00246452703140676,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 4504
+    },
+    {
+      "epoch": 0.12430286579498305,
+      "grad_norm": 0.0038448646664619446,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 4505
+    },
+    {
+      "epoch": 0.12433045799604742,
+      "grad_norm": 0.0039044911973178387,
+      "learning_rate": 0.001,
+      "loss": 0.3727,
+      "step": 4506
+    },
+    {
+      "epoch": 0.12435805019711178,
+      "grad_norm": 0.003824865445494652,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 4507
+    },
+    {
+      "epoch": 0.12438564239817615,
+      "grad_norm": 0.003275007475167513,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 4508
+    },
+    {
+      "epoch": 0.12441323459924053,
+      "grad_norm": 0.00457676500082016,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 4509
+    },
+    {
+      "epoch": 0.1244408268003049,
+      "grad_norm": 0.004399159457534552,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 4510
+    },
+    {
+      "epoch": 0.12446841900136926,
+      "grad_norm": 0.0027072380762547255,
+      "learning_rate": 0.001,
+      "loss": 0.4299,
+      "step": 4511
+    },
+    {
+      "epoch": 0.12449601120243363,
+      "grad_norm": 0.0025107194669544697,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 4512
+    },
+    {
+      "epoch": 0.124523603403498,
+      "grad_norm": 0.004047545604407787,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 4513
+    },
+    {
+      "epoch": 0.12455119560456238,
+      "grad_norm": 0.002424485282972455,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 4514
+    },
+    {
+      "epoch": 0.12457878780562674,
+      "grad_norm": 0.00391024025157094,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 4515
+    },
+    {
+      "epoch": 0.12460638000669111,
+      "grad_norm": 0.0026012531016021967,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 4516
+    },
+    {
+      "epoch": 0.12463397220775548,
+      "grad_norm": 0.0023119868710637093,
+      "learning_rate": 0.001,
+      "loss": 0.4309,
+      "step": 4517
+    },
+    {
+      "epoch": 0.12466156440881984,
+      "grad_norm": 0.0023387623950839043,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 4518
+    },
+    {
+      "epoch": 0.12468915660988422,
+      "grad_norm": 0.0031953216530382633,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 4519
+    },
+    {
+      "epoch": 0.12471674881094859,
+      "grad_norm": 0.002653286559507251,
+      "learning_rate": 0.001,
+      "loss": 0.3617,
+      "step": 4520
+    },
+    {
+      "epoch": 0.12474434101201295,
+      "grad_norm": 0.006683747284114361,
+      "learning_rate": 0.001,
+      "loss": 0.3744,
+      "step": 4521
+    },
+    {
+      "epoch": 0.12477193321307732,
+      "grad_norm": 0.002483023563399911,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 4522
+    },
+    {
+      "epoch": 0.12479952541414169,
+      "grad_norm": 0.004147149156779051,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 4523
+    },
+    {
+      "epoch": 0.12482711761520607,
+      "grad_norm": 0.0026530877221375704,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 4524
+    },
+    {
+      "epoch": 0.12485470981627043,
+      "grad_norm": 0.003452074248343706,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 4525
+    },
+    {
+      "epoch": 0.1248823020173348,
+      "grad_norm": 0.0028085915837436914,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 4526
+    },
+    {
+      "epoch": 0.12490989421839917,
+      "grad_norm": 0.0034218342043459415,
+      "learning_rate": 0.001,
+      "loss": 0.4248,
+      "step": 4527
+    },
+    {
+      "epoch": 0.12493748641946353,
+      "grad_norm": 0.007770628668367863,
+      "learning_rate": 0.001,
+      "loss": 0.3624,
+      "step": 4528
+    },
+    {
+      "epoch": 0.12496507862052791,
+      "grad_norm": 0.0029227614868432283,
+      "learning_rate": 0.001,
+      "loss": 0.4236,
+      "step": 4529
+    },
+    {
+      "epoch": 0.12499267082159228,
+      "grad_norm": 0.0030319190118461847,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 4530
+    },
+    {
+      "epoch": 0.12502026302265665,
+      "grad_norm": 0.002946640131995082,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 4531
+    },
+    {
+      "epoch": 0.12504785522372103,
+      "grad_norm": 0.003349126083776355,
+      "learning_rate": 0.001,
+      "loss": 0.4028,
+      "step": 4532
+    },
+    {
+      "epoch": 0.12507544742478538,
+      "grad_norm": 0.0028201621025800705,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 4533
+    },
+    {
+      "epoch": 0.12510303962584976,
+      "grad_norm": 0.002343755681067705,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 4534
+    },
+    {
+      "epoch": 0.1251306318269141,
+      "grad_norm": 0.004897142760455608,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 4535
+    },
+    {
+      "epoch": 0.1251582240279785,
+      "grad_norm": 0.003420259803533554,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 4536
+    },
+    {
+      "epoch": 0.12518581622904287,
+      "grad_norm": 0.005415117833763361,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 4537
+    },
+    {
+      "epoch": 0.12521340843010723,
+      "grad_norm": 0.009312170557677746,
+      "learning_rate": 0.001,
+      "loss": 0.4319,
+      "step": 4538
+    },
+    {
+      "epoch": 0.1252410006311716,
+      "grad_norm": 0.0031622762326151133,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 4539
+    },
+    {
+      "epoch": 0.12526859283223596,
+      "grad_norm": 0.0035899661015719175,
+      "learning_rate": 0.001,
+      "loss": 0.4331,
+      "step": 4540
+    },
+    {
+      "epoch": 0.12529618503330034,
+      "grad_norm": 0.005790769122540951,
+      "learning_rate": 0.001,
+      "loss": 0.3559,
+      "step": 4541
+    },
+    {
+      "epoch": 0.12532377723436472,
+      "grad_norm": 0.003316226415336132,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 4542
+    },
+    {
+      "epoch": 0.12535136943542907,
+      "grad_norm": 0.003180850762873888,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 4543
+    },
+    {
+      "epoch": 0.12537896163649345,
+      "grad_norm": 0.0034962312784045935,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 4544
+    },
+    {
+      "epoch": 0.1254065538375578,
+      "grad_norm": 0.0036736545152962208,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 4545
+    },
+    {
+      "epoch": 0.12543414603862219,
+      "grad_norm": 0.0032101564574986696,
+      "learning_rate": 0.001,
+      "loss": 0.4218,
+      "step": 4546
+    },
+    {
+      "epoch": 0.12546173823968657,
+      "grad_norm": 0.0029373078141361475,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 4547
+    },
+    {
+      "epoch": 0.12548933044075092,
+      "grad_norm": 0.0039803143590688705,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 4548
+    },
+    {
+      "epoch": 0.1255169226418153,
+      "grad_norm": 0.006120176054537296,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 4549
+    },
+    {
+      "epoch": 0.12554451484287965,
+      "grad_norm": 0.004431293345987797,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 4550
+    },
+    {
+      "epoch": 0.12557210704394403,
+      "grad_norm": 0.0034864030312746763,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 4551
+    },
+    {
+      "epoch": 0.1255996992450084,
+      "grad_norm": 0.003251910675317049,
+      "learning_rate": 0.001,
+      "loss": 0.3791,
+      "step": 4552
+    },
+    {
+      "epoch": 0.12562729144607276,
+      "grad_norm": 0.0032173239160329103,
+      "learning_rate": 0.001,
+      "loss": 0.358,
+      "step": 4553
+    },
+    {
+      "epoch": 0.12565488364713714,
+      "grad_norm": 0.002825138159096241,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 4554
+    },
+    {
+      "epoch": 0.1256824758482015,
+      "grad_norm": 0.0024809478782117367,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 4555
+    },
+    {
+      "epoch": 0.12571006804926588,
+      "grad_norm": 0.0038729074876755476,
+      "learning_rate": 0.001,
+      "loss": 0.3765,
+      "step": 4556
+    },
+    {
+      "epoch": 0.12573766025033026,
+      "grad_norm": 0.0027885730378329754,
+      "learning_rate": 0.001,
+      "loss": 0.4325,
+      "step": 4557
+    },
+    {
+      "epoch": 0.1257652524513946,
+      "grad_norm": 0.002833339385688305,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 4558
+    },
+    {
+      "epoch": 0.125792844652459,
+      "grad_norm": 0.00270766019821167,
+      "learning_rate": 0.001,
+      "loss": 0.3694,
+      "step": 4559
+    },
+    {
+      "epoch": 0.12582043685352334,
+      "grad_norm": 0.0024998581502586603,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 4560
+    },
+    {
+      "epoch": 0.12584802905458772,
+      "grad_norm": 0.010215898975729942,
+      "learning_rate": 0.001,
+      "loss": 0.359,
+      "step": 4561
+    },
+    {
+      "epoch": 0.1258756212556521,
+      "grad_norm": 0.015013232827186584,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 4562
+    },
+    {
+      "epoch": 0.12590321345671646,
+      "grad_norm": 0.010075882077217102,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 4563
+    },
+    {
+      "epoch": 0.12593080565778084,
+      "grad_norm": 0.02185934968292713,
+      "learning_rate": 0.001,
+      "loss": 0.3701,
+      "step": 4564
+    },
+    {
+      "epoch": 0.1259583978588452,
+      "grad_norm": 0.003104067873209715,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 4565
+    },
+    {
+      "epoch": 0.12598599005990957,
+      "grad_norm": 0.0032787187956273556,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 4566
+    },
+    {
+      "epoch": 0.12601358226097392,
+      "grad_norm": 0.002555015729740262,
+      "learning_rate": 0.001,
+      "loss": 0.4352,
+      "step": 4567
+    },
+    {
+      "epoch": 0.1260411744620383,
+      "grad_norm": 0.0023251352831721306,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 4568
+    },
+    {
+      "epoch": 0.12606876666310268,
+      "grad_norm": 0.0028923735953867435,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 4569
+    },
+    {
+      "epoch": 0.12609635886416704,
+      "grad_norm": 0.003060357179492712,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 4570
+    },
+    {
+      "epoch": 0.12612395106523142,
+      "grad_norm": 0.0036045622546225786,
+      "learning_rate": 0.001,
+      "loss": 0.3544,
+      "step": 4571
+    },
+    {
+      "epoch": 0.12615154326629577,
+      "grad_norm": 0.002739776624366641,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 4572
+    },
+    {
+      "epoch": 0.12617913546736015,
+      "grad_norm": 0.0037477100268006325,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 4573
+    },
+    {
+      "epoch": 0.12620672766842453,
+      "grad_norm": 0.002796964254230261,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 4574
+    },
+    {
+      "epoch": 0.12623431986948888,
+      "grad_norm": 0.0034001306630671024,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 4575
+    },
+    {
+      "epoch": 0.12626191207055326,
+      "grad_norm": 0.004743472672998905,
+      "learning_rate": 0.001,
+      "loss": 0.4293,
+      "step": 4576
+    },
+    {
+      "epoch": 0.12628950427161761,
+      "grad_norm": 0.0023272617254406214,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 4577
+    },
+    {
+      "epoch": 0.126317096472682,
+      "grad_norm": 0.0036957256961613894,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 4578
+    },
+    {
+      "epoch": 0.12634468867374637,
+      "grad_norm": 0.002441568998619914,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 4579
+    },
+    {
+      "epoch": 0.12637228087481073,
+      "grad_norm": 0.003656483720988035,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 4580
+    },
+    {
+      "epoch": 0.1263998730758751,
+      "grad_norm": 0.005753074306994677,
+      "learning_rate": 0.001,
+      "loss": 0.3647,
+      "step": 4581
+    },
+    {
+      "epoch": 0.12642746527693946,
+      "grad_norm": 0.005195307079702616,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 4582
+    },
+    {
+      "epoch": 0.12645505747800384,
+      "grad_norm": 0.003572377609089017,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 4583
+    },
+    {
+      "epoch": 0.12648264967906822,
+      "grad_norm": 0.006414738483726978,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 4584
+    },
+    {
+      "epoch": 0.12651024188013257,
+      "grad_norm": 0.003181818872690201,
+      "learning_rate": 0.001,
+      "loss": 0.376,
+      "step": 4585
+    },
+    {
+      "epoch": 0.12653783408119695,
+      "grad_norm": 0.0062148310244083405,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 4586
+    },
+    {
+      "epoch": 0.1265654262822613,
+      "grad_norm": 0.00339969783090055,
+      "learning_rate": 0.001,
+      "loss": 0.4389,
+      "step": 4587
+    },
+    {
+      "epoch": 0.1265930184833257,
+      "grad_norm": 0.003279429627582431,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 4588
+    },
+    {
+      "epoch": 0.12662061068439007,
+      "grad_norm": 0.003928397316485643,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 4589
+    },
+    {
+      "epoch": 0.12664820288545442,
+      "grad_norm": 0.0071240440011024475,
+      "learning_rate": 0.001,
+      "loss": 0.371,
+      "step": 4590
+    },
+    {
+      "epoch": 0.1266757950865188,
+      "grad_norm": 0.0035491851158440113,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 4591
+    },
+    {
+      "epoch": 0.12670338728758315,
+      "grad_norm": 0.003695262363180518,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 4592
+    },
+    {
+      "epoch": 0.12673097948864753,
+      "grad_norm": 0.002730879234150052,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 4593
+    },
+    {
+      "epoch": 0.1267585716897119,
+      "grad_norm": 0.0026994007639586926,
+      "learning_rate": 0.001,
+      "loss": 0.4262,
+      "step": 4594
+    },
+    {
+      "epoch": 0.12678616389077627,
+      "grad_norm": 0.0025865414645522833,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 4595
+    },
+    {
+      "epoch": 0.12681375609184065,
+      "grad_norm": 0.003949947189539671,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 4596
+    },
+    {
+      "epoch": 0.126841348292905,
+      "grad_norm": 0.0021580797620117664,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 4597
+    },
+    {
+      "epoch": 0.12686894049396938,
+      "grad_norm": 0.003605265635997057,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 4598
+    },
+    {
+      "epoch": 0.12689653269503376,
+      "grad_norm": 0.0027530419174581766,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 4599
+    },
+    {
+      "epoch": 0.1269241248960981,
+      "grad_norm": 0.002252806443721056,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 4600
+    },
+    {
+      "epoch": 0.1269517170971625,
+      "grad_norm": 0.0026294796261936426,
+      "learning_rate": 0.001,
+      "loss": 0.4238,
+      "step": 4601
+    },
+    {
+      "epoch": 0.12697930929822684,
+      "grad_norm": 0.00271353917196393,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 4602
+    },
+    {
+      "epoch": 0.12700690149929122,
+      "grad_norm": 0.004706821870058775,
+      "learning_rate": 0.001,
+      "loss": 0.3532,
+      "step": 4603
+    },
+    {
+      "epoch": 0.1270344937003556,
+      "grad_norm": 0.0026412513107061386,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 4604
+    },
+    {
+      "epoch": 0.12706208590141996,
+      "grad_norm": 0.00251567829400301,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 4605
+    },
+    {
+      "epoch": 0.12708967810248434,
+      "grad_norm": 0.003459150902926922,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 4606
+    },
+    {
+      "epoch": 0.1271172703035487,
+      "grad_norm": 0.002293146215379238,
+      "learning_rate": 0.001,
+      "loss": 0.3427,
+      "step": 4607
+    },
+    {
+      "epoch": 0.12714486250461307,
+      "grad_norm": 0.0029938959050923586,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 4608
+    },
+    {
+      "epoch": 0.12717245470567745,
+      "grad_norm": 0.0036902104038745165,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 4609
+    },
+    {
+      "epoch": 0.1272000469067418,
+      "grad_norm": 0.004806086421012878,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 4610
+    },
+    {
+      "epoch": 0.12722763910780618,
+      "grad_norm": 0.002169287297874689,
+      "learning_rate": 0.001,
+      "loss": 0.4396,
+      "step": 4611
+    },
+    {
+      "epoch": 0.12725523130887054,
+      "grad_norm": 0.0030692543368786573,
+      "learning_rate": 0.001,
+      "loss": 0.3516,
+      "step": 4612
+    },
+    {
+      "epoch": 0.12728282350993492,
+      "grad_norm": 0.004069609101861715,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 4613
+    },
+    {
+      "epoch": 0.1273104157109993,
+      "grad_norm": 0.0051890406757593155,
+      "learning_rate": 0.001,
+      "loss": 0.3719,
+      "step": 4614
+    },
+    {
+      "epoch": 0.12733800791206365,
+      "grad_norm": 0.002785927150398493,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 4615
+    },
+    {
+      "epoch": 0.12736560011312803,
+      "grad_norm": 0.0058522410690784454,
+      "learning_rate": 0.001,
+      "loss": 0.417,
+      "step": 4616
+    },
+    {
+      "epoch": 0.12739319231419238,
+      "grad_norm": 0.0026462904643267393,
+      "learning_rate": 0.001,
+      "loss": 0.4455,
+      "step": 4617
+    },
+    {
+      "epoch": 0.12742078451525676,
+      "grad_norm": 0.012372707948088646,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 4618
+    },
+    {
+      "epoch": 0.12744837671632114,
+      "grad_norm": 0.003128821961581707,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 4619
+    },
+    {
+      "epoch": 0.1274759689173855,
+      "grad_norm": 0.004159901756793261,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 4620
+    },
+    {
+      "epoch": 0.12750356111844988,
+      "grad_norm": 0.00412391172721982,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 4621
+    },
+    {
+      "epoch": 0.12753115331951423,
+      "grad_norm": 0.002524176612496376,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 4622
+    },
+    {
+      "epoch": 0.1275587455205786,
+      "grad_norm": 0.004930204711854458,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 4623
+    },
+    {
+      "epoch": 0.127586337721643,
+      "grad_norm": 0.003738861531019211,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 4624
+    },
+    {
+      "epoch": 0.12761392992270734,
+      "grad_norm": 0.0171899925917387,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 4625
+    },
+    {
+      "epoch": 0.12764152212377172,
+      "grad_norm": 0.018499117344617844,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 4626
+    },
+    {
+      "epoch": 0.12766911432483607,
+      "grad_norm": 0.0035764595959335566,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 4627
+    },
+    {
+      "epoch": 0.12769670652590046,
+      "grad_norm": 0.00345508917234838,
+      "learning_rate": 0.001,
+      "loss": 0.3695,
+      "step": 4628
+    },
+    {
+      "epoch": 0.12772429872696484,
+      "grad_norm": 0.0037645609118044376,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 4629
+    },
+    {
+      "epoch": 0.1277518909280292,
+      "grad_norm": 0.0043193078599870205,
+      "learning_rate": 0.001,
+      "loss": 0.3713,
+      "step": 4630
+    },
+    {
+      "epoch": 0.12777948312909357,
+      "grad_norm": 0.0038787908852100372,
+      "learning_rate": 0.001,
+      "loss": 0.4257,
+      "step": 4631
+    },
+    {
+      "epoch": 0.12780707533015792,
+      "grad_norm": 0.007828318513929844,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 4632
+    },
+    {
+      "epoch": 0.1278346675312223,
+      "grad_norm": 0.0029059904627501965,
+      "learning_rate": 0.001,
+      "loss": 0.4446,
+      "step": 4633
+    },
+    {
+      "epoch": 0.12786225973228668,
+      "grad_norm": 0.005105176474899054,
+      "learning_rate": 0.001,
+      "loss": 0.3566,
+      "step": 4634
+    },
+    {
+      "epoch": 0.12788985193335103,
+      "grad_norm": 0.005164528265595436,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 4635
+    },
+    {
+      "epoch": 0.12791744413441541,
+      "grad_norm": 0.003127202857285738,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 4636
+    },
+    {
+      "epoch": 0.12794503633547977,
+      "grad_norm": 0.004962852690368891,
+      "learning_rate": 0.001,
+      "loss": 0.3678,
+      "step": 4637
+    },
+    {
+      "epoch": 0.12797262853654415,
+      "grad_norm": 0.003928507212549448,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 4638
+    },
+    {
+      "epoch": 0.12800022073760853,
+      "grad_norm": 0.002523329108953476,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 4639
+    },
+    {
+      "epoch": 0.12802781293867288,
+      "grad_norm": 0.003745138179510832,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 4640
+    },
+    {
+      "epoch": 0.12805540513973726,
+      "grad_norm": 0.007446894887834787,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 4641
+    },
+    {
+      "epoch": 0.1280829973408016,
+      "grad_norm": 0.0024344183038920164,
+      "learning_rate": 0.001,
+      "loss": 0.4278,
+      "step": 4642
+    },
+    {
+      "epoch": 0.128110589541866,
+      "grad_norm": 0.0059473030269145966,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 4643
+    },
+    {
+      "epoch": 0.12813818174293037,
+      "grad_norm": 0.00471229525282979,
+      "learning_rate": 0.001,
+      "loss": 0.3605,
+      "step": 4644
+    },
+    {
+      "epoch": 0.12816577394399473,
+      "grad_norm": 0.004949926398694515,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 4645
+    },
+    {
+      "epoch": 0.1281933661450591,
+      "grad_norm": 0.0033453593496233225,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 4646
+    },
+    {
+      "epoch": 0.12822095834612346,
+      "grad_norm": 0.003853026544675231,
+      "learning_rate": 0.001,
+      "loss": 0.4356,
+      "step": 4647
+    },
+    {
+      "epoch": 0.12824855054718784,
+      "grad_norm": 0.0037174660246819258,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 4648
+    },
+    {
+      "epoch": 0.12827614274825222,
+      "grad_norm": 0.002829108154401183,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 4649
+    },
+    {
+      "epoch": 0.12830373494931657,
+      "grad_norm": 0.002781393239274621,
+      "learning_rate": 0.001,
+      "loss": 0.4145,
+      "step": 4650
+    },
+    {
+      "epoch": 0.12833132715038095,
+      "grad_norm": 0.012161584571003914,
+      "learning_rate": 0.001,
+      "loss": 0.4238,
+      "step": 4651
+    },
+    {
+      "epoch": 0.1283589193514453,
+      "grad_norm": 0.004447166807949543,
+      "learning_rate": 0.001,
+      "loss": 0.3697,
+      "step": 4652
+    },
+    {
+      "epoch": 0.12838651155250969,
+      "grad_norm": 0.010521035641431808,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 4653
+    },
+    {
+      "epoch": 0.12841410375357407,
+      "grad_norm": 0.0034564679954200983,
+      "learning_rate": 0.001,
+      "loss": 0.4372,
+      "step": 4654
+    },
+    {
+      "epoch": 0.12844169595463842,
+      "grad_norm": 0.003944997675716877,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 4655
+    },
+    {
+      "epoch": 0.1284692881557028,
+      "grad_norm": 0.0037498660385608673,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 4656
+    },
+    {
+      "epoch": 0.12849688035676715,
+      "grad_norm": 0.0032498843502253294,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 4657
+    },
+    {
+      "epoch": 0.12852447255783153,
+      "grad_norm": 0.007645866367965937,
+      "learning_rate": 0.001,
+      "loss": 0.3664,
+      "step": 4658
+    },
+    {
+      "epoch": 0.12855206475889588,
+      "grad_norm": 0.006332173477858305,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 4659
+    },
+    {
+      "epoch": 0.12857965695996026,
+      "grad_norm": 0.004206281155347824,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 4660
+    },
+    {
+      "epoch": 0.12860724916102464,
+      "grad_norm": 0.0026335117872804403,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 4661
+    },
+    {
+      "epoch": 0.128634841362089,
+      "grad_norm": 0.004516151268035173,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 4662
+    },
+    {
+      "epoch": 0.12866243356315338,
+      "grad_norm": 0.0032997119706124067,
+      "learning_rate": 0.001,
+      "loss": 0.4237,
+      "step": 4663
+    },
+    {
+      "epoch": 0.12869002576421773,
+      "grad_norm": 0.0038388820830732584,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 4664
+    },
+    {
+      "epoch": 0.1287176179652821,
+      "grad_norm": 0.0035206389147788286,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 4665
+    },
+    {
+      "epoch": 0.1287452101663465,
+      "grad_norm": 0.005415271036326885,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 4666
+    },
+    {
+      "epoch": 0.12877280236741084,
+      "grad_norm": 0.002756484318524599,
+      "learning_rate": 0.001,
+      "loss": 0.3805,
+      "step": 4667
+    },
+    {
+      "epoch": 0.12880039456847522,
+      "grad_norm": 0.0028146097902208567,
+      "learning_rate": 0.001,
+      "loss": 0.4216,
+      "step": 4668
+    },
+    {
+      "epoch": 0.12882798676953958,
+      "grad_norm": 0.004214905202388763,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 4669
+    },
+    {
+      "epoch": 0.12885557897060396,
+      "grad_norm": 0.00516669312492013,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 4670
+    },
+    {
+      "epoch": 0.12888317117166834,
+      "grad_norm": 0.0031740644481033087,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 4671
+    },
+    {
+      "epoch": 0.1289107633727327,
+      "grad_norm": 0.004468605387955904,
+      "learning_rate": 0.001,
+      "loss": 0.4262,
+      "step": 4672
+    },
+    {
+      "epoch": 0.12893835557379707,
+      "grad_norm": 0.003522511338815093,
+      "learning_rate": 0.001,
+      "loss": 0.4624,
+      "step": 4673
+    },
+    {
+      "epoch": 0.12896594777486142,
+      "grad_norm": 0.002765173325315118,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 4674
+    },
+    {
+      "epoch": 0.1289935399759258,
+      "grad_norm": 0.0032698458526283503,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 4675
+    },
+    {
+      "epoch": 0.12902113217699018,
+      "grad_norm": 0.0034274624194949865,
+      "learning_rate": 0.001,
+      "loss": 0.3779,
+      "step": 4676
+    },
+    {
+      "epoch": 0.12904872437805454,
+      "grad_norm": 0.003263387130573392,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 4677
+    },
+    {
+      "epoch": 0.12907631657911892,
+      "grad_norm": 0.0030025255400687456,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 4678
+    },
+    {
+      "epoch": 0.12910390878018327,
+      "grad_norm": 0.003843993414193392,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 4679
+    },
+    {
+      "epoch": 0.12913150098124765,
+      "grad_norm": 0.0030434427317231894,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 4680
+    },
+    {
+      "epoch": 0.12915909318231203,
+      "grad_norm": 0.0034381363075226545,
+      "learning_rate": 0.001,
+      "loss": 0.3714,
+      "step": 4681
+    },
+    {
+      "epoch": 0.12918668538337638,
+      "grad_norm": 0.0032730584498494864,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 4682
+    },
+    {
+      "epoch": 0.12921427758444076,
+      "grad_norm": 0.004053408745676279,
+      "learning_rate": 0.001,
+      "loss": 0.3626,
+      "step": 4683
+    },
+    {
+      "epoch": 0.12924186978550511,
+      "grad_norm": 0.003272439120337367,
+      "learning_rate": 0.001,
+      "loss": 0.4149,
+      "step": 4684
+    },
+    {
+      "epoch": 0.1292694619865695,
+      "grad_norm": 0.0030873471405357122,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 4685
+    },
+    {
+      "epoch": 0.12929705418763388,
+      "grad_norm": 0.0030633402056992054,
+      "learning_rate": 0.001,
+      "loss": 0.4443,
+      "step": 4686
+    },
+    {
+      "epoch": 0.12932464638869823,
+      "grad_norm": 0.003019932424649596,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 4687
+    },
+    {
+      "epoch": 0.1293522385897626,
+      "grad_norm": 0.0027192363049834967,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 4688
+    },
+    {
+      "epoch": 0.12937983079082696,
+      "grad_norm": 0.003051141509786248,
+      "learning_rate": 0.001,
+      "loss": 0.4528,
+      "step": 4689
+    },
+    {
+      "epoch": 0.12940742299189134,
+      "grad_norm": 0.004393309820443392,
+      "learning_rate": 0.001,
+      "loss": 0.3668,
+      "step": 4690
+    },
+    {
+      "epoch": 0.12943501519295572,
+      "grad_norm": 0.004716060124337673,
+      "learning_rate": 0.001,
+      "loss": 0.3668,
+      "step": 4691
+    },
+    {
+      "epoch": 0.12946260739402007,
+      "grad_norm": 0.0047832694835960865,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 4692
+    },
+    {
+      "epoch": 0.12949019959508445,
+      "grad_norm": 0.0040343874134123325,
+      "learning_rate": 0.001,
+      "loss": 0.4414,
+      "step": 4693
+    },
+    {
+      "epoch": 0.1295177917961488,
+      "grad_norm": 0.006127424072474241,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 4694
+    },
+    {
+      "epoch": 0.1295453839972132,
+      "grad_norm": 0.002882004715502262,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 4695
+    },
+    {
+      "epoch": 0.12957297619827757,
+      "grad_norm": 0.003973776008933783,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 4696
+    },
+    {
+      "epoch": 0.12960056839934192,
+      "grad_norm": 0.0027190132532268763,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 4697
+    },
+    {
+      "epoch": 0.1296281606004063,
+      "grad_norm": 0.0025696575175970793,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 4698
+    },
+    {
+      "epoch": 0.12965575280147065,
+      "grad_norm": 0.0032083040568977594,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 4699
+    },
+    {
+      "epoch": 0.12968334500253503,
+      "grad_norm": 0.0025488468818366528,
+      "learning_rate": 0.001,
+      "loss": 0.4216,
+      "step": 4700
+    },
+    {
+      "epoch": 0.1297109372035994,
+      "grad_norm": 0.009488187730312347,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 4701
+    },
+    {
+      "epoch": 0.12973852940466377,
+      "grad_norm": 0.0029801761265844107,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 4702
+    },
+    {
+      "epoch": 0.12976612160572815,
+      "grad_norm": 0.0247980747371912,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 4703
+    },
+    {
+      "epoch": 0.1297937138067925,
+      "grad_norm": 0.002381372032687068,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 4704
+    },
+    {
+      "epoch": 0.12982130600785688,
+      "grad_norm": 0.0035768726374953985,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 4705
+    },
+    {
+      "epoch": 0.12984889820892126,
+      "grad_norm": 0.003190788673236966,
+      "learning_rate": 0.001,
+      "loss": 0.4226,
+      "step": 4706
+    },
+    {
+      "epoch": 0.1298764904099856,
+      "grad_norm": 0.00331826857291162,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 4707
+    },
+    {
+      "epoch": 0.12990408261105,
+      "grad_norm": 0.0032035168260335922,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 4708
+    },
+    {
+      "epoch": 0.12993167481211434,
+      "grad_norm": 0.0037546653766185045,
+      "learning_rate": 0.001,
+      "loss": 0.3856,
+      "step": 4709
+    },
+    {
+      "epoch": 0.12995926701317873,
+      "grad_norm": 0.0026541994884610176,
+      "learning_rate": 0.001,
+      "loss": 0.4312,
+      "step": 4710
+    },
+    {
+      "epoch": 0.1299868592142431,
+      "grad_norm": 0.004107889253646135,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 4711
+    },
+    {
+      "epoch": 0.13001445141530746,
+      "grad_norm": 0.002514521824195981,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 4712
+    },
+    {
+      "epoch": 0.13004204361637184,
+      "grad_norm": 0.0028199944645166397,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 4713
+    },
+    {
+      "epoch": 0.1300696358174362,
+      "grad_norm": 0.0027020128909498453,
+      "learning_rate": 0.001,
+      "loss": 0.449,
+      "step": 4714
+    },
+    {
+      "epoch": 0.13009722801850057,
+      "grad_norm": 0.003255255287513137,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 4715
+    },
+    {
+      "epoch": 0.13012482021956495,
+      "grad_norm": 0.0035547774750739336,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 4716
+    },
+    {
+      "epoch": 0.1301524124206293,
+      "grad_norm": 0.0025250576436519623,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 4717
+    },
+    {
+      "epoch": 0.13018000462169368,
+      "grad_norm": 0.002805963857099414,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 4718
+    },
+    {
+      "epoch": 0.13020759682275804,
+      "grad_norm": 0.005695881322026253,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 4719
+    },
+    {
+      "epoch": 0.13023518902382242,
+      "grad_norm": 0.002897542668506503,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 4720
+    },
+    {
+      "epoch": 0.1302627812248868,
+      "grad_norm": 0.0037734811194241047,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 4721
+    },
+    {
+      "epoch": 0.13029037342595115,
+      "grad_norm": 0.003529176115989685,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 4722
+    },
+    {
+      "epoch": 0.13031796562701553,
+      "grad_norm": 0.003130922093987465,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 4723
+    },
+    {
+      "epoch": 0.13034555782807988,
+      "grad_norm": 0.0031659335363656282,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 4724
+    },
+    {
+      "epoch": 0.13037315002914426,
+      "grad_norm": 0.002901850501075387,
+      "learning_rate": 0.001,
+      "loss": 0.3685,
+      "step": 4725
+    },
+    {
+      "epoch": 0.13040074223020864,
+      "grad_norm": 0.0034328114707022905,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 4726
+    },
+    {
+      "epoch": 0.130428334431273,
+      "grad_norm": 0.0028659338131546974,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 4727
+    },
+    {
+      "epoch": 0.13045592663233738,
+      "grad_norm": 0.002500742208212614,
+      "learning_rate": 0.001,
+      "loss": 0.431,
+      "step": 4728
+    },
+    {
+      "epoch": 0.13048351883340173,
+      "grad_norm": 0.004093059338629246,
+      "learning_rate": 0.001,
+      "loss": 0.4415,
+      "step": 4729
+    },
+    {
+      "epoch": 0.1305111110344661,
+      "grad_norm": 0.006217313464730978,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 4730
+    },
+    {
+      "epoch": 0.1305387032355305,
+      "grad_norm": 0.003967654425650835,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 4731
+    },
+    {
+      "epoch": 0.13056629543659484,
+      "grad_norm": 0.002272986341267824,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 4732
+    },
+    {
+      "epoch": 0.13059388763765922,
+      "grad_norm": 0.003741595894098282,
+      "learning_rate": 0.001,
+      "loss": 0.35,
+      "step": 4733
+    },
+    {
+      "epoch": 0.13062147983872358,
+      "grad_norm": 0.004285778850317001,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 4734
+    },
+    {
+      "epoch": 0.13064907203978796,
+      "grad_norm": 0.003829494584351778,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 4735
+    },
+    {
+      "epoch": 0.13067666424085234,
+      "grad_norm": 0.0037094939034432173,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 4736
+    },
+    {
+      "epoch": 0.1307042564419167,
+      "grad_norm": 0.005124232266098261,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 4737
+    },
+    {
+      "epoch": 0.13073184864298107,
+      "grad_norm": 0.0033820930402725935,
+      "learning_rate": 0.001,
+      "loss": 0.3564,
+      "step": 4738
+    },
+    {
+      "epoch": 0.13075944084404542,
+      "grad_norm": 0.004075558390468359,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 4739
+    },
+    {
+      "epoch": 0.1307870330451098,
+      "grad_norm": 0.003454482415691018,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 4740
+    },
+    {
+      "epoch": 0.13081462524617418,
+      "grad_norm": 0.003381206886842847,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 4741
+    },
+    {
+      "epoch": 0.13084221744723853,
+      "grad_norm": 0.0028117720503360033,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 4742
+    },
+    {
+      "epoch": 0.13086980964830291,
+      "grad_norm": 0.002668531145900488,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 4743
+    },
+    {
+      "epoch": 0.13089740184936727,
+      "grad_norm": 0.003135831095278263,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 4744
+    },
+    {
+      "epoch": 0.13092499405043165,
+      "grad_norm": 0.008101669140160084,
+      "learning_rate": 0.001,
+      "loss": 0.3729,
+      "step": 4745
+    },
+    {
+      "epoch": 0.13095258625149603,
+      "grad_norm": 0.0035032073501497507,
+      "learning_rate": 0.001,
+      "loss": 0.4256,
+      "step": 4746
+    },
+    {
+      "epoch": 0.13098017845256038,
+      "grad_norm": 0.003943659830838442,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 4747
+    },
+    {
+      "epoch": 0.13100777065362476,
+      "grad_norm": 0.0026659085415303707,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 4748
+    },
+    {
+      "epoch": 0.1310353628546891,
+      "grad_norm": 0.004375234711915255,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 4749
+    },
+    {
+      "epoch": 0.1310629550557535,
+      "grad_norm": 0.0033472348004579544,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 4750
+    },
+    {
+      "epoch": 0.13109054725681787,
+      "grad_norm": 0.005840728525072336,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 4751
+    },
+    {
+      "epoch": 0.13111813945788223,
+      "grad_norm": 0.0032132677733898163,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 4752
+    },
+    {
+      "epoch": 0.1311457316589466,
+      "grad_norm": 0.002400551922619343,
+      "learning_rate": 0.001,
+      "loss": 0.446,
+      "step": 4753
+    },
+    {
+      "epoch": 0.13117332386001096,
+      "grad_norm": 0.0028581595979630947,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 4754
+    },
+    {
+      "epoch": 0.13120091606107534,
+      "grad_norm": 0.0029292525723576546,
+      "learning_rate": 0.001,
+      "loss": 0.3745,
+      "step": 4755
+    },
+    {
+      "epoch": 0.1312285082621397,
+      "grad_norm": 0.004841271787881851,
+      "learning_rate": 0.001,
+      "loss": 0.4376,
+      "step": 4756
+    },
+    {
+      "epoch": 0.13125610046320407,
+      "grad_norm": 0.004950361791998148,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 4757
+    },
+    {
+      "epoch": 0.13128369266426845,
+      "grad_norm": 0.003650795202702284,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 4758
+    },
+    {
+      "epoch": 0.1313112848653328,
+      "grad_norm": 0.005590515211224556,
+      "learning_rate": 0.001,
+      "loss": 0.3603,
+      "step": 4759
+    },
+    {
+      "epoch": 0.13133887706639719,
+      "grad_norm": 0.004784191958606243,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 4760
+    },
+    {
+      "epoch": 0.13136646926746154,
+      "grad_norm": 0.006027446128427982,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 4761
+    },
+    {
+      "epoch": 0.13139406146852592,
+      "grad_norm": 0.0037396312691271305,
+      "learning_rate": 0.001,
+      "loss": 0.3694,
+      "step": 4762
+    },
+    {
+      "epoch": 0.1314216536695903,
+      "grad_norm": 0.0032113094348460436,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 4763
+    },
+    {
+      "epoch": 0.13144924587065465,
+      "grad_norm": 0.00637141102924943,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 4764
+    },
+    {
+      "epoch": 0.13147683807171903,
+      "grad_norm": 0.005106314085423946,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 4765
+    },
+    {
+      "epoch": 0.13150443027278338,
+      "grad_norm": 0.00308993854559958,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 4766
+    },
+    {
+      "epoch": 0.13153202247384777,
+      "grad_norm": 0.007722698617726564,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 4767
+    },
+    {
+      "epoch": 0.13155961467491215,
+      "grad_norm": 0.00279480149038136,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 4768
+    },
+    {
+      "epoch": 0.1315872068759765,
+      "grad_norm": 0.00339706614613533,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 4769
+    },
+    {
+      "epoch": 0.13161479907704088,
+      "grad_norm": 0.0045981621369719505,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 4770
+    },
+    {
+      "epoch": 0.13164239127810523,
+      "grad_norm": 0.0029820918571203947,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 4771
+    },
+    {
+      "epoch": 0.1316699834791696,
+      "grad_norm": 0.0026766203809529543,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 4772
+    },
+    {
+      "epoch": 0.131697575680234,
+      "grad_norm": 0.004069194197654724,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 4773
+    },
+    {
+      "epoch": 0.13172516788129834,
+      "grad_norm": 0.005836902651935816,
+      "learning_rate": 0.001,
+      "loss": 0.3624,
+      "step": 4774
+    },
+    {
+      "epoch": 0.13175276008236272,
+      "grad_norm": 0.002671006368473172,
+      "learning_rate": 0.001,
+      "loss": 0.4289,
+      "step": 4775
+    },
+    {
+      "epoch": 0.13178035228342708,
+      "grad_norm": 0.0025038421154022217,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 4776
+    },
+    {
+      "epoch": 0.13180794448449146,
+      "grad_norm": 0.0062514557503163815,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 4777
+    },
+    {
+      "epoch": 0.13183553668555584,
+      "grad_norm": 0.005402828101068735,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 4778
+    },
+    {
+      "epoch": 0.1318631288866202,
+      "grad_norm": 0.005721025634557009,
+      "learning_rate": 0.001,
+      "loss": 0.359,
+      "step": 4779
+    },
+    {
+      "epoch": 0.13189072108768457,
+      "grad_norm": 0.0032702479511499405,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 4780
+    },
+    {
+      "epoch": 0.13191831328874892,
+      "grad_norm": 0.0022660638205707073,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 4781
+    },
+    {
+      "epoch": 0.1319459054898133,
+      "grad_norm": 0.008734694682061672,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 4782
+    },
+    {
+      "epoch": 0.13197349769087768,
+      "grad_norm": 0.0025738070253282785,
+      "learning_rate": 0.001,
+      "loss": 0.4214,
+      "step": 4783
+    },
+    {
+      "epoch": 0.13200108989194204,
+      "grad_norm": 0.004817556589841843,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 4784
+    },
+    {
+      "epoch": 0.13202868209300642,
+      "grad_norm": 0.0032889090944081545,
+      "learning_rate": 0.001,
+      "loss": 0.3642,
+      "step": 4785
+    },
+    {
+      "epoch": 0.13205627429407077,
+      "grad_norm": 0.002885392401367426,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 4786
+    },
+    {
+      "epoch": 0.13208386649513515,
+      "grad_norm": 0.002848939271643758,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 4787
+    },
+    {
+      "epoch": 0.13211145869619953,
+      "grad_norm": 0.004833425395190716,
+      "learning_rate": 0.001,
+      "loss": 0.368,
+      "step": 4788
+    },
+    {
+      "epoch": 0.13213905089726388,
+      "grad_norm": 0.005285562947392464,
+      "learning_rate": 0.001,
+      "loss": 0.4323,
+      "step": 4789
+    },
+    {
+      "epoch": 0.13216664309832826,
+      "grad_norm": 0.002420694101601839,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 4790
+    },
+    {
+      "epoch": 0.13219423529939262,
+      "grad_norm": 0.0041345711797475815,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 4791
+    },
+    {
+      "epoch": 0.132221827500457,
+      "grad_norm": 0.00341211399063468,
+      "learning_rate": 0.001,
+      "loss": 0.441,
+      "step": 4792
+    },
+    {
+      "epoch": 0.13224941970152138,
+      "grad_norm": 0.003937454894185066,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 4793
+    },
+    {
+      "epoch": 0.13227701190258573,
+      "grad_norm": 0.0027174679562449455,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 4794
+    },
+    {
+      "epoch": 0.1323046041036501,
+      "grad_norm": 0.004002917557954788,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 4795
+    },
+    {
+      "epoch": 0.13233219630471446,
+      "grad_norm": 0.0035276333801448345,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 4796
+    },
+    {
+      "epoch": 0.13235978850577884,
+      "grad_norm": 0.0030461258720606565,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 4797
+    },
+    {
+      "epoch": 0.13238738070684322,
+      "grad_norm": 0.003558572381734848,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 4798
+    },
+    {
+      "epoch": 0.13241497290790757,
+      "grad_norm": 0.00515527231618762,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 4799
+    },
+    {
+      "epoch": 0.13244256510897195,
+      "grad_norm": 0.0030274007003754377,
+      "learning_rate": 0.001,
+      "loss": 0.3776,
+      "step": 4800
+    },
+    {
+      "epoch": 0.1324701573100363,
+      "grad_norm": 0.0033126547932624817,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 4801
+    },
+    {
+      "epoch": 0.1324977495111007,
+      "grad_norm": 0.0031248959712684155,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 4802
+    },
+    {
+      "epoch": 0.13252534171216507,
+      "grad_norm": 0.0032562301494181156,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 4803
+    },
+    {
+      "epoch": 0.13255293391322942,
+      "grad_norm": 0.005781130399554968,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 4804
+    },
+    {
+      "epoch": 0.1325805261142938,
+      "grad_norm": 0.0039030571933835745,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 4805
+    },
+    {
+      "epoch": 0.13260811831535815,
+      "grad_norm": 0.002995892893522978,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 4806
+    },
+    {
+      "epoch": 0.13263571051642253,
+      "grad_norm": 0.002603591652587056,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 4807
+    },
+    {
+      "epoch": 0.13266330271748691,
+      "grad_norm": 0.00366300530731678,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 4808
+    },
+    {
+      "epoch": 0.13269089491855127,
+      "grad_norm": 0.0033524134196341038,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 4809
+    },
+    {
+      "epoch": 0.13271848711961565,
+      "grad_norm": 0.003584874328225851,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 4810
+    },
+    {
+      "epoch": 0.13274607932068,
+      "grad_norm": 0.004354959353804588,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 4811
+    },
+    {
+      "epoch": 0.13277367152174438,
+      "grad_norm": 0.0041418191976845264,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 4812
+    },
+    {
+      "epoch": 0.13280126372280876,
+      "grad_norm": 0.0028464416973292828,
+      "learning_rate": 0.001,
+      "loss": 0.3683,
+      "step": 4813
+    },
+    {
+      "epoch": 0.1328288559238731,
+      "grad_norm": 0.0049109673127532005,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 4814
+    },
+    {
+      "epoch": 0.1328564481249375,
+      "grad_norm": 0.0039614904671907425,
+      "learning_rate": 0.001,
+      "loss": 0.3736,
+      "step": 4815
+    },
+    {
+      "epoch": 0.13288404032600185,
+      "grad_norm": 0.008978486992418766,
+      "learning_rate": 0.001,
+      "loss": 0.3636,
+      "step": 4816
+    },
+    {
+      "epoch": 0.13291163252706623,
+      "grad_norm": 0.006667922250926495,
+      "learning_rate": 0.001,
+      "loss": 0.417,
+      "step": 4817
+    },
+    {
+      "epoch": 0.1329392247281306,
+      "grad_norm": 0.004778092727065086,
+      "learning_rate": 0.001,
+      "loss": 0.4362,
+      "step": 4818
+    },
+    {
+      "epoch": 0.13296681692919496,
+      "grad_norm": 0.0056798397563397884,
+      "learning_rate": 0.001,
+      "loss": 0.4426,
+      "step": 4819
+    },
+    {
+      "epoch": 0.13299440913025934,
+      "grad_norm": 0.0033784289844334126,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 4820
+    },
+    {
+      "epoch": 0.1330220013313237,
+      "grad_norm": 0.002719793003052473,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 4821
+    },
+    {
+      "epoch": 0.13304959353238807,
+      "grad_norm": 0.003707107389345765,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 4822
+    },
+    {
+      "epoch": 0.13307718573345245,
+      "grad_norm": 0.0044320011511445045,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 4823
+    },
+    {
+      "epoch": 0.1331047779345168,
+      "grad_norm": 0.003244374878704548,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 4824
+    },
+    {
+      "epoch": 0.13313237013558119,
+      "grad_norm": 0.006342526059597731,
+      "learning_rate": 0.001,
+      "loss": 0.4233,
+      "step": 4825
+    },
+    {
+      "epoch": 0.13315996233664554,
+      "grad_norm": 0.002801056718453765,
+      "learning_rate": 0.001,
+      "loss": 0.3753,
+      "step": 4826
+    },
+    {
+      "epoch": 0.13318755453770992,
+      "grad_norm": 0.004005104303359985,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 4827
+    },
+    {
+      "epoch": 0.1332151467387743,
+      "grad_norm": 0.0026726596988737583,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 4828
+    },
+    {
+      "epoch": 0.13324273893983865,
+      "grad_norm": 0.0022171332966536283,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 4829
+    },
+    {
+      "epoch": 0.13327033114090303,
+      "grad_norm": 0.00455888919532299,
+      "learning_rate": 0.001,
+      "loss": 0.3438,
+      "step": 4830
+    },
+    {
+      "epoch": 0.13329792334196738,
+      "grad_norm": 0.0036296145990490913,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 4831
+    },
+    {
+      "epoch": 0.13332551554303176,
+      "grad_norm": 0.0030942729208618402,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 4832
+    },
+    {
+      "epoch": 0.13335310774409614,
+      "grad_norm": 0.0031570447608828545,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 4833
+    },
+    {
+      "epoch": 0.1333806999451605,
+      "grad_norm": 0.0029306053183972836,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 4834
+    },
+    {
+      "epoch": 0.13340829214622488,
+      "grad_norm": 0.002967880107462406,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 4835
+    },
+    {
+      "epoch": 0.13343588434728923,
+      "grad_norm": 0.0028411787934601307,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 4836
+    },
+    {
+      "epoch": 0.1334634765483536,
+      "grad_norm": 0.003896566806361079,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 4837
+    },
+    {
+      "epoch": 0.133491068749418,
+      "grad_norm": 0.0024064083117991686,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 4838
+    },
+    {
+      "epoch": 0.13351866095048234,
+      "grad_norm": 0.0045156353153288364,
+      "learning_rate": 0.001,
+      "loss": 0.4272,
+      "step": 4839
+    },
+    {
+      "epoch": 0.13354625315154672,
+      "grad_norm": 0.002685493789613247,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 4840
+    },
+    {
+      "epoch": 0.13357384535261108,
+      "grad_norm": 0.00366023276001215,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 4841
+    },
+    {
+      "epoch": 0.13360143755367546,
+      "grad_norm": 0.0024847208987921476,
+      "learning_rate": 0.001,
+      "loss": 0.4506,
+      "step": 4842
+    },
+    {
+      "epoch": 0.13362902975473984,
+      "grad_norm": 0.004642703104764223,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 4843
+    },
+    {
+      "epoch": 0.1336566219558042,
+      "grad_norm": 0.004185870289802551,
+      "learning_rate": 0.001,
+      "loss": 0.417,
+      "step": 4844
+    },
+    {
+      "epoch": 0.13368421415686857,
+      "grad_norm": 0.0043914420530200005,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 4845
+    },
+    {
+      "epoch": 0.13371180635793292,
+      "grad_norm": 0.0026118417736142874,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 4846
+    },
+    {
+      "epoch": 0.1337393985589973,
+      "grad_norm": 0.0031845802441239357,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 4847
+    },
+    {
+      "epoch": 0.13376699076006165,
+      "grad_norm": 0.0029715183191001415,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 4848
+    },
+    {
+      "epoch": 0.13379458296112604,
+      "grad_norm": 0.0023027430288493633,
+      "learning_rate": 0.001,
+      "loss": 0.4288,
+      "step": 4849
+    },
+    {
+      "epoch": 0.13382217516219042,
+      "grad_norm": 0.002911708317697048,
+      "learning_rate": 0.001,
+      "loss": 0.425,
+      "step": 4850
+    },
+    {
+      "epoch": 0.13384976736325477,
+      "grad_norm": 0.002629220252856612,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 4851
+    },
+    {
+      "epoch": 0.13387735956431915,
+      "grad_norm": 0.005986600182950497,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 4852
+    },
+    {
+      "epoch": 0.1339049517653835,
+      "grad_norm": 0.006612063851207495,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 4853
+    },
+    {
+      "epoch": 0.13393254396644788,
+      "grad_norm": 0.003582441946491599,
+      "learning_rate": 0.001,
+      "loss": 0.3711,
+      "step": 4854
+    },
+    {
+      "epoch": 0.13396013616751226,
+      "grad_norm": 0.0032464175019413233,
+      "learning_rate": 0.001,
+      "loss": 0.3766,
+      "step": 4855
+    },
+    {
+      "epoch": 0.13398772836857661,
+      "grad_norm": 0.0038717519491910934,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 4856
+    },
+    {
+      "epoch": 0.134015320569641,
+      "grad_norm": 0.00332628283649683,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 4857
+    },
+    {
+      "epoch": 0.13404291277070535,
+      "grad_norm": 0.004962892737239599,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 4858
+    },
+    {
+      "epoch": 0.13407050497176973,
+      "grad_norm": 0.0022890097461640835,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 4859
+    },
+    {
+      "epoch": 0.1340980971728341,
+      "grad_norm": 0.0030799328815191984,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 4860
+    },
+    {
+      "epoch": 0.13412568937389846,
+      "grad_norm": 0.003105921670794487,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 4861
+    },
+    {
+      "epoch": 0.13415328157496284,
+      "grad_norm": 0.0060799745842814445,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 4862
+    },
+    {
+      "epoch": 0.1341808737760272,
+      "grad_norm": 0.005220312625169754,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 4863
+    },
+    {
+      "epoch": 0.13420846597709157,
+      "grad_norm": 0.005123880226165056,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 4864
+    },
+    {
+      "epoch": 0.13423605817815595,
+      "grad_norm": 0.002265162765979767,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 4865
+    },
+    {
+      "epoch": 0.1342636503792203,
+      "grad_norm": 0.004077676683664322,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 4866
+    },
+    {
+      "epoch": 0.1342912425802847,
+      "grad_norm": 0.0035009863786399364,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 4867
+    },
+    {
+      "epoch": 0.13431883478134904,
+      "grad_norm": 0.003494169097393751,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 4868
+    },
+    {
+      "epoch": 0.13434642698241342,
+      "grad_norm": 0.0059214490465819836,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 4869
+    },
+    {
+      "epoch": 0.1343740191834778,
+      "grad_norm": 0.0038102336693555117,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 4870
+    },
+    {
+      "epoch": 0.13440161138454215,
+      "grad_norm": 0.01034005731344223,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 4871
+    },
+    {
+      "epoch": 0.13442920358560653,
+      "grad_norm": 0.003015523310750723,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 4872
+    },
+    {
+      "epoch": 0.13445679578667089,
+      "grad_norm": 0.003311215667054057,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 4873
+    },
+    {
+      "epoch": 0.13448438798773527,
+      "grad_norm": 0.002569688018411398,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 4874
+    },
+    {
+      "epoch": 0.13451198018879965,
+      "grad_norm": 0.002594457007944584,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 4875
+    },
+    {
+      "epoch": 0.134539572389864,
+      "grad_norm": 0.004568415228277445,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 4876
+    },
+    {
+      "epoch": 0.13456716459092838,
+      "grad_norm": 0.0033749649301171303,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 4877
+    },
+    {
+      "epoch": 0.13459475679199273,
+      "grad_norm": 0.003532090689986944,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 4878
+    },
+    {
+      "epoch": 0.1346223489930571,
+      "grad_norm": 0.0033763758838176727,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 4879
+    },
+    {
+      "epoch": 0.1346499411941215,
+      "grad_norm": 0.002815960207954049,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 4880
+    },
+    {
+      "epoch": 0.13467753339518584,
+      "grad_norm": 0.0021568064112216234,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 4881
+    },
+    {
+      "epoch": 0.13470512559625022,
+      "grad_norm": 0.002766069257631898,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 4882
+    },
+    {
+      "epoch": 0.13473271779731458,
+      "grad_norm": 0.004232902079820633,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 4883
+    },
+    {
+      "epoch": 0.13476030999837896,
+      "grad_norm": 0.007129458710551262,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 4884
+    },
+    {
+      "epoch": 0.13478790219944334,
+      "grad_norm": 0.0027489245403558016,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 4885
+    },
+    {
+      "epoch": 0.1348154944005077,
+      "grad_norm": 0.002653737785294652,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 4886
+    },
+    {
+      "epoch": 0.13484308660157207,
+      "grad_norm": 0.0031049742829054594,
+      "learning_rate": 0.001,
+      "loss": 0.4161,
+      "step": 4887
+    },
+    {
+      "epoch": 0.13487067880263642,
+      "grad_norm": 0.002779177390038967,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 4888
+    },
+    {
+      "epoch": 0.1348982710037008,
+      "grad_norm": 0.003438913496211171,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 4889
+    },
+    {
+      "epoch": 0.13492586320476518,
+      "grad_norm": 0.003269481472671032,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 4890
+    },
+    {
+      "epoch": 0.13495345540582954,
+      "grad_norm": 0.00541806360706687,
+      "learning_rate": 0.001,
+      "loss": 0.4346,
+      "step": 4891
+    },
+    {
+      "epoch": 0.13498104760689392,
+      "grad_norm": 0.007846074178814888,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 4892
+    },
+    {
+      "epoch": 0.13500863980795827,
+      "grad_norm": 0.006200781557708979,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 4893
+    },
+    {
+      "epoch": 0.13503623200902265,
+      "grad_norm": 0.003382598515599966,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 4894
+    },
+    {
+      "epoch": 0.13506382421008703,
+      "grad_norm": 0.002724557416513562,
+      "learning_rate": 0.001,
+      "loss": 0.422,
+      "step": 4895
+    },
+    {
+      "epoch": 0.13509141641115138,
+      "grad_norm": 0.0024383303243666887,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 4896
+    },
+    {
+      "epoch": 0.13511900861221576,
+      "grad_norm": 0.0024829639587551355,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 4897
+    },
+    {
+      "epoch": 0.13514660081328012,
+      "grad_norm": 0.002763790776953101,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 4898
+    },
+    {
+      "epoch": 0.1351741930143445,
+      "grad_norm": 0.0031322850845754147,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 4899
+    },
+    {
+      "epoch": 0.13520178521540888,
+      "grad_norm": 0.002493941690772772,
+      "learning_rate": 0.001,
+      "loss": 0.3807,
+      "step": 4900
+    },
+    {
+      "epoch": 0.13522937741647323,
+      "grad_norm": 0.002849231008440256,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 4901
+    },
+    {
+      "epoch": 0.1352569696175376,
+      "grad_norm": 0.0022846634965389967,
+      "learning_rate": 0.001,
+      "loss": 0.4765,
+      "step": 4902
+    },
+    {
+      "epoch": 0.13528456181860196,
+      "grad_norm": 0.003928142134100199,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 4903
+    },
+    {
+      "epoch": 0.13531215401966634,
+      "grad_norm": 0.0029574292711913586,
+      "learning_rate": 0.001,
+      "loss": 0.4335,
+      "step": 4904
+    },
+    {
+      "epoch": 0.13533974622073072,
+      "grad_norm": 0.0024456526152789593,
+      "learning_rate": 0.001,
+      "loss": 0.4359,
+      "step": 4905
+    },
+    {
+      "epoch": 0.13536733842179507,
+      "grad_norm": 0.005585736595094204,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 4906
+    },
+    {
+      "epoch": 0.13539493062285946,
+      "grad_norm": 0.003591830376535654,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 4907
+    },
+    {
+      "epoch": 0.1354225228239238,
+      "grad_norm": 0.0025483304634690285,
+      "learning_rate": 0.001,
+      "loss": 0.374,
+      "step": 4908
+    },
+    {
+      "epoch": 0.1354501150249882,
+      "grad_norm": 0.0023265292402356863,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 4909
+    },
+    {
+      "epoch": 0.13547770722605257,
+      "grad_norm": 0.0066468482837080956,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 4910
+    },
+    {
+      "epoch": 0.13550529942711692,
+      "grad_norm": 0.003016105853021145,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 4911
+    },
+    {
+      "epoch": 0.1355328916281813,
+      "grad_norm": 0.002601313404738903,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 4912
+    },
+    {
+      "epoch": 0.13556048382924565,
+      "grad_norm": 0.0035740050952881575,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 4913
+    },
+    {
+      "epoch": 0.13558807603031003,
+      "grad_norm": 0.0030930940993130207,
+      "learning_rate": 0.001,
+      "loss": 0.4203,
+      "step": 4914
+    },
+    {
+      "epoch": 0.13561566823137441,
+      "grad_norm": 0.0029879440553486347,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 4915
+    },
+    {
+      "epoch": 0.13564326043243877,
+      "grad_norm": 0.003520643338561058,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 4916
+    },
+    {
+      "epoch": 0.13567085263350315,
+      "grad_norm": 0.0026404429227113724,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 4917
+    },
+    {
+      "epoch": 0.1356984448345675,
+      "grad_norm": 0.0031609549187123775,
+      "learning_rate": 0.001,
+      "loss": 0.3546,
+      "step": 4918
+    },
+    {
+      "epoch": 0.13572603703563188,
+      "grad_norm": 0.002632451243698597,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 4919
+    },
+    {
+      "epoch": 0.13575362923669626,
+      "grad_norm": 0.004959171637892723,
+      "learning_rate": 0.001,
+      "loss": 0.3821,
+      "step": 4920
+    },
+    {
+      "epoch": 0.1357812214377606,
+      "grad_norm": 0.0027791087049990892,
+      "learning_rate": 0.001,
+      "loss": 0.3692,
+      "step": 4921
+    },
+    {
+      "epoch": 0.135808813638825,
+      "grad_norm": 0.002658676588907838,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 4922
+    },
+    {
+      "epoch": 0.13583640583988935,
+      "grad_norm": 0.0022139514330774546,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 4923
+    },
+    {
+      "epoch": 0.13586399804095373,
+      "grad_norm": 0.0026424683164805174,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 4924
+    },
+    {
+      "epoch": 0.1358915902420181,
+      "grad_norm": 0.0026688999496400356,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 4925
+    },
+    {
+      "epoch": 0.13591918244308246,
+      "grad_norm": 0.003557176562026143,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 4926
+    },
+    {
+      "epoch": 0.13594677464414684,
+      "grad_norm": 0.002718021161854267,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 4927
+    },
+    {
+      "epoch": 0.1359743668452112,
+      "grad_norm": 0.0034369598142802715,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 4928
+    },
+    {
+      "epoch": 0.13600195904627557,
+      "grad_norm": 0.003354028332978487,
+      "learning_rate": 0.001,
+      "loss": 0.3488,
+      "step": 4929
+    },
+    {
+      "epoch": 0.13602955124733995,
+      "grad_norm": 0.0030789237935096025,
+      "learning_rate": 0.001,
+      "loss": 0.367,
+      "step": 4930
+    },
+    {
+      "epoch": 0.1360571434484043,
+      "grad_norm": 0.003499686485156417,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 4931
+    },
+    {
+      "epoch": 0.13608473564946869,
+      "grad_norm": 0.003687124466523528,
+      "learning_rate": 0.001,
+      "loss": 0.4303,
+      "step": 4932
+    },
+    {
+      "epoch": 0.13611232785053304,
+      "grad_norm": 0.003492203541100025,
+      "learning_rate": 0.001,
+      "loss": 0.454,
+      "step": 4933
+    },
+    {
+      "epoch": 0.13613992005159742,
+      "grad_norm": 0.002475053770467639,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 4934
+    },
+    {
+      "epoch": 0.1361675122526618,
+      "grad_norm": 0.002234766026958823,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 4935
+    },
+    {
+      "epoch": 0.13619510445372615,
+      "grad_norm": 0.002103250939399004,
+      "learning_rate": 0.001,
+      "loss": 0.4257,
+      "step": 4936
+    },
+    {
+      "epoch": 0.13622269665479053,
+      "grad_norm": 0.0036407755687832832,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 4937
+    },
+    {
+      "epoch": 0.13625028885585488,
+      "grad_norm": 0.003084122436121106,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 4938
+    },
+    {
+      "epoch": 0.13627788105691926,
+      "grad_norm": 0.0025477514136582613,
+      "learning_rate": 0.001,
+      "loss": 0.4523,
+      "step": 4939
+    },
+    {
+      "epoch": 0.13630547325798364,
+      "grad_norm": 0.002407640218734741,
+      "learning_rate": 0.001,
+      "loss": 0.4229,
+      "step": 4940
+    },
+    {
+      "epoch": 0.136333065459048,
+      "grad_norm": 0.004071627277880907,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 4941
+    },
+    {
+      "epoch": 0.13636065766011238,
+      "grad_norm": 0.004198818933218718,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 4942
+    },
+    {
+      "epoch": 0.13638824986117673,
+      "grad_norm": 0.0038220062851905823,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 4943
+    },
+    {
+      "epoch": 0.1364158420622411,
+      "grad_norm": 0.003277060342952609,
+      "learning_rate": 0.001,
+      "loss": 0.427,
+      "step": 4944
+    },
+    {
+      "epoch": 0.13644343426330546,
+      "grad_norm": 0.0034869378432631493,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 4945
+    },
+    {
+      "epoch": 0.13647102646436984,
+      "grad_norm": 0.00437937444075942,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 4946
+    },
+    {
+      "epoch": 0.13649861866543422,
+      "grad_norm": 0.0038593048229813576,
+      "learning_rate": 0.001,
+      "loss": 0.3715,
+      "step": 4947
+    },
+    {
+      "epoch": 0.13652621086649858,
+      "grad_norm": 0.0027042904403060675,
+      "learning_rate": 0.001,
+      "loss": 0.4218,
+      "step": 4948
+    },
+    {
+      "epoch": 0.13655380306756296,
+      "grad_norm": 0.003592225257307291,
+      "learning_rate": 0.001,
+      "loss": 0.3708,
+      "step": 4949
+    },
+    {
+      "epoch": 0.1365813952686273,
+      "grad_norm": 0.005848668981343508,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 4950
+    },
+    {
+      "epoch": 0.1366089874696917,
+      "grad_norm": 0.01795089617371559,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 4951
+    },
+    {
+      "epoch": 0.13663657967075607,
+      "grad_norm": 0.004261403810232878,
+      "learning_rate": 0.001,
+      "loss": 0.4031,
+      "step": 4952
+    },
+    {
+      "epoch": 0.13666417187182042,
+      "grad_norm": 0.0033142368774861097,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 4953
+    },
+    {
+      "epoch": 0.1366917640728848,
+      "grad_norm": 0.003166605019941926,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 4954
+    },
+    {
+      "epoch": 0.13671935627394916,
+      "grad_norm": 0.003505310043692589,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 4955
+    },
+    {
+      "epoch": 0.13674694847501354,
+      "grad_norm": 0.005368894897401333,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 4956
+    },
+    {
+      "epoch": 0.13677454067607792,
+      "grad_norm": 0.00306013785302639,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 4957
+    },
+    {
+      "epoch": 0.13680213287714227,
+      "grad_norm": 0.005562109872698784,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 4958
+    },
+    {
+      "epoch": 0.13682972507820665,
+      "grad_norm": 0.0037877087015658617,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 4959
+    },
+    {
+      "epoch": 0.136857317279271,
+      "grad_norm": 0.0031170272268354893,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 4960
+    },
+    {
+      "epoch": 0.13688490948033538,
+      "grad_norm": 0.003254547482356429,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 4961
+    },
+    {
+      "epoch": 0.13691250168139976,
+      "grad_norm": 0.003371870843693614,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 4962
+    },
+    {
+      "epoch": 0.13694009388246411,
+      "grad_norm": 0.003327438374981284,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 4963
+    },
+    {
+      "epoch": 0.1369676860835285,
+      "grad_norm": 0.003542753402143717,
+      "learning_rate": 0.001,
+      "loss": 0.3767,
+      "step": 4964
+    },
+    {
+      "epoch": 0.13699527828459285,
+      "grad_norm": 0.0030288288835436106,
+      "learning_rate": 0.001,
+      "loss": 0.3652,
+      "step": 4965
+    },
+    {
+      "epoch": 0.13702287048565723,
+      "grad_norm": 0.0034882482141256332,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 4966
+    },
+    {
+      "epoch": 0.1370504626867216,
+      "grad_norm": 0.005606031510978937,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 4967
+    },
+    {
+      "epoch": 0.13707805488778596,
+      "grad_norm": 0.004056036937981844,
+      "learning_rate": 0.001,
+      "loss": 0.4103,
+      "step": 4968
+    },
+    {
+      "epoch": 0.13710564708885034,
+      "grad_norm": 0.0032308900263160467,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 4969
+    },
+    {
+      "epoch": 0.1371332392899147,
+      "grad_norm": 0.006253343541175127,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 4970
+    },
+    {
+      "epoch": 0.13716083149097907,
+      "grad_norm": 0.003451489843428135,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 4971
+    },
+    {
+      "epoch": 0.13718842369204345,
+      "grad_norm": 0.0033425847068428993,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 4972
+    },
+    {
+      "epoch": 0.1372160158931078,
+      "grad_norm": 0.0033426876179873943,
+      "learning_rate": 0.001,
+      "loss": 0.437,
+      "step": 4973
+    },
+    {
+      "epoch": 0.1372436080941722,
+      "grad_norm": 0.0031063987407833338,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 4974
+    },
+    {
+      "epoch": 0.13727120029523654,
+      "grad_norm": 0.003505886532366276,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 4975
+    },
+    {
+      "epoch": 0.13729879249630092,
+      "grad_norm": 0.0025633492041379213,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 4976
+    },
+    {
+      "epoch": 0.1373263846973653,
+      "grad_norm": 0.003809051588177681,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 4977
+    },
+    {
+      "epoch": 0.13735397689842965,
+      "grad_norm": 0.004108694847673178,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 4978
+    },
+    {
+      "epoch": 0.13738156909949403,
+      "grad_norm": 0.0032902159728109837,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 4979
+    },
+    {
+      "epoch": 0.13740916130055839,
+      "grad_norm": 0.004460613243281841,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 4980
+    },
+    {
+      "epoch": 0.13743675350162277,
+      "grad_norm": 0.0026315529830753803,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 4981
+    },
+    {
+      "epoch": 0.13746434570268715,
+      "grad_norm": 0.0023952291812747717,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 4982
+    },
+    {
+      "epoch": 0.1374919379037515,
+      "grad_norm": 0.003244011662900448,
+      "learning_rate": 0.001,
+      "loss": 0.3681,
+      "step": 4983
+    },
+    {
+      "epoch": 0.13751953010481588,
+      "grad_norm": 0.003023752709850669,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 4984
+    },
+    {
+      "epoch": 0.13754712230588023,
+      "grad_norm": 0.002322046086192131,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 4985
+    },
+    {
+      "epoch": 0.1375747145069446,
+      "grad_norm": 0.0032940306700766087,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 4986
+    },
+    {
+      "epoch": 0.137602306708009,
+      "grad_norm": 0.002528809243813157,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 4987
+    },
+    {
+      "epoch": 0.13762989890907334,
+      "grad_norm": 0.003885491518303752,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 4988
+    },
+    {
+      "epoch": 0.13765749111013773,
+      "grad_norm": 0.0026430352590978146,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 4989
+    },
+    {
+      "epoch": 0.13768508331120208,
+      "grad_norm": 0.0037311362102627754,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 4990
+    },
+    {
+      "epoch": 0.13771267551226646,
+      "grad_norm": 0.002789480844512582,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 4991
+    },
+    {
+      "epoch": 0.13774026771333084,
+      "grad_norm": 0.0031957447063177824,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 4992
+    },
+    {
+      "epoch": 0.1377678599143952,
+      "grad_norm": 0.0032790224067866802,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 4993
+    },
+    {
+      "epoch": 0.13779545211545957,
+      "grad_norm": 0.0031076695304363966,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 4994
+    },
+    {
+      "epoch": 0.13782304431652392,
+      "grad_norm": 0.0024428521282970905,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 4995
+    },
+    {
+      "epoch": 0.1378506365175883,
+      "grad_norm": 0.003022199496626854,
+      "learning_rate": 0.001,
+      "loss": 0.4236,
+      "step": 4996
+    },
+    {
+      "epoch": 0.13787822871865268,
+      "grad_norm": 0.0028683668933808804,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 4997
+    },
+    {
+      "epoch": 0.13790582091971704,
+      "grad_norm": 0.003475520294159651,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 4998
+    },
+    {
+      "epoch": 0.13793341312078142,
+      "grad_norm": 0.003010012209415436,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 4999
+    },
+    {
+      "epoch": 0.13796100532184577,
+      "grad_norm": 0.002013305900618434,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 5000
+    },
+    {
+      "epoch": 0.13796100532184577,
+      "eval_runtime": 23.8099,
+      "eval_samples_per_second": 1.344,
+      "eval_steps_per_second": 0.168,
+      "step": 5000
+    },
+    {
+      "epoch": 0.13798859752291015,
+      "grad_norm": 0.00346556818112731,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 5001
+    },
+    {
+      "epoch": 0.13801618972397453,
+      "grad_norm": 0.003510264679789543,
+      "learning_rate": 0.001,
+      "loss": 0.4371,
+      "step": 5002
+    },
+    {
+      "epoch": 0.13804378192503888,
+      "grad_norm": 0.0026663157623261213,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 5003
+    },
+    {
+      "epoch": 0.13807137412610326,
+      "grad_norm": 0.0029142978601157665,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 5004
+    },
+    {
+      "epoch": 0.13809896632716762,
+      "grad_norm": 0.0032160128466784954,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 5005
+    },
+    {
+      "epoch": 0.138126558528232,
+      "grad_norm": 0.0034054117277264595,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 5006
+    },
+    {
+      "epoch": 0.13815415072929638,
+      "grad_norm": 0.0025454023852944374,
+      "learning_rate": 0.001,
+      "loss": 0.4313,
+      "step": 5007
+    },
+    {
+      "epoch": 0.13818174293036073,
+      "grad_norm": 0.0025894741993397474,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 5008
+    },
+    {
+      "epoch": 0.1382093351314251,
+      "grad_norm": 0.0027122844476252794,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 5009
+    },
+    {
+      "epoch": 0.13823692733248946,
+      "grad_norm": 0.0039015160873532295,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 5010
+    },
+    {
+      "epoch": 0.13826451953355384,
+      "grad_norm": 0.003185526467859745,
+      "learning_rate": 0.001,
+      "loss": 0.3548,
+      "step": 5011
+    },
+    {
+      "epoch": 0.13829211173461822,
+      "grad_norm": 0.004475030116736889,
+      "learning_rate": 0.001,
+      "loss": 0.3721,
+      "step": 5012
+    },
+    {
+      "epoch": 0.13831970393568258,
+      "grad_norm": 0.003455979051068425,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 5013
+    },
+    {
+      "epoch": 0.13834729613674696,
+      "grad_norm": 0.002884593093767762,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 5014
+    },
+    {
+      "epoch": 0.1383748883378113,
+      "grad_norm": 0.002263484289869666,
+      "learning_rate": 0.001,
+      "loss": 0.4356,
+      "step": 5015
+    },
+    {
+      "epoch": 0.1384024805388757,
+      "grad_norm": 0.003735289676114917,
+      "learning_rate": 0.001,
+      "loss": 0.3739,
+      "step": 5016
+    },
+    {
+      "epoch": 0.13843007273994007,
+      "grad_norm": 0.003224038053303957,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 5017
+    },
+    {
+      "epoch": 0.13845766494100442,
+      "grad_norm": 0.004887696355581284,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 5018
+    },
+    {
+      "epoch": 0.1384852571420688,
+      "grad_norm": 0.004760736133903265,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 5019
+    },
+    {
+      "epoch": 0.13851284934313315,
+      "grad_norm": 0.0037501987535506487,
+      "learning_rate": 0.001,
+      "loss": 0.4277,
+      "step": 5020
+    },
+    {
+      "epoch": 0.13854044154419753,
+      "grad_norm": 0.0027801606338471174,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 5021
+    },
+    {
+      "epoch": 0.13856803374526191,
+      "grad_norm": 0.004847459960728884,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 5022
+    },
+    {
+      "epoch": 0.13859562594632627,
+      "grad_norm": 0.0057189855724573135,
+      "learning_rate": 0.001,
+      "loss": 0.3729,
+      "step": 5023
+    },
+    {
+      "epoch": 0.13862321814739065,
+      "grad_norm": 0.002882015658542514,
+      "learning_rate": 0.001,
+      "loss": 0.3725,
+      "step": 5024
+    },
+    {
+      "epoch": 0.138650810348455,
+      "grad_norm": 0.002629178110510111,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 5025
+    },
+    {
+      "epoch": 0.13867840254951938,
+      "grad_norm": 0.0042918650433421135,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 5026
+    },
+    {
+      "epoch": 0.13870599475058376,
+      "grad_norm": 0.002527192234992981,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 5027
+    },
+    {
+      "epoch": 0.1387335869516481,
+      "grad_norm": 0.0032653820235282183,
+      "learning_rate": 0.001,
+      "loss": 0.4435,
+      "step": 5028
+    },
+    {
+      "epoch": 0.1387611791527125,
+      "grad_norm": 0.007327108643949032,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 5029
+    },
+    {
+      "epoch": 0.13878877135377685,
+      "grad_norm": 0.0036960134748369455,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 5030
+    },
+    {
+      "epoch": 0.13881636355484123,
+      "grad_norm": 0.0025663794949650764,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 5031
+    },
+    {
+      "epoch": 0.1388439557559056,
+      "grad_norm": 0.0026950971223413944,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 5032
+    },
+    {
+      "epoch": 0.13887154795696996,
+      "grad_norm": 0.002336070640012622,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 5033
+    },
+    {
+      "epoch": 0.13889914015803434,
+      "grad_norm": 0.0024258256889879704,
+      "learning_rate": 0.001,
+      "loss": 0.4286,
+      "step": 5034
+    },
+    {
+      "epoch": 0.1389267323590987,
+      "grad_norm": 0.002580720465630293,
+      "learning_rate": 0.001,
+      "loss": 0.3786,
+      "step": 5035
+    },
+    {
+      "epoch": 0.13895432456016307,
+      "grad_norm": 0.00329298572614789,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 5036
+    },
+    {
+      "epoch": 0.13898191676122743,
+      "grad_norm": 0.0029764564242213964,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 5037
+    },
+    {
+      "epoch": 0.1390095089622918,
+      "grad_norm": 0.002715889597311616,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 5038
+    },
+    {
+      "epoch": 0.13903710116335619,
+      "grad_norm": 0.002393190050497651,
+      "learning_rate": 0.001,
+      "loss": 0.3743,
+      "step": 5039
+    },
+    {
+      "epoch": 0.13906469336442054,
+      "grad_norm": 0.0024876859970390797,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 5040
+    },
+    {
+      "epoch": 0.13909228556548492,
+      "grad_norm": 0.0032655515242367983,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 5041
+    },
+    {
+      "epoch": 0.13911987776654927,
+      "grad_norm": 0.002442015800625086,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 5042
+    },
+    {
+      "epoch": 0.13914746996761365,
+      "grad_norm": 0.002807747106999159,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 5043
+    },
+    {
+      "epoch": 0.13917506216867803,
+      "grad_norm": 0.005945953540503979,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 5044
+    },
+    {
+      "epoch": 0.13920265436974238,
+      "grad_norm": 0.0031765319872647524,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 5045
+    },
+    {
+      "epoch": 0.13923024657080676,
+      "grad_norm": 0.0037979150656610727,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 5046
+    },
+    {
+      "epoch": 0.13925783877187112,
+      "grad_norm": 0.0030904149170964956,
+      "learning_rate": 0.001,
+      "loss": 0.4243,
+      "step": 5047
+    },
+    {
+      "epoch": 0.1392854309729355,
+      "grad_norm": 0.0037039672024548054,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 5048
+    },
+    {
+      "epoch": 0.13931302317399988,
+      "grad_norm": 0.0033529032953083515,
+      "learning_rate": 0.001,
+      "loss": 0.3702,
+      "step": 5049
+    },
+    {
+      "epoch": 0.13934061537506423,
+      "grad_norm": 0.0024797532241791487,
+      "learning_rate": 0.001,
+      "loss": 0.4347,
+      "step": 5050
+    },
+    {
+      "epoch": 0.1393682075761286,
+      "grad_norm": 0.003208388341590762,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 5051
+    },
+    {
+      "epoch": 0.13939579977719296,
+      "grad_norm": 0.0024538834113627672,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 5052
+    },
+    {
+      "epoch": 0.13942339197825734,
+      "grad_norm": 0.0028640846721827984,
+      "learning_rate": 0.001,
+      "loss": 0.3768,
+      "step": 5053
+    },
+    {
+      "epoch": 0.13945098417932172,
+      "grad_norm": 0.002653739182278514,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 5054
+    },
+    {
+      "epoch": 0.13947857638038608,
+      "grad_norm": 0.003615357680246234,
+      "learning_rate": 0.001,
+      "loss": 0.419,
+      "step": 5055
+    },
+    {
+      "epoch": 0.13950616858145046,
+      "grad_norm": 0.002402637619525194,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 5056
+    },
+    {
+      "epoch": 0.1395337607825148,
+      "grad_norm": 0.0025258746463805437,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 5057
+    },
+    {
+      "epoch": 0.1395613529835792,
+      "grad_norm": 0.0028938499744981527,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 5058
+    },
+    {
+      "epoch": 0.13958894518464357,
+      "grad_norm": 0.008427090011537075,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 5059
+    },
+    {
+      "epoch": 0.13961653738570792,
+      "grad_norm": 0.00399380037561059,
+      "learning_rate": 0.001,
+      "loss": 0.3757,
+      "step": 5060
+    },
+    {
+      "epoch": 0.1396441295867723,
+      "grad_norm": 0.002580770291388035,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 5061
+    },
+    {
+      "epoch": 0.13967172178783666,
+      "grad_norm": 0.0026310610119253397,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 5062
+    },
+    {
+      "epoch": 0.13969931398890104,
+      "grad_norm": 0.003709648735821247,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 5063
+    },
+    {
+      "epoch": 0.13972690618996542,
+      "grad_norm": 0.0026174660306423903,
+      "learning_rate": 0.001,
+      "loss": 0.4129,
+      "step": 5064
+    },
+    {
+      "epoch": 0.13975449839102977,
+      "grad_norm": 0.002821024041622877,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 5065
+    },
+    {
+      "epoch": 0.13978209059209415,
+      "grad_norm": 0.0045610954985022545,
+      "learning_rate": 0.001,
+      "loss": 0.3604,
+      "step": 5066
+    },
+    {
+      "epoch": 0.1398096827931585,
+      "grad_norm": 0.003461558138951659,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 5067
+    },
+    {
+      "epoch": 0.13983727499422288,
+      "grad_norm": 0.00443047983571887,
+      "learning_rate": 0.001,
+      "loss": 0.4274,
+      "step": 5068
+    },
+    {
+      "epoch": 0.13986486719528726,
+      "grad_norm": 0.0035488642752170563,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 5069
+    },
+    {
+      "epoch": 0.13989245939635161,
+      "grad_norm": 0.005335607565939426,
+      "learning_rate": 0.001,
+      "loss": 0.4271,
+      "step": 5070
+    },
+    {
+      "epoch": 0.139920051597416,
+      "grad_norm": 0.003680909052491188,
+      "learning_rate": 0.001,
+      "loss": 0.4332,
+      "step": 5071
+    },
+    {
+      "epoch": 0.13994764379848035,
+      "grad_norm": 0.004192323889583349,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 5072
+    },
+    {
+      "epoch": 0.13997523599954473,
+      "grad_norm": 0.0035443026572465897,
+      "learning_rate": 0.001,
+      "loss": 0.4363,
+      "step": 5073
+    },
+    {
+      "epoch": 0.1400028282006091,
+      "grad_norm": 0.0038134013302624226,
+      "learning_rate": 0.001,
+      "loss": 0.3794,
+      "step": 5074
+    },
+    {
+      "epoch": 0.14003042040167346,
+      "grad_norm": 0.0036664907820522785,
+      "learning_rate": 0.001,
+      "loss": 0.428,
+      "step": 5075
+    },
+    {
+      "epoch": 0.14005801260273784,
+      "grad_norm": 0.0024540217127650976,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 5076
+    },
+    {
+      "epoch": 0.1400856048038022,
+      "grad_norm": 0.0027097316924482584,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 5077
+    },
+    {
+      "epoch": 0.14011319700486657,
+      "grad_norm": 0.002922753570601344,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 5078
+    },
+    {
+      "epoch": 0.14014078920593095,
+      "grad_norm": 0.003050590632483363,
+      "learning_rate": 0.001,
+      "loss": 0.4488,
+      "step": 5079
+    },
+    {
+      "epoch": 0.1401683814069953,
+      "grad_norm": 0.0027282421942800283,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 5080
+    },
+    {
+      "epoch": 0.1401959736080597,
+      "grad_norm": 0.0027401361148804426,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 5081
+    },
+    {
+      "epoch": 0.14022356580912404,
+      "grad_norm": 0.006228750105947256,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 5082
+    },
+    {
+      "epoch": 0.14025115801018842,
+      "grad_norm": 0.002542686415836215,
+      "learning_rate": 0.001,
+      "loss": 0.453,
+      "step": 5083
+    },
+    {
+      "epoch": 0.1402787502112528,
+      "grad_norm": 0.002365612657740712,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 5084
+    },
+    {
+      "epoch": 0.14030634241231715,
+      "grad_norm": 0.008088381960988045,
+      "learning_rate": 0.001,
+      "loss": 0.3808,
+      "step": 5085
+    },
+    {
+      "epoch": 0.14033393461338153,
+      "grad_norm": 0.002785349264740944,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 5086
+    },
+    {
+      "epoch": 0.14036152681444589,
+      "grad_norm": 0.00489590922370553,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 5087
+    },
+    {
+      "epoch": 0.14038911901551027,
+      "grad_norm": 0.0027916128747165203,
+      "learning_rate": 0.001,
+      "loss": 0.3714,
+      "step": 5088
+    },
+    {
+      "epoch": 0.14041671121657465,
+      "grad_norm": 0.002344654407352209,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 5089
+    },
+    {
+      "epoch": 0.140444303417639,
+      "grad_norm": 0.0031531101558357477,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 5090
+    },
+    {
+      "epoch": 0.14047189561870338,
+      "grad_norm": 0.0027754863258451223,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 5091
+    },
+    {
+      "epoch": 0.14049948781976773,
+      "grad_norm": 0.003328294726088643,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 5092
+    },
+    {
+      "epoch": 0.1405270800208321,
+      "grad_norm": 0.0021852529607713223,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 5093
+    },
+    {
+      "epoch": 0.1405546722218965,
+      "grad_norm": 0.004019197542220354,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 5094
+    },
+    {
+      "epoch": 0.14058226442296085,
+      "grad_norm": 0.0024554249830543995,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 5095
+    },
+    {
+      "epoch": 0.14060985662402523,
+      "grad_norm": 0.0032448593992739916,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 5096
+    },
+    {
+      "epoch": 0.14063744882508958,
+      "grad_norm": 0.0027907416224479675,
+      "learning_rate": 0.001,
+      "loss": 0.4575,
+      "step": 5097
+    },
+    {
+      "epoch": 0.14066504102615396,
+      "grad_norm": 0.006815130822360516,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 5098
+    },
+    {
+      "epoch": 0.14069263322721834,
+      "grad_norm": 0.00370893650688231,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 5099
+    },
+    {
+      "epoch": 0.1407202254282827,
+      "grad_norm": 0.002180712530389428,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 5100
+    },
+    {
+      "epoch": 0.14074781762934707,
+      "grad_norm": 0.0035834074951708317,
+      "learning_rate": 0.001,
+      "loss": 0.3723,
+      "step": 5101
+    },
+    {
+      "epoch": 0.14077540983041142,
+      "grad_norm": 0.003018912160769105,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 5102
+    },
+    {
+      "epoch": 0.1408030020314758,
+      "grad_norm": 0.002520001260563731,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 5103
+    },
+    {
+      "epoch": 0.14083059423254018,
+      "grad_norm": 0.00585381593555212,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 5104
+    },
+    {
+      "epoch": 0.14085818643360454,
+      "grad_norm": 0.0028747236356139183,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 5105
+    },
+    {
+      "epoch": 0.14088577863466892,
+      "grad_norm": 0.005821559578180313,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 5106
+    },
+    {
+      "epoch": 0.14091337083573327,
+      "grad_norm": 0.0025346369948238134,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 5107
+    },
+    {
+      "epoch": 0.14094096303679765,
+      "grad_norm": 0.002172187902033329,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 5108
+    },
+    {
+      "epoch": 0.14096855523786203,
+      "grad_norm": 0.0038861907087266445,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 5109
+    },
+    {
+      "epoch": 0.14099614743892638,
+      "grad_norm": 0.0024652110878378153,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 5110
+    },
+    {
+      "epoch": 0.14102373963999076,
+      "grad_norm": 0.003532834816724062,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 5111
+    },
+    {
+      "epoch": 0.14105133184105512,
+      "grad_norm": 0.006292164325714111,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 5112
+    },
+    {
+      "epoch": 0.1410789240421195,
+      "grad_norm": 0.00268157827667892,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 5113
+    },
+    {
+      "epoch": 0.14110651624318388,
+      "grad_norm": 0.0035765564534813166,
+      "learning_rate": 0.001,
+      "loss": 0.3599,
+      "step": 5114
+    },
+    {
+      "epoch": 0.14113410844424823,
+      "grad_norm": 0.0026933897752314806,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 5115
+    },
+    {
+      "epoch": 0.1411617006453126,
+      "grad_norm": 0.0043404726311564445,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 5116
+    },
+    {
+      "epoch": 0.14118929284637696,
+      "grad_norm": 0.0029684489127248526,
+      "learning_rate": 0.001,
+      "loss": 0.3667,
+      "step": 5117
+    },
+    {
+      "epoch": 0.14121688504744134,
+      "grad_norm": 0.0028438603039830923,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 5118
+    },
+    {
+      "epoch": 0.14124447724850572,
+      "grad_norm": 0.002665071515366435,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 5119
+    },
+    {
+      "epoch": 0.14127206944957008,
+      "grad_norm": 0.0035951558966189623,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 5120
+    },
+    {
+      "epoch": 0.14129966165063446,
+      "grad_norm": 0.007121792994439602,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 5121
+    },
+    {
+      "epoch": 0.1413272538516988,
+      "grad_norm": 0.0026034133043140173,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 5122
+    },
+    {
+      "epoch": 0.1413548460527632,
+      "grad_norm": 0.004363723564893007,
+      "learning_rate": 0.001,
+      "loss": 0.4695,
+      "step": 5123
+    },
+    {
+      "epoch": 0.14138243825382757,
+      "grad_norm": 0.002366288099437952,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 5124
+    },
+    {
+      "epoch": 0.14141003045489192,
+      "grad_norm": 0.002647766610607505,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 5125
+    },
+    {
+      "epoch": 0.1414376226559563,
+      "grad_norm": 0.002447773702442646,
+      "learning_rate": 0.001,
+      "loss": 0.4414,
+      "step": 5126
+    },
+    {
+      "epoch": 0.14146521485702065,
+      "grad_norm": 0.0026955637149512768,
+      "learning_rate": 0.001,
+      "loss": 0.3644,
+      "step": 5127
+    },
+    {
+      "epoch": 0.14149280705808503,
+      "grad_norm": 0.004317718092352152,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 5128
+    },
+    {
+      "epoch": 0.1415203992591494,
+      "grad_norm": 0.002862673718482256,
+      "learning_rate": 0.001,
+      "loss": 0.4218,
+      "step": 5129
+    },
+    {
+      "epoch": 0.14154799146021377,
+      "grad_norm": 0.004444323480129242,
+      "learning_rate": 0.001,
+      "loss": 0.417,
+      "step": 5130
+    },
+    {
+      "epoch": 0.14157558366127815,
+      "grad_norm": 0.008723779581487179,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 5131
+    },
+    {
+      "epoch": 0.1416031758623425,
+      "grad_norm": 0.0023829471319913864,
+      "learning_rate": 0.001,
+      "loss": 0.427,
+      "step": 5132
+    },
+    {
+      "epoch": 0.14163076806340688,
+      "grad_norm": 0.0023019250947982073,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 5133
+    },
+    {
+      "epoch": 0.14165836026447123,
+      "grad_norm": 0.002866733353585005,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 5134
+    },
+    {
+      "epoch": 0.14168595246553561,
+      "grad_norm": 0.002325895708054304,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 5135
+    },
+    {
+      "epoch": 0.1417135446666,
+      "grad_norm": 0.0031419056467711926,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 5136
+    },
+    {
+      "epoch": 0.14174113686766435,
+      "grad_norm": 0.0049896384589374065,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 5137
+    },
+    {
+      "epoch": 0.14176872906872873,
+      "grad_norm": 0.006104922853410244,
+      "learning_rate": 0.001,
+      "loss": 0.3583,
+      "step": 5138
+    },
+    {
+      "epoch": 0.14179632126979308,
+      "grad_norm": 0.005146206822246313,
+      "learning_rate": 0.001,
+      "loss": 0.3609,
+      "step": 5139
+    },
+    {
+      "epoch": 0.14182391347085746,
+      "grad_norm": 0.0036859933752566576,
+      "learning_rate": 0.001,
+      "loss": 0.3677,
+      "step": 5140
+    },
+    {
+      "epoch": 0.14185150567192184,
+      "grad_norm": 0.0021758072543889284,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 5141
+    },
+    {
+      "epoch": 0.1418790978729862,
+      "grad_norm": 0.002568889642134309,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 5142
+    },
+    {
+      "epoch": 0.14190669007405057,
+      "grad_norm": 0.003079349873587489,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 5143
+    },
+    {
+      "epoch": 0.14193428227511493,
+      "grad_norm": 0.006028784904628992,
+      "learning_rate": 0.001,
+      "loss": 0.3697,
+      "step": 5144
+    },
+    {
+      "epoch": 0.1419618744761793,
+      "grad_norm": 0.0028043135534971952,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 5145
+    },
+    {
+      "epoch": 0.1419894666772437,
+      "grad_norm": 0.0026649232022464275,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 5146
+    },
+    {
+      "epoch": 0.14201705887830804,
+      "grad_norm": 0.003915826324373484,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 5147
+    },
+    {
+      "epoch": 0.14204465107937242,
+      "grad_norm": 0.0024036671966314316,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 5148
+    },
+    {
+      "epoch": 0.14207224328043677,
+      "grad_norm": 0.0042000822722911835,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 5149
+    },
+    {
+      "epoch": 0.14209983548150115,
+      "grad_norm": 0.0055633410811424255,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 5150
+    },
+    {
+      "epoch": 0.14212742768256553,
+      "grad_norm": 0.0037763305008411407,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 5151
+    },
+    {
+      "epoch": 0.14215501988362989,
+      "grad_norm": 0.0034036405850201845,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 5152
+    },
+    {
+      "epoch": 0.14218261208469427,
+      "grad_norm": 0.005322457291185856,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 5153
+    },
+    {
+      "epoch": 0.14221020428575862,
+      "grad_norm": 0.0037400966975837946,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 5154
+    },
+    {
+      "epoch": 0.142237796486823,
+      "grad_norm": 0.005500406958162785,
+      "learning_rate": 0.001,
+      "loss": 0.4355,
+      "step": 5155
+    },
+    {
+      "epoch": 0.14226538868788738,
+      "grad_norm": 0.003691243240609765,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 5156
+    },
+    {
+      "epoch": 0.14229298088895173,
+      "grad_norm": 0.005320982076227665,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 5157
+    },
+    {
+      "epoch": 0.1423205730900161,
+      "grad_norm": 0.0033200359903275967,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 5158
+    },
+    {
+      "epoch": 0.14234816529108046,
+      "grad_norm": 0.0034416653215885162,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 5159
+    },
+    {
+      "epoch": 0.14237575749214484,
+      "grad_norm": 0.0032915968913584948,
+      "learning_rate": 0.001,
+      "loss": 0.4318,
+      "step": 5160
+    },
+    {
+      "epoch": 0.14240334969320922,
+      "grad_norm": 0.00513947568833828,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 5161
+    },
+    {
+      "epoch": 0.14243094189427358,
+      "grad_norm": 0.005058473441749811,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 5162
+    },
+    {
+      "epoch": 0.14245853409533796,
+      "grad_norm": 0.003918751142919064,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 5163
+    },
+    {
+      "epoch": 0.1424861262964023,
+      "grad_norm": 0.0029717017896473408,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 5164
+    },
+    {
+      "epoch": 0.1425137184974667,
+      "grad_norm": 0.004301862791180611,
+      "learning_rate": 0.001,
+      "loss": 0.3641,
+      "step": 5165
+    },
+    {
+      "epoch": 0.14254131069853107,
+      "grad_norm": 0.0028698716778308153,
+      "learning_rate": 0.001,
+      "loss": 0.4313,
+      "step": 5166
+    },
+    {
+      "epoch": 0.14256890289959542,
+      "grad_norm": 0.002715424867346883,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 5167
+    },
+    {
+      "epoch": 0.1425964951006598,
+      "grad_norm": 0.002695337636396289,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 5168
+    },
+    {
+      "epoch": 0.14262408730172416,
+      "grad_norm": 0.002396870171651244,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 5169
+    },
+    {
+      "epoch": 0.14265167950278854,
+      "grad_norm": 0.0023134411312639713,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 5170
+    },
+    {
+      "epoch": 0.14267927170385292,
+      "grad_norm": 0.007215626537799835,
+      "learning_rate": 0.001,
+      "loss": 0.4436,
+      "step": 5171
+    },
+    {
+      "epoch": 0.14270686390491727,
+      "grad_norm": 0.0030377882067114115,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 5172
+    },
+    {
+      "epoch": 0.14273445610598165,
+      "grad_norm": 0.0052809203043580055,
+      "learning_rate": 0.001,
+      "loss": 0.3806,
+      "step": 5173
+    },
+    {
+      "epoch": 0.142762048307046,
+      "grad_norm": 0.0030719470232725143,
+      "learning_rate": 0.001,
+      "loss": 0.3556,
+      "step": 5174
+    },
+    {
+      "epoch": 0.14278964050811038,
+      "grad_norm": 0.003323314245790243,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 5175
+    },
+    {
+      "epoch": 0.14281723270917476,
+      "grad_norm": 0.004299065563827753,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 5176
+    },
+    {
+      "epoch": 0.14284482491023912,
+      "grad_norm": 0.006395953707396984,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 5177
+    },
+    {
+      "epoch": 0.1428724171113035,
+      "grad_norm": 0.0036343904212117195,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 5178
+    },
+    {
+      "epoch": 0.14290000931236785,
+      "grad_norm": 0.003354752203449607,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 5179
+    },
+    {
+      "epoch": 0.14292760151343223,
+      "grad_norm": 0.0029036353807896376,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 5180
+    },
+    {
+      "epoch": 0.1429551937144966,
+      "grad_norm": 0.0033736219629645348,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 5181
+    },
+    {
+      "epoch": 0.14298278591556096,
+      "grad_norm": 0.00228399527259171,
+      "learning_rate": 0.001,
+      "loss": 0.4349,
+      "step": 5182
+    },
+    {
+      "epoch": 0.14301037811662534,
+      "grad_norm": 0.0025460305623710155,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 5183
+    },
+    {
+      "epoch": 0.1430379703176897,
+      "grad_norm": 0.0031551956199109554,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 5184
+    },
+    {
+      "epoch": 0.14306556251875407,
+      "grad_norm": 0.003111305646598339,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 5185
+    },
+    {
+      "epoch": 0.14309315471981846,
+      "grad_norm": 0.0028325961902737617,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 5186
+    },
+    {
+      "epoch": 0.1431207469208828,
+      "grad_norm": 0.0034688191954046488,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 5187
+    },
+    {
+      "epoch": 0.1431483391219472,
+      "grad_norm": 0.0022599126677960157,
+      "learning_rate": 0.001,
+      "loss": 0.4351,
+      "step": 5188
+    },
+    {
+      "epoch": 0.14317593132301154,
+      "grad_norm": 0.004831294994801283,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 5189
+    },
+    {
+      "epoch": 0.14320352352407592,
+      "grad_norm": 0.0029807155951857567,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 5190
+    },
+    {
+      "epoch": 0.1432311157251403,
+      "grad_norm": 0.003226724686101079,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 5191
+    },
+    {
+      "epoch": 0.14325870792620465,
+      "grad_norm": 0.0025674651842564344,
+      "learning_rate": 0.001,
+      "loss": 0.4403,
+      "step": 5192
+    },
+    {
+      "epoch": 0.14328630012726903,
+      "grad_norm": 0.002636546269059181,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 5193
+    },
+    {
+      "epoch": 0.1433138923283334,
+      "grad_norm": 0.003959611523896456,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 5194
+    },
+    {
+      "epoch": 0.14334148452939777,
+      "grad_norm": 0.00245193880982697,
+      "learning_rate": 0.001,
+      "loss": 0.4404,
+      "step": 5195
+    },
+    {
+      "epoch": 0.14336907673046215,
+      "grad_norm": 0.0033164939377456903,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 5196
+    },
+    {
+      "epoch": 0.1433966689315265,
+      "grad_norm": 0.0031362988520413637,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 5197
+    },
+    {
+      "epoch": 0.14342426113259088,
+      "grad_norm": 0.0031361919827759266,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 5198
+    },
+    {
+      "epoch": 0.14345185333365523,
+      "grad_norm": 0.0025203994009643793,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 5199
+    },
+    {
+      "epoch": 0.1434794455347196,
+      "grad_norm": 0.0021688216365873814,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 5200
+    },
+    {
+      "epoch": 0.143507037735784,
+      "grad_norm": 0.0022967627737671137,
+      "learning_rate": 0.001,
+      "loss": 0.443,
+      "step": 5201
+    },
+    {
+      "epoch": 0.14353462993684835,
+      "grad_norm": 0.002367371693253517,
+      "learning_rate": 0.001,
+      "loss": 0.4249,
+      "step": 5202
+    },
+    {
+      "epoch": 0.14356222213791273,
+      "grad_norm": 0.002286283066496253,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 5203
+    },
+    {
+      "epoch": 0.14358981433897708,
+      "grad_norm": 0.002556881867349148,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 5204
+    },
+    {
+      "epoch": 0.14361740654004146,
+      "grad_norm": 0.002457389608025551,
+      "learning_rate": 0.001,
+      "loss": 0.4515,
+      "step": 5205
+    },
+    {
+      "epoch": 0.14364499874110584,
+      "grad_norm": 0.0026917587965726852,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 5206
+    },
+    {
+      "epoch": 0.1436725909421702,
+      "grad_norm": 0.0024027330800890923,
+      "learning_rate": 0.001,
+      "loss": 0.4429,
+      "step": 5207
+    },
+    {
+      "epoch": 0.14370018314323457,
+      "grad_norm": 0.0036219994071871042,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 5208
+    },
+    {
+      "epoch": 0.14372777534429892,
+      "grad_norm": 0.0024791264440864325,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 5209
+    },
+    {
+      "epoch": 0.1437553675453633,
+      "grad_norm": 0.002413226757198572,
+      "learning_rate": 0.001,
+      "loss": 0.4413,
+      "step": 5210
+    },
+    {
+      "epoch": 0.14378295974642769,
+      "grad_norm": 0.0027174693532288074,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 5211
+    },
+    {
+      "epoch": 0.14381055194749204,
+      "grad_norm": 0.002803023438900709,
+      "learning_rate": 0.001,
+      "loss": 0.3682,
+      "step": 5212
+    },
+    {
+      "epoch": 0.14383814414855642,
+      "grad_norm": 0.004972567781805992,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 5213
+    },
+    {
+      "epoch": 0.14386573634962077,
+      "grad_norm": 0.003360881470143795,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 5214
+    },
+    {
+      "epoch": 0.14389332855068515,
+      "grad_norm": 0.0027532910462468863,
+      "learning_rate": 0.001,
+      "loss": 0.3699,
+      "step": 5215
+    },
+    {
+      "epoch": 0.14392092075174953,
+      "grad_norm": 0.003589104162529111,
+      "learning_rate": 0.001,
+      "loss": 0.3631,
+      "step": 5216
+    },
+    {
+      "epoch": 0.14394851295281388,
+      "grad_norm": 0.004771512001752853,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 5217
+    },
+    {
+      "epoch": 0.14397610515387826,
+      "grad_norm": 0.0026405095122754574,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 5218
+    },
+    {
+      "epoch": 0.14400369735494262,
+      "grad_norm": 0.004283549264073372,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 5219
+    },
+    {
+      "epoch": 0.144031289556007,
+      "grad_norm": 0.0045742373913526535,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 5220
+    },
+    {
+      "epoch": 0.14405888175707138,
+      "grad_norm": 0.0042295148596167564,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 5221
+    },
+    {
+      "epoch": 0.14408647395813573,
+      "grad_norm": 0.004257339984178543,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 5222
+    },
+    {
+      "epoch": 0.1441140661592001,
+      "grad_norm": 0.002708716783672571,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 5223
+    },
+    {
+      "epoch": 0.14414165836026446,
+      "grad_norm": 0.004137181676924229,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 5224
+    },
+    {
+      "epoch": 0.14416925056132884,
+      "grad_norm": 0.005688484758138657,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 5225
+    },
+    {
+      "epoch": 0.1441968427623932,
+      "grad_norm": 0.008182493969798088,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 5226
+    },
+    {
+      "epoch": 0.14422443496345758,
+      "grad_norm": 0.003255674382671714,
+      "learning_rate": 0.001,
+      "loss": 0.4268,
+      "step": 5227
+    },
+    {
+      "epoch": 0.14425202716452196,
+      "grad_norm": 0.0024103031028062105,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 5228
+    },
+    {
+      "epoch": 0.1442796193655863,
+      "grad_norm": 0.0030367637518793344,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 5229
+    },
+    {
+      "epoch": 0.1443072115666507,
+      "grad_norm": 0.004183416720479727,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 5230
+    },
+    {
+      "epoch": 0.14433480376771504,
+      "grad_norm": 0.002727340441197157,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 5231
+    },
+    {
+      "epoch": 0.14436239596877942,
+      "grad_norm": 0.003591054119169712,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 5232
+    },
+    {
+      "epoch": 0.1443899881698438,
+      "grad_norm": 0.0028464680071920156,
+      "learning_rate": 0.001,
+      "loss": 0.359,
+      "step": 5233
+    },
+    {
+      "epoch": 0.14441758037090816,
+      "grad_norm": 0.006335907150059938,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 5234
+    },
+    {
+      "epoch": 0.14444517257197254,
+      "grad_norm": 0.0027192372363060713,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 5235
+    },
+    {
+      "epoch": 0.1444727647730369,
+      "grad_norm": 0.003246571170166135,
+      "learning_rate": 0.001,
+      "loss": 0.385,
+      "step": 5236
+    },
+    {
+      "epoch": 0.14450035697410127,
+      "grad_norm": 0.0024179634638130665,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 5237
+    },
+    {
+      "epoch": 0.14452794917516565,
+      "grad_norm": 0.006938959006220102,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 5238
+    },
+    {
+      "epoch": 0.14455554137623,
+      "grad_norm": 0.004923888016492128,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 5239
+    },
+    {
+      "epoch": 0.14458313357729438,
+      "grad_norm": 0.0031706364825367928,
+      "learning_rate": 0.001,
+      "loss": 0.3494,
+      "step": 5240
+    },
+    {
+      "epoch": 0.14461072577835873,
+      "grad_norm": 0.002737032249569893,
+      "learning_rate": 0.001,
+      "loss": 0.4248,
+      "step": 5241
+    },
+    {
+      "epoch": 0.14463831797942311,
+      "grad_norm": 0.0025771090295165777,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 5242
+    },
+    {
+      "epoch": 0.1446659101804875,
+      "grad_norm": 0.0030646645464003086,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 5243
+    },
+    {
+      "epoch": 0.14469350238155185,
+      "grad_norm": 0.0036759376525878906,
+      "learning_rate": 0.001,
+      "loss": 0.3846,
+      "step": 5244
+    },
+    {
+      "epoch": 0.14472109458261623,
+      "grad_norm": 0.0030812059994786978,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 5245
+    },
+    {
+      "epoch": 0.14474868678368058,
+      "grad_norm": 0.0022093786392360926,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 5246
+    },
+    {
+      "epoch": 0.14477627898474496,
+      "grad_norm": 0.009639604948461056,
+      "learning_rate": 0.001,
+      "loss": 0.4283,
+      "step": 5247
+    },
+    {
+      "epoch": 0.14480387118580934,
+      "grad_norm": 0.0032258466817438602,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 5248
+    },
+    {
+      "epoch": 0.1448314633868737,
+      "grad_norm": 0.0022797686979174614,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 5249
+    },
+    {
+      "epoch": 0.14485905558793807,
+      "grad_norm": 0.004146902356296778,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 5250
+    },
+    {
+      "epoch": 0.14488664778900243,
+      "grad_norm": 0.006115823984146118,
+      "learning_rate": 0.001,
+      "loss": 0.4382,
+      "step": 5251
+    },
+    {
+      "epoch": 0.1449142399900668,
+      "grad_norm": 0.007470074575394392,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 5252
+    },
+    {
+      "epoch": 0.1449418321911312,
+      "grad_norm": 0.005569383502006531,
+      "learning_rate": 0.001,
+      "loss": 0.4357,
+      "step": 5253
+    },
+    {
+      "epoch": 0.14496942439219554,
+      "grad_norm": 0.004871412180364132,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 5254
+    },
+    {
+      "epoch": 0.14499701659325992,
+      "grad_norm": 0.0038668601773679256,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 5255
+    },
+    {
+      "epoch": 0.14502460879432427,
+      "grad_norm": 0.00738053023815155,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 5256
+    },
+    {
+      "epoch": 0.14505220099538865,
+      "grad_norm": 0.004165946505963802,
+      "learning_rate": 0.001,
+      "loss": 0.3671,
+      "step": 5257
+    },
+    {
+      "epoch": 0.14507979319645303,
+      "grad_norm": 0.004606361500918865,
+      "learning_rate": 0.001,
+      "loss": 0.3856,
+      "step": 5258
+    },
+    {
+      "epoch": 0.14510738539751739,
+      "grad_norm": 0.002536054700613022,
+      "learning_rate": 0.001,
+      "loss": 0.4243,
+      "step": 5259
+    },
+    {
+      "epoch": 0.14513497759858177,
+      "grad_norm": 0.005060167983174324,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 5260
+    },
+    {
+      "epoch": 0.14516256979964612,
+      "grad_norm": 0.004944161977618933,
+      "learning_rate": 0.001,
+      "loss": 0.3806,
+      "step": 5261
+    },
+    {
+      "epoch": 0.1451901620007105,
+      "grad_norm": 0.0029536515939980745,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 5262
+    },
+    {
+      "epoch": 0.14521775420177488,
+      "grad_norm": 0.0047346302308142185,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 5263
+    },
+    {
+      "epoch": 0.14524534640283923,
+      "grad_norm": 0.0024177066516131163,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 5264
+    },
+    {
+      "epoch": 0.1452729386039036,
+      "grad_norm": 0.0036898923572152853,
+      "learning_rate": 0.001,
+      "loss": 0.4304,
+      "step": 5265
+    },
+    {
+      "epoch": 0.14530053080496796,
+      "grad_norm": 0.0027802560944110155,
+      "learning_rate": 0.001,
+      "loss": 0.3662,
+      "step": 5266
+    },
+    {
+      "epoch": 0.14532812300603234,
+      "grad_norm": 0.0035621714778244495,
+      "learning_rate": 0.001,
+      "loss": 0.4256,
+      "step": 5267
+    },
+    {
+      "epoch": 0.14535571520709673,
+      "grad_norm": 0.004401834215968847,
+      "learning_rate": 0.001,
+      "loss": 0.427,
+      "step": 5268
+    },
+    {
+      "epoch": 0.14538330740816108,
+      "grad_norm": 0.0035394374281167984,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 5269
+    },
+    {
+      "epoch": 0.14541089960922546,
+      "grad_norm": 0.017217174172401428,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 5270
+    },
+    {
+      "epoch": 0.1454384918102898,
+      "grad_norm": 0.0022773677483201027,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 5271
+    },
+    {
+      "epoch": 0.1454660840113542,
+      "grad_norm": 0.002403079532086849,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 5272
+    },
+    {
+      "epoch": 0.14549367621241857,
+      "grad_norm": 0.0033804429695010185,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 5273
+    },
+    {
+      "epoch": 0.14552126841348292,
+      "grad_norm": 0.002910598646849394,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 5274
+    },
+    {
+      "epoch": 0.1455488606145473,
+      "grad_norm": 0.008523901924490929,
+      "learning_rate": 0.001,
+      "loss": 0.4028,
+      "step": 5275
+    },
+    {
+      "epoch": 0.14557645281561166,
+      "grad_norm": 0.003371842671185732,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 5276
+    },
+    {
+      "epoch": 0.14560404501667604,
+      "grad_norm": 0.009514998644590378,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 5277
+    },
+    {
+      "epoch": 0.14563163721774042,
+      "grad_norm": 0.002840045839548111,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 5278
+    },
+    {
+      "epoch": 0.14565922941880477,
+      "grad_norm": 0.0025841807946562767,
+      "learning_rate": 0.001,
+      "loss": 0.4236,
+      "step": 5279
+    },
+    {
+      "epoch": 0.14568682161986915,
+      "grad_norm": 0.003922698087990284,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 5280
+    },
+    {
+      "epoch": 0.1457144138209335,
+      "grad_norm": 0.004982591141015291,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 5281
+    },
+    {
+      "epoch": 0.14574200602199788,
+      "grad_norm": 0.003860799828544259,
+      "learning_rate": 0.001,
+      "loss": 0.3455,
+      "step": 5282
+    },
+    {
+      "epoch": 0.14576959822306226,
+      "grad_norm": 0.003407267387956381,
+      "learning_rate": 0.001,
+      "loss": 0.3778,
+      "step": 5283
+    },
+    {
+      "epoch": 0.14579719042412662,
+      "grad_norm": 0.003815464908257127,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 5284
+    },
+    {
+      "epoch": 0.145824782625191,
+      "grad_norm": 0.002572751371189952,
+      "learning_rate": 0.001,
+      "loss": 0.4426,
+      "step": 5285
+    },
+    {
+      "epoch": 0.14585237482625535,
+      "grad_norm": 0.0026371986605226994,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 5286
+    },
+    {
+      "epoch": 0.14587996702731973,
+      "grad_norm": 0.0028192142490297556,
+      "learning_rate": 0.001,
+      "loss": 0.3687,
+      "step": 5287
+    },
+    {
+      "epoch": 0.1459075592283841,
+      "grad_norm": 0.0033562618773430586,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 5288
+    },
+    {
+      "epoch": 0.14593515142944846,
+      "grad_norm": 0.0029199314303696156,
+      "learning_rate": 0.001,
+      "loss": 0.4294,
+      "step": 5289
+    },
+    {
+      "epoch": 0.14596274363051284,
+      "grad_norm": 0.0029893077444285154,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 5290
+    },
+    {
+      "epoch": 0.1459903358315772,
+      "grad_norm": 0.002518982160836458,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 5291
+    },
+    {
+      "epoch": 0.14601792803264158,
+      "grad_norm": 0.0025784457102417946,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 5292
+    },
+    {
+      "epoch": 0.14604552023370596,
+      "grad_norm": 0.002763275755569339,
+      "learning_rate": 0.001,
+      "loss": 0.3677,
+      "step": 5293
+    },
+    {
+      "epoch": 0.1460731124347703,
+      "grad_norm": 0.0020732074044644833,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 5294
+    },
+    {
+      "epoch": 0.1461007046358347,
+      "grad_norm": 0.002579174702987075,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 5295
+    },
+    {
+      "epoch": 0.14612829683689904,
+      "grad_norm": 0.0026421458460390568,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 5296
+    },
+    {
+      "epoch": 0.14615588903796342,
+      "grad_norm": 0.0025588644202798605,
+      "learning_rate": 0.001,
+      "loss": 0.3795,
+      "step": 5297
+    },
+    {
+      "epoch": 0.1461834812390278,
+      "grad_norm": 0.002680868376046419,
+      "learning_rate": 0.001,
+      "loss": 0.3794,
+      "step": 5298
+    },
+    {
+      "epoch": 0.14621107344009215,
+      "grad_norm": 0.002638003323227167,
+      "learning_rate": 0.001,
+      "loss": 0.372,
+      "step": 5299
+    },
+    {
+      "epoch": 0.14623866564115653,
+      "grad_norm": 0.002944410778582096,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 5300
+    },
+    {
+      "epoch": 0.1462662578422209,
+      "grad_norm": 0.0032715783454477787,
+      "learning_rate": 0.001,
+      "loss": 0.4187,
+      "step": 5301
+    },
+    {
+      "epoch": 0.14629385004328527,
+      "grad_norm": 0.0038660853169858456,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 5302
+    },
+    {
+      "epoch": 0.14632144224434965,
+      "grad_norm": 0.0023569557815790176,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 5303
+    },
+    {
+      "epoch": 0.146349034445414,
+      "grad_norm": 0.01049228198826313,
+      "learning_rate": 0.001,
+      "loss": 0.3707,
+      "step": 5304
+    },
+    {
+      "epoch": 0.14637662664647838,
+      "grad_norm": 0.0023562125861644745,
+      "learning_rate": 0.001,
+      "loss": 0.4398,
+      "step": 5305
+    },
+    {
+      "epoch": 0.14640421884754273,
+      "grad_norm": 0.002021912718191743,
+      "learning_rate": 0.001,
+      "loss": 0.4416,
+      "step": 5306
+    },
+    {
+      "epoch": 0.1464318110486071,
+      "grad_norm": 0.002516190754249692,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 5307
+    },
+    {
+      "epoch": 0.1464594032496715,
+      "grad_norm": 0.0023932051844894886,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 5308
+    },
+    {
+      "epoch": 0.14648699545073585,
+      "grad_norm": 0.0037087369710206985,
+      "learning_rate": 0.001,
+      "loss": 0.3692,
+      "step": 5309
+    },
+    {
+      "epoch": 0.14651458765180023,
+      "grad_norm": 0.002698789816349745,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 5310
+    },
+    {
+      "epoch": 0.14654217985286458,
+      "grad_norm": 0.0025848429650068283,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 5311
+    },
+    {
+      "epoch": 0.14656977205392896,
+      "grad_norm": 0.00317647447809577,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 5312
+    },
+    {
+      "epoch": 0.14659736425499334,
+      "grad_norm": 0.0027970026712864637,
+      "learning_rate": 0.001,
+      "loss": 0.3687,
+      "step": 5313
+    },
+    {
+      "epoch": 0.1466249564560577,
+      "grad_norm": 0.0031060813926160336,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 5314
+    },
+    {
+      "epoch": 0.14665254865712207,
+      "grad_norm": 0.004864480346441269,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 5315
+    },
+    {
+      "epoch": 0.14668014085818643,
+      "grad_norm": 0.0022926428355276585,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 5316
+    },
+    {
+      "epoch": 0.1467077330592508,
+      "grad_norm": 0.0029684528708457947,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 5317
+    },
+    {
+      "epoch": 0.14673532526031516,
+      "grad_norm": 0.0022980044595897198,
+      "learning_rate": 0.001,
+      "loss": 0.4402,
+      "step": 5318
+    },
+    {
+      "epoch": 0.14676291746137954,
+      "grad_norm": 0.0021655671298503876,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 5319
+    },
+    {
+      "epoch": 0.14679050966244392,
+      "grad_norm": 0.002639767248183489,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 5320
+    },
+    {
+      "epoch": 0.14681810186350827,
+      "grad_norm": 0.005498727783560753,
+      "learning_rate": 0.001,
+      "loss": 0.3696,
+      "step": 5321
+    },
+    {
+      "epoch": 0.14684569406457265,
+      "grad_norm": 0.0033152250107377768,
+      "learning_rate": 0.001,
+      "loss": 0.4278,
+      "step": 5322
+    },
+    {
+      "epoch": 0.146873286265637,
+      "grad_norm": 0.004971036221832037,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 5323
+    },
+    {
+      "epoch": 0.14690087846670138,
+      "grad_norm": 0.0024375154171139,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 5324
+    },
+    {
+      "epoch": 0.14692847066776576,
+      "grad_norm": 0.002927098423242569,
+      "learning_rate": 0.001,
+      "loss": 0.3463,
+      "step": 5325
+    },
+    {
+      "epoch": 0.14695606286883012,
+      "grad_norm": 0.0036146354395896196,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 5326
+    },
+    {
+      "epoch": 0.1469836550698945,
+      "grad_norm": 0.0024644196964800358,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 5327
+    },
+    {
+      "epoch": 0.14701124727095885,
+      "grad_norm": 0.0025911214761435986,
+      "learning_rate": 0.001,
+      "loss": 0.4279,
+      "step": 5328
+    },
+    {
+      "epoch": 0.14703883947202323,
+      "grad_norm": 0.0025596728082746267,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 5329
+    },
+    {
+      "epoch": 0.1470664316730876,
+      "grad_norm": 0.002164337085559964,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 5330
+    },
+    {
+      "epoch": 0.14709402387415196,
+      "grad_norm": 0.002624908462166786,
+      "learning_rate": 0.001,
+      "loss": 0.4399,
+      "step": 5331
+    },
+    {
+      "epoch": 0.14712161607521634,
+      "grad_norm": 0.003358762711286545,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 5332
+    },
+    {
+      "epoch": 0.1471492082762807,
+      "grad_norm": 0.008317602798342705,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 5333
+    },
+    {
+      "epoch": 0.14717680047734508,
+      "grad_norm": 0.002502370160073042,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 5334
+    },
+    {
+      "epoch": 0.14720439267840946,
+      "grad_norm": 0.002517363289371133,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 5335
+    },
+    {
+      "epoch": 0.1472319848794738,
+      "grad_norm": 0.003344867378473282,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 5336
+    },
+    {
+      "epoch": 0.1472595770805382,
+      "grad_norm": 0.003851872170343995,
+      "learning_rate": 0.001,
+      "loss": 0.3729,
+      "step": 5337
+    },
+    {
+      "epoch": 0.14728716928160254,
+      "grad_norm": 0.002437508897855878,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 5338
+    },
+    {
+      "epoch": 0.14731476148266692,
+      "grad_norm": 0.0034037036821246147,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 5339
+    },
+    {
+      "epoch": 0.1473423536837313,
+      "grad_norm": 0.0027613432612270117,
+      "learning_rate": 0.001,
+      "loss": 0.4394,
+      "step": 5340
+    },
+    {
+      "epoch": 0.14736994588479566,
+      "grad_norm": 0.002495642751455307,
+      "learning_rate": 0.001,
+      "loss": 0.4221,
+      "step": 5341
+    },
+    {
+      "epoch": 0.14739753808586004,
+      "grad_norm": 0.0029324067290872335,
+      "learning_rate": 0.001,
+      "loss": 0.3461,
+      "step": 5342
+    },
+    {
+      "epoch": 0.1474251302869244,
+      "grad_norm": 0.003891981905326247,
+      "learning_rate": 0.001,
+      "loss": 0.3634,
+      "step": 5343
+    },
+    {
+      "epoch": 0.14745272248798877,
+      "grad_norm": 0.005491616670042276,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 5344
+    },
+    {
+      "epoch": 0.14748031468905315,
+      "grad_norm": 0.0028310040943324566,
+      "learning_rate": 0.001,
+      "loss": 0.3742,
+      "step": 5345
+    },
+    {
+      "epoch": 0.1475079068901175,
+      "grad_norm": 0.0035124190617352724,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 5346
+    },
+    {
+      "epoch": 0.14753549909118188,
+      "grad_norm": 0.003958097193390131,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 5347
+    },
+    {
+      "epoch": 0.14756309129224623,
+      "grad_norm": 0.0031724879518151283,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 5348
+    },
+    {
+      "epoch": 0.14759068349331061,
+      "grad_norm": 0.006823610980063677,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 5349
+    },
+    {
+      "epoch": 0.147618275694375,
+      "grad_norm": 0.0023501457180827856,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 5350
+    },
+    {
+      "epoch": 0.14764586789543935,
+      "grad_norm": 0.0020646711345762014,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 5351
+    },
+    {
+      "epoch": 0.14767346009650373,
+      "grad_norm": 0.002760658971965313,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 5352
+    },
+    {
+      "epoch": 0.14770105229756808,
+      "grad_norm": 0.0029554900247603655,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 5353
+    },
+    {
+      "epoch": 0.14772864449863246,
+      "grad_norm": 0.0038962808903306723,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 5354
+    },
+    {
+      "epoch": 0.14775623669969684,
+      "grad_norm": 0.002745499601587653,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 5355
+    },
+    {
+      "epoch": 0.1477838289007612,
+      "grad_norm": 0.004177890717983246,
+      "learning_rate": 0.001,
+      "loss": 0.3632,
+      "step": 5356
+    },
+    {
+      "epoch": 0.14781142110182557,
+      "grad_norm": 0.002319454913958907,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 5357
+    },
+    {
+      "epoch": 0.14783901330288993,
+      "grad_norm": 0.0026902640238404274,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 5358
+    },
+    {
+      "epoch": 0.1478666055039543,
+      "grad_norm": 0.003064702032133937,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 5359
+    },
+    {
+      "epoch": 0.1478941977050187,
+      "grad_norm": 0.003410772420465946,
+      "learning_rate": 0.001,
+      "loss": 0.3601,
+      "step": 5360
+    },
+    {
+      "epoch": 0.14792178990608304,
+      "grad_norm": 0.0030657979659736156,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 5361
+    },
+    {
+      "epoch": 0.14794938210714742,
+      "grad_norm": 0.008095750585198402,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 5362
+    },
+    {
+      "epoch": 0.14797697430821177,
+      "grad_norm": 0.004853304475545883,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 5363
+    },
+    {
+      "epoch": 0.14800456650927615,
+      "grad_norm": 0.0032035887707024813,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 5364
+    },
+    {
+      "epoch": 0.14803215871034053,
+      "grad_norm": 0.004527990240603685,
+      "learning_rate": 0.001,
+      "loss": 0.3729,
+      "step": 5365
+    },
+    {
+      "epoch": 0.14805975091140489,
+      "grad_norm": 0.006700141355395317,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 5366
+    },
+    {
+      "epoch": 0.14808734311246927,
+      "grad_norm": 0.00633378978818655,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 5367
+    },
+    {
+      "epoch": 0.14811493531353362,
+      "grad_norm": 0.0023305609356611967,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 5368
+    },
+    {
+      "epoch": 0.148142527514598,
+      "grad_norm": 0.0034147268161177635,
+      "learning_rate": 0.001,
+      "loss": 0.3556,
+      "step": 5369
+    },
+    {
+      "epoch": 0.14817011971566238,
+      "grad_norm": 0.003086140612140298,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 5370
+    },
+    {
+      "epoch": 0.14819771191672673,
+      "grad_norm": 0.0061439163982868195,
+      "learning_rate": 0.001,
+      "loss": 0.3608,
+      "step": 5371
+    },
+    {
+      "epoch": 0.1482253041177911,
+      "grad_norm": 0.0025781714357435703,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 5372
+    },
+    {
+      "epoch": 0.14825289631885546,
+      "grad_norm": 0.0026104720309376717,
+      "learning_rate": 0.001,
+      "loss": 0.427,
+      "step": 5373
+    },
+    {
+      "epoch": 0.14828048851991985,
+      "grad_norm": 0.0037834334652870893,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 5374
+    },
+    {
+      "epoch": 0.14830808072098423,
+      "grad_norm": 0.005206381902098656,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 5375
+    },
+    {
+      "epoch": 0.14833567292204858,
+      "grad_norm": 0.002475143875926733,
+      "learning_rate": 0.001,
+      "loss": 0.4218,
+      "step": 5376
+    },
+    {
+      "epoch": 0.14836326512311296,
+      "grad_norm": 0.005634800065308809,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 5377
+    },
+    {
+      "epoch": 0.1483908573241773,
+      "grad_norm": 0.004982566460967064,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 5378
+    },
+    {
+      "epoch": 0.1484184495252417,
+      "grad_norm": 0.0032306865323334932,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 5379
+    },
+    {
+      "epoch": 0.14844604172630607,
+      "grad_norm": 0.0074884905479848385,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 5380
+    },
+    {
+      "epoch": 0.14847363392737042,
+      "grad_norm": 0.003231652779504657,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 5381
+    },
+    {
+      "epoch": 0.1485012261284348,
+      "grad_norm": 0.004374918062239885,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 5382
+    },
+    {
+      "epoch": 0.14852881832949916,
+      "grad_norm": 0.003996079787611961,
+      "learning_rate": 0.001,
+      "loss": 0.3606,
+      "step": 5383
+    },
+    {
+      "epoch": 0.14855641053056354,
+      "grad_norm": 0.0023612806107848883,
+      "learning_rate": 0.001,
+      "loss": 0.4327,
+      "step": 5384
+    },
+    {
+      "epoch": 0.14858400273162792,
+      "grad_norm": 0.002323776250705123,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 5385
+    },
+    {
+      "epoch": 0.14861159493269227,
+      "grad_norm": 0.003044640878215432,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 5386
+    },
+    {
+      "epoch": 0.14863918713375665,
+      "grad_norm": 0.0029183158185333014,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 5387
+    },
+    {
+      "epoch": 0.148666779334821,
+      "grad_norm": 0.0031076192390173674,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 5388
+    },
+    {
+      "epoch": 0.14869437153588538,
+      "grad_norm": 0.0032536948565393686,
+      "learning_rate": 0.001,
+      "loss": 0.4417,
+      "step": 5389
+    },
+    {
+      "epoch": 0.14872196373694976,
+      "grad_norm": 0.0028777210973203182,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 5390
+    },
+    {
+      "epoch": 0.14874955593801412,
+      "grad_norm": 0.003044008044525981,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 5391
+    },
+    {
+      "epoch": 0.1487771481390785,
+      "grad_norm": 0.004723208025097847,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 5392
+    },
+    {
+      "epoch": 0.14880474034014285,
+      "grad_norm": 0.003199602710083127,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 5393
+    },
+    {
+      "epoch": 0.14883233254120723,
+      "grad_norm": 0.0029654945246875286,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 5394
+    },
+    {
+      "epoch": 0.1488599247422716,
+      "grad_norm": 0.003065300639718771,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 5395
+    },
+    {
+      "epoch": 0.14888751694333596,
+      "grad_norm": 0.003566544270142913,
+      "learning_rate": 0.001,
+      "loss": 0.3763,
+      "step": 5396
+    },
+    {
+      "epoch": 0.14891510914440034,
+      "grad_norm": 0.0034122609067708254,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 5397
+    },
+    {
+      "epoch": 0.1489427013454647,
+      "grad_norm": 0.0025029098615050316,
+      "learning_rate": 0.001,
+      "loss": 0.3565,
+      "step": 5398
+    },
+    {
+      "epoch": 0.14897029354652908,
+      "grad_norm": 0.0030023674480617046,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 5399
+    },
+    {
+      "epoch": 0.14899788574759346,
+      "grad_norm": 0.00424955366179347,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 5400
+    },
+    {
+      "epoch": 0.1490254779486578,
+      "grad_norm": 0.0030338119249790907,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 5401
+    },
+    {
+      "epoch": 0.1490530701497222,
+      "grad_norm": 0.0028448712546378374,
+      "learning_rate": 0.001,
+      "loss": 0.432,
+      "step": 5402
+    },
+    {
+      "epoch": 0.14908066235078654,
+      "grad_norm": 0.0028374232351779938,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 5403
+    },
+    {
+      "epoch": 0.14910825455185092,
+      "grad_norm": 0.002151310909539461,
+      "learning_rate": 0.001,
+      "loss": 0.4508,
+      "step": 5404
+    },
+    {
+      "epoch": 0.1491358467529153,
+      "grad_norm": 0.002506339456886053,
+      "learning_rate": 0.001,
+      "loss": 0.4381,
+      "step": 5405
+    },
+    {
+      "epoch": 0.14916343895397965,
+      "grad_norm": 0.002707758452743292,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 5406
+    },
+    {
+      "epoch": 0.14919103115504403,
+      "grad_norm": 0.002374933334067464,
+      "learning_rate": 0.001,
+      "loss": 0.3643,
+      "step": 5407
+    },
+    {
+      "epoch": 0.1492186233561084,
+      "grad_norm": 0.0027373386546969414,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 5408
+    },
+    {
+      "epoch": 0.14924621555717277,
+      "grad_norm": 0.004208944737911224,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 5409
+    },
+    {
+      "epoch": 0.14927380775823715,
+      "grad_norm": 0.0031958618201315403,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 5410
+    },
+    {
+      "epoch": 0.1493013999593015,
+      "grad_norm": 0.006793424021452665,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 5411
+    },
+    {
+      "epoch": 0.14932899216036588,
+      "grad_norm": 0.003309778869152069,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 5412
+    },
+    {
+      "epoch": 0.14935658436143023,
+      "grad_norm": 0.002356642158702016,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 5413
+    },
+    {
+      "epoch": 0.1493841765624946,
+      "grad_norm": 0.0032743606716394424,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 5414
+    },
+    {
+      "epoch": 0.14941176876355897,
+      "grad_norm": 0.005772759672254324,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 5415
+    },
+    {
+      "epoch": 0.14943936096462335,
+      "grad_norm": 0.0065681166015565395,
+      "learning_rate": 0.001,
+      "loss": 0.4149,
+      "step": 5416
+    },
+    {
+      "epoch": 0.14946695316568773,
+      "grad_norm": 0.0028679503593593836,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 5417
+    },
+    {
+      "epoch": 0.14949454536675208,
+      "grad_norm": 0.003005877137184143,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 5418
+    },
+    {
+      "epoch": 0.14952213756781646,
+      "grad_norm": 0.0022220280952751637,
+      "learning_rate": 0.001,
+      "loss": 0.4393,
+      "step": 5419
+    },
+    {
+      "epoch": 0.1495497297688808,
+      "grad_norm": 0.0033852518536150455,
+      "learning_rate": 0.001,
+      "loss": 0.3692,
+      "step": 5420
+    },
+    {
+      "epoch": 0.1495773219699452,
+      "grad_norm": 0.006787192076444626,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 5421
+    },
+    {
+      "epoch": 0.14960491417100957,
+      "grad_norm": 0.0036864059511572123,
+      "learning_rate": 0.001,
+      "loss": 0.4334,
+      "step": 5422
+    },
+    {
+      "epoch": 0.14963250637207393,
+      "grad_norm": 0.002667534863576293,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 5423
+    },
+    {
+      "epoch": 0.1496600985731383,
+      "grad_norm": 0.003278045216575265,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 5424
+    },
+    {
+      "epoch": 0.14968769077420266,
+      "grad_norm": 0.003076856257393956,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 5425
+    },
+    {
+      "epoch": 0.14971528297526704,
+      "grad_norm": 0.0023882794193923473,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 5426
+    },
+    {
+      "epoch": 0.14974287517633142,
+      "grad_norm": 0.003012517001479864,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 5427
+    },
+    {
+      "epoch": 0.14977046737739577,
+      "grad_norm": 0.0023737861774861813,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 5428
+    },
+    {
+      "epoch": 0.14979805957846015,
+      "grad_norm": 0.0022319392301142216,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 5429
+    },
+    {
+      "epoch": 0.1498256517795245,
+      "grad_norm": 0.0032811984419822693,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 5430
+    },
+    {
+      "epoch": 0.14985324398058888,
+      "grad_norm": 0.005578971467912197,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 5431
+    },
+    {
+      "epoch": 0.14988083618165327,
+      "grad_norm": 0.003999901469796896,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 5432
+    },
+    {
+      "epoch": 0.14990842838271762,
+      "grad_norm": 0.002937354613095522,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 5433
+    },
+    {
+      "epoch": 0.149936020583782,
+      "grad_norm": 0.004472947679460049,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 5434
+    },
+    {
+      "epoch": 0.14996361278484635,
+      "grad_norm": 0.0043602860532701015,
+      "learning_rate": 0.001,
+      "loss": 0.3565,
+      "step": 5435
+    },
+    {
+      "epoch": 0.14999120498591073,
+      "grad_norm": 0.0026499300729483366,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 5436
+    },
+    {
+      "epoch": 0.1500187971869751,
+      "grad_norm": 0.003208121517673135,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 5437
+    },
+    {
+      "epoch": 0.15004638938803946,
+      "grad_norm": 0.0024985617492347956,
+      "learning_rate": 0.001,
+      "loss": 0.4716,
+      "step": 5438
+    },
+    {
+      "epoch": 0.15007398158910384,
+      "grad_norm": 0.0026381593197584152,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 5439
+    },
+    {
+      "epoch": 0.1501015737901682,
+      "grad_norm": 0.0035929230507463217,
+      "learning_rate": 0.001,
+      "loss": 0.3716,
+      "step": 5440
+    },
+    {
+      "epoch": 0.15012916599123258,
+      "grad_norm": 0.005476321559399366,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 5441
+    },
+    {
+      "epoch": 0.15015675819229696,
+      "grad_norm": 0.00393927376717329,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 5442
+    },
+    {
+      "epoch": 0.1501843503933613,
+      "grad_norm": 0.003273927140980959,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 5443
+    },
+    {
+      "epoch": 0.1502119425944257,
+      "grad_norm": 0.015024540945887566,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 5444
+    },
+    {
+      "epoch": 0.15023953479549004,
+      "grad_norm": 0.004785169847309589,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 5445
+    },
+    {
+      "epoch": 0.15026712699655442,
+      "grad_norm": 0.00336294062435627,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 5446
+    },
+    {
+      "epoch": 0.1502947191976188,
+      "grad_norm": 0.0036235249135643244,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 5447
+    },
+    {
+      "epoch": 0.15032231139868316,
+      "grad_norm": 0.002741090953350067,
+      "learning_rate": 0.001,
+      "loss": 0.3762,
+      "step": 5448
+    },
+    {
+      "epoch": 0.15034990359974754,
+      "grad_norm": 0.0024599130265414715,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 5449
+    },
+    {
+      "epoch": 0.1503774958008119,
+      "grad_norm": 0.002163279801607132,
+      "learning_rate": 0.001,
+      "loss": 0.4581,
+      "step": 5450
+    },
+    {
+      "epoch": 0.15040508800187627,
+      "grad_norm": 0.005329553037881851,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 5451
+    },
+    {
+      "epoch": 0.15043268020294065,
+      "grad_norm": 0.004300289321690798,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 5452
+    },
+    {
+      "epoch": 0.150460272404005,
+      "grad_norm": 0.003734968136996031,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 5453
+    },
+    {
+      "epoch": 0.15048786460506938,
+      "grad_norm": 0.0053179883398115635,
+      "learning_rate": 0.001,
+      "loss": 0.3729,
+      "step": 5454
+    },
+    {
+      "epoch": 0.15051545680613374,
+      "grad_norm": 0.003352331230416894,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 5455
+    },
+    {
+      "epoch": 0.15054304900719812,
+      "grad_norm": 0.0027811650652438402,
+      "learning_rate": 0.001,
+      "loss": 0.4354,
+      "step": 5456
+    },
+    {
+      "epoch": 0.1505706412082625,
+      "grad_norm": 0.0069539607502520084,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 5457
+    },
+    {
+      "epoch": 0.15059823340932685,
+      "grad_norm": 0.002543016104027629,
+      "learning_rate": 0.001,
+      "loss": 0.475,
+      "step": 5458
+    },
+    {
+      "epoch": 0.15062582561039123,
+      "grad_norm": 0.004235788248479366,
+      "learning_rate": 0.001,
+      "loss": 0.3636,
+      "step": 5459
+    },
+    {
+      "epoch": 0.15065341781145558,
+      "grad_norm": 0.0021371094044297934,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 5460
+    },
+    {
+      "epoch": 0.15068101001251996,
+      "grad_norm": 0.0031575944740325212,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 5461
+    },
+    {
+      "epoch": 0.15070860221358434,
+      "grad_norm": 0.002763645024970174,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 5462
+    },
+    {
+      "epoch": 0.1507361944146487,
+      "grad_norm": 0.003907541744410992,
+      "learning_rate": 0.001,
+      "loss": 0.3748,
+      "step": 5463
+    },
+    {
+      "epoch": 0.15076378661571307,
+      "grad_norm": 0.004303582012653351,
+      "learning_rate": 0.001,
+      "loss": 0.3733,
+      "step": 5464
+    },
+    {
+      "epoch": 0.15079137881677743,
+      "grad_norm": 0.005774863064289093,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 5465
+    },
+    {
+      "epoch": 0.1508189710178418,
+      "grad_norm": 0.00487649068236351,
+      "learning_rate": 0.001,
+      "loss": 0.3776,
+      "step": 5466
+    },
+    {
+      "epoch": 0.1508465632189062,
+      "grad_norm": 0.0033277124166488647,
+      "learning_rate": 0.001,
+      "loss": 0.418,
+      "step": 5467
+    },
+    {
+      "epoch": 0.15087415541997054,
+      "grad_norm": 0.003046071156859398,
+      "learning_rate": 0.001,
+      "loss": 0.3428,
+      "step": 5468
+    },
+    {
+      "epoch": 0.15090174762103492,
+      "grad_norm": 0.0024130302481353283,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 5469
+    },
+    {
+      "epoch": 0.15092933982209927,
+      "grad_norm": 0.002598168794065714,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 5470
+    },
+    {
+      "epoch": 0.15095693202316365,
+      "grad_norm": 0.00266568036749959,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 5471
+    },
+    {
+      "epoch": 0.15098452422422803,
+      "grad_norm": 0.0028029074892401695,
+      "learning_rate": 0.001,
+      "loss": 0.3714,
+      "step": 5472
+    },
+    {
+      "epoch": 0.1510121164252924,
+      "grad_norm": 0.008100908249616623,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 5473
+    },
+    {
+      "epoch": 0.15103970862635677,
+      "grad_norm": 0.00268415710888803,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 5474
+    },
+    {
+      "epoch": 0.15106730082742112,
+      "grad_norm": 0.00282164104282856,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 5475
+    },
+    {
+      "epoch": 0.1510948930284855,
+      "grad_norm": 0.0029998181853443384,
+      "learning_rate": 0.001,
+      "loss": 0.422,
+      "step": 5476
+    },
+    {
+      "epoch": 0.15112248522954988,
+      "grad_norm": 0.005322444252669811,
+      "learning_rate": 0.001,
+      "loss": 0.3662,
+      "step": 5477
+    },
+    {
+      "epoch": 0.15115007743061423,
+      "grad_norm": 0.0020565236918628216,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 5478
+    },
+    {
+      "epoch": 0.1511776696316786,
+      "grad_norm": 0.0021356120705604553,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 5479
+    },
+    {
+      "epoch": 0.15120526183274297,
+      "grad_norm": 0.003150090342387557,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 5480
+    },
+    {
+      "epoch": 0.15123285403380735,
+      "grad_norm": 0.002523067407310009,
+      "learning_rate": 0.001,
+      "loss": 0.4469,
+      "step": 5481
+    },
+    {
+      "epoch": 0.15126044623487173,
+      "grad_norm": 0.003910950850695372,
+      "learning_rate": 0.001,
+      "loss": 0.4317,
+      "step": 5482
+    },
+    {
+      "epoch": 0.15128803843593608,
+      "grad_norm": 0.0035898578353226185,
+      "learning_rate": 0.001,
+      "loss": 0.429,
+      "step": 5483
+    },
+    {
+      "epoch": 0.15131563063700046,
+      "grad_norm": 0.0025848867371678352,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 5484
+    },
+    {
+      "epoch": 0.1513432228380648,
+      "grad_norm": 0.0024130861274898052,
+      "learning_rate": 0.001,
+      "loss": 0.4291,
+      "step": 5485
+    },
+    {
+      "epoch": 0.1513708150391292,
+      "grad_norm": 0.005302855744957924,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 5486
+    },
+    {
+      "epoch": 0.15139840724019357,
+      "grad_norm": 0.0032018995843827724,
+      "learning_rate": 0.001,
+      "loss": 0.3642,
+      "step": 5487
+    },
+    {
+      "epoch": 0.15142599944125792,
+      "grad_norm": 0.0025910425465554,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 5488
+    },
+    {
+      "epoch": 0.1514535916423223,
+      "grad_norm": 0.003524304600432515,
+      "learning_rate": 0.001,
+      "loss": 0.4421,
+      "step": 5489
+    },
+    {
+      "epoch": 0.15148118384338666,
+      "grad_norm": 0.0029097420629113913,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 5490
+    },
+    {
+      "epoch": 0.15150877604445104,
+      "grad_norm": 0.003354964079335332,
+      "learning_rate": 0.001,
+      "loss": 0.4103,
+      "step": 5491
+    },
+    {
+      "epoch": 0.15153636824551542,
+      "grad_norm": 0.0032378071919083595,
+      "learning_rate": 0.001,
+      "loss": 0.4041,
+      "step": 5492
+    },
+    {
+      "epoch": 0.15156396044657977,
+      "grad_norm": 0.0037639355286955833,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 5493
+    },
+    {
+      "epoch": 0.15159155264764415,
+      "grad_norm": 0.0043389336206018925,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 5494
+    },
+    {
+      "epoch": 0.1516191448487085,
+      "grad_norm": 0.0026339939795434475,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 5495
+    },
+    {
+      "epoch": 0.15164673704977288,
+      "grad_norm": 0.0039835344068706036,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 5496
+    },
+    {
+      "epoch": 0.15167432925083726,
+      "grad_norm": 0.003993440419435501,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 5497
+    },
+    {
+      "epoch": 0.15170192145190162,
+      "grad_norm": 0.003669349942356348,
+      "learning_rate": 0.001,
+      "loss": 0.3656,
+      "step": 5498
+    },
+    {
+      "epoch": 0.151729513652966,
+      "grad_norm": 0.005064620170742273,
+      "learning_rate": 0.001,
+      "loss": 0.3677,
+      "step": 5499
+    },
+    {
+      "epoch": 0.15175710585403035,
+      "grad_norm": 0.0034543632064014673,
+      "learning_rate": 0.001,
+      "loss": 0.4298,
+      "step": 5500
+    },
+    {
+      "epoch": 0.15175710585403035,
+      "eval_runtime": 27.0484,
+      "eval_samples_per_second": 1.183,
+      "eval_steps_per_second": 0.148,
+      "step": 5500
+    },
+    {
+      "epoch": 0.15178469805509473,
+      "grad_norm": 0.0021760701201856136,
+      "learning_rate": 0.001,
+      "loss": 0.4394,
+      "step": 5501
+    },
+    {
+      "epoch": 0.1518122902561591,
+      "grad_norm": 0.0033101621083915234,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 5502
+    },
+    {
+      "epoch": 0.15183988245722346,
+      "grad_norm": 0.005687963217496872,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 5503
+    },
+    {
+      "epoch": 0.15186747465828784,
+      "grad_norm": 0.0031170856673270464,
+      "learning_rate": 0.001,
+      "loss": 0.3453,
+      "step": 5504
+    },
+    {
+      "epoch": 0.1518950668593522,
+      "grad_norm": 0.0025914981961250305,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 5505
+    },
+    {
+      "epoch": 0.15192265906041658,
+      "grad_norm": 0.0037096338346600533,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 5506
+    },
+    {
+      "epoch": 0.15195025126148093,
+      "grad_norm": 0.0023521913681179285,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 5507
+    },
+    {
+      "epoch": 0.1519778434625453,
+      "grad_norm": 0.0021949107758700848,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 5508
+    },
+    {
+      "epoch": 0.1520054356636097,
+      "grad_norm": 0.00198304932564497,
+      "learning_rate": 0.001,
+      "loss": 0.4209,
+      "step": 5509
+    },
+    {
+      "epoch": 0.15203302786467404,
+      "grad_norm": 0.0021362160332500935,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 5510
+    },
+    {
+      "epoch": 0.15206062006573842,
+      "grad_norm": 0.0028702253475785255,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 5511
+    },
+    {
+      "epoch": 0.15208821226680277,
+      "grad_norm": 0.003037867834791541,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 5512
+    },
+    {
+      "epoch": 0.15211580446786716,
+      "grad_norm": 0.004557423759251833,
+      "learning_rate": 0.001,
+      "loss": 0.3621,
+      "step": 5513
+    },
+    {
+      "epoch": 0.15214339666893154,
+      "grad_norm": 0.0027468795888125896,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 5514
+    },
+    {
+      "epoch": 0.1521709888699959,
+      "grad_norm": 0.004307581577450037,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 5515
+    },
+    {
+      "epoch": 0.15219858107106027,
+      "grad_norm": 0.0030242314096540213,
+      "learning_rate": 0.001,
+      "loss": 0.3733,
+      "step": 5516
+    },
+    {
+      "epoch": 0.15222617327212462,
+      "grad_norm": 0.00433747936040163,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 5517
+    },
+    {
+      "epoch": 0.152253765473189,
+      "grad_norm": 0.009036600589752197,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 5518
+    },
+    {
+      "epoch": 0.15228135767425338,
+      "grad_norm": 0.012543873861432076,
+      "learning_rate": 0.001,
+      "loss": 0.4278,
+      "step": 5519
+    },
+    {
+      "epoch": 0.15230894987531773,
+      "grad_norm": 0.009851133450865746,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 5520
+    },
+    {
+      "epoch": 0.15233654207638211,
+      "grad_norm": 0.013184795156121254,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 5521
+    },
+    {
+      "epoch": 0.15236413427744647,
+      "grad_norm": 0.017032135277986526,
+      "learning_rate": 0.001,
+      "loss": 0.3805,
+      "step": 5522
+    },
+    {
+      "epoch": 0.15239172647851085,
+      "grad_norm": 0.002919899532571435,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 5523
+    },
+    {
+      "epoch": 0.15241931867957523,
+      "grad_norm": 0.002256933366879821,
+      "learning_rate": 0.001,
+      "loss": 0.4279,
+      "step": 5524
+    },
+    {
+      "epoch": 0.15244691088063958,
+      "grad_norm": 0.0027415261138230562,
+      "learning_rate": 0.001,
+      "loss": 0.4324,
+      "step": 5525
+    },
+    {
+      "epoch": 0.15247450308170396,
+      "grad_norm": 0.0031243073754012585,
+      "learning_rate": 0.001,
+      "loss": 0.3676,
+      "step": 5526
+    },
+    {
+      "epoch": 0.1525020952827683,
+      "grad_norm": 0.0021765967831015587,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 5527
+    },
+    {
+      "epoch": 0.1525296874838327,
+      "grad_norm": 0.003611369989812374,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 5528
+    },
+    {
+      "epoch": 0.15255727968489707,
+      "grad_norm": 0.005910592619329691,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 5529
+    },
+    {
+      "epoch": 0.15258487188596143,
+      "grad_norm": 0.01214669831097126,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 5530
+    },
+    {
+      "epoch": 0.1526124640870258,
+      "grad_norm": 0.0060065449215471745,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 5531
+    },
+    {
+      "epoch": 0.15264005628809016,
+      "grad_norm": 0.0025461267214268446,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 5532
+    },
+    {
+      "epoch": 0.15266764848915454,
+      "grad_norm": 0.002425495535135269,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 5533
+    },
+    {
+      "epoch": 0.15269524069021892,
+      "grad_norm": 0.004517144989222288,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 5534
+    },
+    {
+      "epoch": 0.15272283289128327,
+      "grad_norm": 0.003472298150882125,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 5535
+    },
+    {
+      "epoch": 0.15275042509234765,
+      "grad_norm": 0.002562028355896473,
+      "learning_rate": 0.001,
+      "loss": 0.3576,
+      "step": 5536
+    },
+    {
+      "epoch": 0.152778017293412,
+      "grad_norm": 0.0028723133727908134,
+      "learning_rate": 0.001,
+      "loss": 0.4317,
+      "step": 5537
+    },
+    {
+      "epoch": 0.15280560949447639,
+      "grad_norm": 0.0022548751439899206,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 5538
+    },
+    {
+      "epoch": 0.15283320169554077,
+      "grad_norm": 0.0031076574232429266,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 5539
+    },
+    {
+      "epoch": 0.15286079389660512,
+      "grad_norm": 0.00288601266220212,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 5540
+    },
+    {
+      "epoch": 0.1528883860976695,
+      "grad_norm": 0.004477125592529774,
+      "learning_rate": 0.001,
+      "loss": 0.3715,
+      "step": 5541
+    },
+    {
+      "epoch": 0.15291597829873385,
+      "grad_norm": 0.0031475056894123554,
+      "learning_rate": 0.001,
+      "loss": 0.4294,
+      "step": 5542
+    },
+    {
+      "epoch": 0.15294357049979823,
+      "grad_norm": 0.0023465659469366074,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 5543
+    },
+    {
+      "epoch": 0.1529711627008626,
+      "grad_norm": 0.0032081464305520058,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 5544
+    },
+    {
+      "epoch": 0.15299875490192696,
+      "grad_norm": 0.002472371095791459,
+      "learning_rate": 0.001,
+      "loss": 0.4276,
+      "step": 5545
+    },
+    {
+      "epoch": 0.15302634710299134,
+      "grad_norm": 0.006937100552022457,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 5546
+    },
+    {
+      "epoch": 0.1530539393040557,
+      "grad_norm": 0.0034694471396505833,
+      "learning_rate": 0.001,
+      "loss": 0.4476,
+      "step": 5547
+    },
+    {
+      "epoch": 0.15308153150512008,
+      "grad_norm": 0.0028970185667276382,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 5548
+    },
+    {
+      "epoch": 0.15310912370618446,
+      "grad_norm": 0.0024902718141674995,
+      "learning_rate": 0.001,
+      "loss": 0.449,
+      "step": 5549
+    },
+    {
+      "epoch": 0.1531367159072488,
+      "grad_norm": 0.0023354976437985897,
+      "learning_rate": 0.001,
+      "loss": 0.3648,
+      "step": 5550
+    },
+    {
+      "epoch": 0.1531643081083132,
+      "grad_norm": 0.004294807091355324,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 5551
+    },
+    {
+      "epoch": 0.15319190030937754,
+      "grad_norm": 0.0020946157164871693,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 5552
+    },
+    {
+      "epoch": 0.15321949251044192,
+      "grad_norm": 0.003225631546229124,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 5553
+    },
+    {
+      "epoch": 0.1532470847115063,
+      "grad_norm": 0.0035047954879701138,
+      "learning_rate": 0.001,
+      "loss": 0.3462,
+      "step": 5554
+    },
+    {
+      "epoch": 0.15327467691257066,
+      "grad_norm": 0.0029047110583633184,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 5555
+    },
+    {
+      "epoch": 0.15330226911363504,
+      "grad_norm": 0.0030687206890434027,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 5556
+    },
+    {
+      "epoch": 0.1533298613146994,
+      "grad_norm": 0.0027540400624275208,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 5557
+    },
+    {
+      "epoch": 0.15335745351576377,
+      "grad_norm": 0.002840878674760461,
+      "learning_rate": 0.001,
+      "loss": 0.3541,
+      "step": 5558
+    },
+    {
+      "epoch": 0.15338504571682815,
+      "grad_norm": 0.0027139533776789904,
+      "learning_rate": 0.001,
+      "loss": 0.4351,
+      "step": 5559
+    },
+    {
+      "epoch": 0.1534126379178925,
+      "grad_norm": 0.0027314014732837677,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 5560
+    },
+    {
+      "epoch": 0.15344023011895688,
+      "grad_norm": 0.002478542272001505,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 5561
+    },
+    {
+      "epoch": 0.15346782232002124,
+      "grad_norm": 0.007170672062784433,
+      "learning_rate": 0.001,
+      "loss": 0.3703,
+      "step": 5562
+    },
+    {
+      "epoch": 0.15349541452108562,
+      "grad_norm": 0.004415068309754133,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 5563
+    },
+    {
+      "epoch": 0.15352300672215,
+      "grad_norm": 0.004127574618905783,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 5564
+    },
+    {
+      "epoch": 0.15355059892321435,
+      "grad_norm": 0.0033065411262214184,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 5565
+    },
+    {
+      "epoch": 0.15357819112427873,
+      "grad_norm": 0.0036155430134385824,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 5566
+    },
+    {
+      "epoch": 0.15360578332534308,
+      "grad_norm": 0.0034816188272088766,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 5567
+    },
+    {
+      "epoch": 0.15363337552640746,
+      "grad_norm": 0.003091090824455023,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 5568
+    },
+    {
+      "epoch": 0.15366096772747184,
+      "grad_norm": 0.006985298823565245,
+      "learning_rate": 0.001,
+      "loss": 0.4364,
+      "step": 5569
+    },
+    {
+      "epoch": 0.1536885599285362,
+      "grad_norm": 0.0029920649249106646,
+      "learning_rate": 0.001,
+      "loss": 0.4319,
+      "step": 5570
+    },
+    {
+      "epoch": 0.15371615212960058,
+      "grad_norm": 0.004347649868577719,
+      "learning_rate": 0.001,
+      "loss": 0.361,
+      "step": 5571
+    },
+    {
+      "epoch": 0.15374374433066493,
+      "grad_norm": 0.002314548706635833,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 5572
+    },
+    {
+      "epoch": 0.1537713365317293,
+      "grad_norm": 0.0023193899542093277,
+      "learning_rate": 0.001,
+      "loss": 0.4267,
+      "step": 5573
+    },
+    {
+      "epoch": 0.1537989287327937,
+      "grad_norm": 0.004895602352917194,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 5574
+    },
+    {
+      "epoch": 0.15382652093385804,
+      "grad_norm": 0.003913875203579664,
+      "learning_rate": 0.001,
+      "loss": 0.3747,
+      "step": 5575
+    },
+    {
+      "epoch": 0.15385411313492242,
+      "grad_norm": 0.002599104307591915,
+      "learning_rate": 0.001,
+      "loss": 0.4489,
+      "step": 5576
+    },
+    {
+      "epoch": 0.15388170533598677,
+      "grad_norm": 0.0025326511822640896,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 5577
+    },
+    {
+      "epoch": 0.15390929753705115,
+      "grad_norm": 0.003771717194467783,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 5578
+    },
+    {
+      "epoch": 0.15393688973811553,
+      "grad_norm": 0.004678527358919382,
+      "learning_rate": 0.001,
+      "loss": 0.3761,
+      "step": 5579
+    },
+    {
+      "epoch": 0.1539644819391799,
+      "grad_norm": 0.0024059766437858343,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 5580
+    },
+    {
+      "epoch": 0.15399207414024427,
+      "grad_norm": 0.003771177725866437,
+      "learning_rate": 0.001,
+      "loss": 0.3776,
+      "step": 5581
+    },
+    {
+      "epoch": 0.15401966634130862,
+      "grad_norm": 0.002773088635876775,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 5582
+    },
+    {
+      "epoch": 0.154047258542373,
+      "grad_norm": 0.003286458784714341,
+      "learning_rate": 0.001,
+      "loss": 0.4284,
+      "step": 5583
+    },
+    {
+      "epoch": 0.15407485074343738,
+      "grad_norm": 0.007167492527514696,
+      "learning_rate": 0.001,
+      "loss": 0.3719,
+      "step": 5584
+    },
+    {
+      "epoch": 0.15410244294450173,
+      "grad_norm": 0.0022903766948729753,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 5585
+    },
+    {
+      "epoch": 0.1541300351455661,
+      "grad_norm": 0.0034131724387407303,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 5586
+    },
+    {
+      "epoch": 0.15415762734663047,
+      "grad_norm": 0.005761331412941217,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 5587
+    },
+    {
+      "epoch": 0.15418521954769485,
+      "grad_norm": 0.0029236627742648125,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 5588
+    },
+    {
+      "epoch": 0.15421281174875923,
+      "grad_norm": 0.0033100962173193693,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 5589
+    },
+    {
+      "epoch": 0.15424040394982358,
+      "grad_norm": 0.003999118227511644,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 5590
+    },
+    {
+      "epoch": 0.15426799615088796,
+      "grad_norm": 0.006767808459699154,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 5591
+    },
+    {
+      "epoch": 0.1542955883519523,
+      "grad_norm": 0.0028603686951100826,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 5592
+    },
+    {
+      "epoch": 0.1543231805530167,
+      "grad_norm": 0.002447220031172037,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 5593
+    },
+    {
+      "epoch": 0.15435077275408107,
+      "grad_norm": 0.00598937040194869,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 5594
+    },
+    {
+      "epoch": 0.15437836495514543,
+      "grad_norm": 0.0037801298312842846,
+      "learning_rate": 0.001,
+      "loss": 0.4432,
+      "step": 5595
+    },
+    {
+      "epoch": 0.1544059571562098,
+      "grad_norm": 0.00243676477111876,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 5596
+    },
+    {
+      "epoch": 0.15443354935727416,
+      "grad_norm": 0.0022302521392703056,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 5597
+    },
+    {
+      "epoch": 0.15446114155833854,
+      "grad_norm": 0.002573461504653096,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 5598
+    },
+    {
+      "epoch": 0.15448873375940292,
+      "grad_norm": 0.004255924839526415,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 5599
+    },
+    {
+      "epoch": 0.15451632596046727,
+      "grad_norm": 0.0026880300138145685,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 5600
+    },
+    {
+      "epoch": 0.15454391816153165,
+      "grad_norm": 0.00242446456104517,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 5601
+    },
+    {
+      "epoch": 0.154571510362596,
+      "grad_norm": 0.00247559929266572,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 5602
+    },
+    {
+      "epoch": 0.15459910256366038,
+      "grad_norm": 0.003476825775578618,
+      "learning_rate": 0.001,
+      "loss": 0.3411,
+      "step": 5603
+    },
+    {
+      "epoch": 0.15462669476472474,
+      "grad_norm": 0.003844754071906209,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 5604
+    },
+    {
+      "epoch": 0.15465428696578912,
+      "grad_norm": 0.0029344146605581045,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 5605
+    },
+    {
+      "epoch": 0.1546818791668535,
+      "grad_norm": 0.0040620798245072365,
+      "learning_rate": 0.001,
+      "loss": 0.3821,
+      "step": 5606
+    },
+    {
+      "epoch": 0.15470947136791785,
+      "grad_norm": 0.0020004198886454105,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 5607
+    },
+    {
+      "epoch": 0.15473706356898223,
+      "grad_norm": 0.002827103016898036,
+      "learning_rate": 0.001,
+      "loss": 0.376,
+      "step": 5608
+    },
+    {
+      "epoch": 0.15476465577004658,
+      "grad_norm": 0.0023741433396935463,
+      "learning_rate": 0.001,
+      "loss": 0.4229,
+      "step": 5609
+    },
+    {
+      "epoch": 0.15479224797111096,
+      "grad_norm": 0.004414036870002747,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 5610
+    },
+    {
+      "epoch": 0.15481984017217534,
+      "grad_norm": 0.0030676499009132385,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 5611
+    },
+    {
+      "epoch": 0.1548474323732397,
+      "grad_norm": 0.0032093478366732597,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 5612
+    },
+    {
+      "epoch": 0.15487502457430408,
+      "grad_norm": 0.0024001640267670155,
+      "learning_rate": 0.001,
+      "loss": 0.4511,
+      "step": 5613
+    },
+    {
+      "epoch": 0.15490261677536843,
+      "grad_norm": 0.004838323220610619,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 5614
+    },
+    {
+      "epoch": 0.1549302089764328,
+      "grad_norm": 0.003309109481051564,
+      "learning_rate": 0.001,
+      "loss": 0.3565,
+      "step": 5615
+    },
+    {
+      "epoch": 0.1549578011774972,
+      "grad_norm": 0.0025519407354295254,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 5616
+    },
+    {
+      "epoch": 0.15498539337856154,
+      "grad_norm": 0.002377223689109087,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 5617
+    },
+    {
+      "epoch": 0.15501298557962592,
+      "grad_norm": 0.002599177649244666,
+      "learning_rate": 0.001,
+      "loss": 0.4414,
+      "step": 5618
+    },
+    {
+      "epoch": 0.15504057778069028,
+      "grad_norm": 0.0029773900751024485,
+      "learning_rate": 0.001,
+      "loss": 0.4301,
+      "step": 5619
+    },
+    {
+      "epoch": 0.15506816998175466,
+      "grad_norm": 0.0042325593531131744,
+      "learning_rate": 0.001,
+      "loss": 0.4303,
+      "step": 5620
+    },
+    {
+      "epoch": 0.15509576218281904,
+      "grad_norm": 0.0032665403559803963,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 5621
+    },
+    {
+      "epoch": 0.1551233543838834,
+      "grad_norm": 0.00355579168535769,
+      "learning_rate": 0.001,
+      "loss": 0.3707,
+      "step": 5622
+    },
+    {
+      "epoch": 0.15515094658494777,
+      "grad_norm": 0.0023576144594699144,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 5623
+    },
+    {
+      "epoch": 0.15517853878601212,
+      "grad_norm": 0.0028574687894433737,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 5624
+    },
+    {
+      "epoch": 0.1552061309870765,
+      "grad_norm": 0.0038885304238647223,
+      "learning_rate": 0.001,
+      "loss": 0.4392,
+      "step": 5625
+    },
+    {
+      "epoch": 0.15523372318814088,
+      "grad_norm": 0.0036933154333382845,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 5626
+    },
+    {
+      "epoch": 0.15526131538920523,
+      "grad_norm": 0.0026938861701637506,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 5627
+    },
+    {
+      "epoch": 0.15528890759026961,
+      "grad_norm": 0.00399070093408227,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 5628
+    },
+    {
+      "epoch": 0.15531649979133397,
+      "grad_norm": 0.004512227140367031,
+      "learning_rate": 0.001,
+      "loss": 0.4562,
+      "step": 5629
+    },
+    {
+      "epoch": 0.15534409199239835,
+      "grad_norm": 0.006776070687919855,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 5630
+    },
+    {
+      "epoch": 0.15537168419346273,
+      "grad_norm": 0.006678242702037096,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 5631
+    },
+    {
+      "epoch": 0.15539927639452708,
+      "grad_norm": 0.0036010188050568104,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 5632
+    },
+    {
+      "epoch": 0.15542686859559146,
+      "grad_norm": 0.0073088183999061584,
+      "learning_rate": 0.001,
+      "loss": 0.3522,
+      "step": 5633
+    },
+    {
+      "epoch": 0.1554544607966558,
+      "grad_norm": 0.005029013846069574,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 5634
+    },
+    {
+      "epoch": 0.1554820529977202,
+      "grad_norm": 0.002414390444755554,
+      "learning_rate": 0.001,
+      "loss": 0.4185,
+      "step": 5635
+    },
+    {
+      "epoch": 0.15550964519878457,
+      "grad_norm": 0.0036553817335516214,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 5636
+    },
+    {
+      "epoch": 0.15553723739984893,
+      "grad_norm": 0.005032503046095371,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 5637
+    },
+    {
+      "epoch": 0.1555648296009133,
+      "grad_norm": 0.0028513423167169094,
+      "learning_rate": 0.001,
+      "loss": 0.4276,
+      "step": 5638
+    },
+    {
+      "epoch": 0.15559242180197766,
+      "grad_norm": 0.004651295021176338,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 5639
+    },
+    {
+      "epoch": 0.15562001400304204,
+      "grad_norm": 0.0026212476659566164,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 5640
+    },
+    {
+      "epoch": 0.15564760620410642,
+      "grad_norm": 0.0022126201074570417,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 5641
+    },
+    {
+      "epoch": 0.15567519840517077,
+      "grad_norm": 0.002519601956009865,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 5642
+    },
+    {
+      "epoch": 0.15570279060623515,
+      "grad_norm": 0.002760083880275488,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 5643
+    },
+    {
+      "epoch": 0.1557303828072995,
+      "grad_norm": 0.003575262613594532,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 5644
+    },
+    {
+      "epoch": 0.15575797500836389,
+      "grad_norm": 0.0028452950064092875,
+      "learning_rate": 0.001,
+      "loss": 0.3747,
+      "step": 5645
+    },
+    {
+      "epoch": 0.15578556720942827,
+      "grad_norm": 0.0025837207213044167,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 5646
+    },
+    {
+      "epoch": 0.15581315941049262,
+      "grad_norm": 0.0023428683634847403,
+      "learning_rate": 0.001,
+      "loss": 0.4274,
+      "step": 5647
+    },
+    {
+      "epoch": 0.155840751611557,
+      "grad_norm": 0.0025679313112050295,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 5648
+    },
+    {
+      "epoch": 0.15586834381262135,
+      "grad_norm": 0.002796493237838149,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 5649
+    },
+    {
+      "epoch": 0.15589593601368573,
+      "grad_norm": 0.002868711482733488,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 5650
+    },
+    {
+      "epoch": 0.1559235282147501,
+      "grad_norm": 0.0034229152370244265,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 5651
+    },
+    {
+      "epoch": 0.15595112041581446,
+      "grad_norm": 0.0044869715347886086,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 5652
+    },
+    {
+      "epoch": 0.15597871261687885,
+      "grad_norm": 0.0027872773353010416,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 5653
+    },
+    {
+      "epoch": 0.1560063048179432,
+      "grad_norm": 0.007318846881389618,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 5654
+    },
+    {
+      "epoch": 0.15603389701900758,
+      "grad_norm": 0.003985683433711529,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 5655
+    },
+    {
+      "epoch": 0.15606148922007196,
+      "grad_norm": 0.002402941230684519,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 5656
+    },
+    {
+      "epoch": 0.1560890814211363,
+      "grad_norm": 0.00238110963255167,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 5657
+    },
+    {
+      "epoch": 0.1561166736222007,
+      "grad_norm": 0.00369979883544147,
+      "learning_rate": 0.001,
+      "loss": 0.3758,
+      "step": 5658
+    },
+    {
+      "epoch": 0.15614426582326504,
+      "grad_norm": 0.0022104033268988132,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 5659
+    },
+    {
+      "epoch": 0.15617185802432942,
+      "grad_norm": 0.0024638620670884848,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 5660
+    },
+    {
+      "epoch": 0.1561994502253938,
+      "grad_norm": 0.0023760111071169376,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 5661
+    },
+    {
+      "epoch": 0.15622704242645816,
+      "grad_norm": 0.002687407424673438,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 5662
+    },
+    {
+      "epoch": 0.15625463462752254,
+      "grad_norm": 0.002467637648805976,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 5663
+    },
+    {
+      "epoch": 0.1562822268285869,
+      "grad_norm": 0.0020871292799711227,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 5664
+    },
+    {
+      "epoch": 0.15630981902965127,
+      "grad_norm": 0.0048971157521009445,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 5665
+    },
+    {
+      "epoch": 0.15633741123071565,
+      "grad_norm": 0.002223189687356353,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 5666
+    },
+    {
+      "epoch": 0.15636500343178,
+      "grad_norm": 0.0026094792410731316,
+      "learning_rate": 0.001,
+      "loss": 0.4407,
+      "step": 5667
+    },
+    {
+      "epoch": 0.15639259563284438,
+      "grad_norm": 0.004317654296755791,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 5668
+    },
+    {
+      "epoch": 0.15642018783390874,
+      "grad_norm": 0.0028076712042093277,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 5669
+    },
+    {
+      "epoch": 0.15644778003497312,
+      "grad_norm": 0.0033742813393473625,
+      "learning_rate": 0.001,
+      "loss": 0.4305,
+      "step": 5670
+    },
+    {
+      "epoch": 0.1564753722360375,
+      "grad_norm": 0.004012165125459433,
+      "learning_rate": 0.001,
+      "loss": 0.3531,
+      "step": 5671
+    },
+    {
+      "epoch": 0.15650296443710185,
+      "grad_norm": 0.0029937534127384424,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 5672
+    },
+    {
+      "epoch": 0.15653055663816623,
+      "grad_norm": 0.004357090685516596,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 5673
+    },
+    {
+      "epoch": 0.15655814883923058,
+      "grad_norm": 0.006484336219727993,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 5674
+    },
+    {
+      "epoch": 0.15658574104029496,
+      "grad_norm": 0.0038871821016073227,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 5675
+    },
+    {
+      "epoch": 0.15661333324135934,
+      "grad_norm": 0.008595773950219154,
+      "learning_rate": 0.001,
+      "loss": 0.3821,
+      "step": 5676
+    },
+    {
+      "epoch": 0.1566409254424237,
+      "grad_norm": 0.009347690269351006,
+      "learning_rate": 0.001,
+      "loss": 0.3448,
+      "step": 5677
+    },
+    {
+      "epoch": 0.15666851764348808,
+      "grad_norm": 0.002826757961884141,
+      "learning_rate": 0.001,
+      "loss": 0.3628,
+      "step": 5678
+    },
+    {
+      "epoch": 0.15669610984455243,
+      "grad_norm": 0.003515382297337055,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 5679
+    },
+    {
+      "epoch": 0.1567237020456168,
+      "grad_norm": 0.0032721932511776686,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 5680
+    },
+    {
+      "epoch": 0.1567512942466812,
+      "grad_norm": 0.003463307162746787,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 5681
+    },
+    {
+      "epoch": 0.15677888644774554,
+      "grad_norm": 0.003177905920892954,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 5682
+    },
+    {
+      "epoch": 0.15680647864880992,
+      "grad_norm": 0.003270684042945504,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 5683
+    },
+    {
+      "epoch": 0.15683407084987427,
+      "grad_norm": 0.0029053555335849524,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 5684
+    },
+    {
+      "epoch": 0.15686166305093865,
+      "grad_norm": 0.004513274412602186,
+      "learning_rate": 0.001,
+      "loss": 0.3607,
+      "step": 5685
+    },
+    {
+      "epoch": 0.15688925525200303,
+      "grad_norm": 0.012942062690854073,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 5686
+    },
+    {
+      "epoch": 0.1569168474530674,
+      "grad_norm": 0.004073042422533035,
+      "learning_rate": 0.001,
+      "loss": 0.3747,
+      "step": 5687
+    },
+    {
+      "epoch": 0.15694443965413177,
+      "grad_norm": 0.0024588184896856546,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 5688
+    },
+    {
+      "epoch": 0.15697203185519612,
+      "grad_norm": 0.002686630468815565,
+      "learning_rate": 0.001,
+      "loss": 0.4188,
+      "step": 5689
+    },
+    {
+      "epoch": 0.1569996240562605,
+      "grad_norm": 0.0031974229495972395,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 5690
+    },
+    {
+      "epoch": 0.15702721625732488,
+      "grad_norm": 0.0040585664100945,
+      "learning_rate": 0.001,
+      "loss": 0.3606,
+      "step": 5691
+    },
+    {
+      "epoch": 0.15705480845838923,
+      "grad_norm": 0.005035550333559513,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 5692
+    },
+    {
+      "epoch": 0.1570824006594536,
+      "grad_norm": 0.005043357145041227,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 5693
+    },
+    {
+      "epoch": 0.15710999286051797,
+      "grad_norm": 0.002406257903203368,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 5694
+    },
+    {
+      "epoch": 0.15713758506158235,
+      "grad_norm": 0.002501264214515686,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 5695
+    },
+    {
+      "epoch": 0.1571651772626467,
+      "grad_norm": 0.003250819630920887,
+      "learning_rate": 0.001,
+      "loss": 0.3747,
+      "step": 5696
+    },
+    {
+      "epoch": 0.15719276946371108,
+      "grad_norm": 0.0020628594793379307,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 5697
+    },
+    {
+      "epoch": 0.15722036166477546,
+      "grad_norm": 0.002165653510019183,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 5698
+    },
+    {
+      "epoch": 0.1572479538658398,
+      "grad_norm": 0.003081389470025897,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 5699
+    },
+    {
+      "epoch": 0.1572755460669042,
+      "grad_norm": 0.0035273549146950245,
+      "learning_rate": 0.001,
+      "loss": 0.4187,
+      "step": 5700
+    },
+    {
+      "epoch": 0.15730313826796855,
+      "grad_norm": 0.0023170190397650003,
+      "learning_rate": 0.001,
+      "loss": 0.4429,
+      "step": 5701
+    },
+    {
+      "epoch": 0.15733073046903293,
+      "grad_norm": 0.0033377818763256073,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 5702
+    },
+    {
+      "epoch": 0.1573583226700973,
+      "grad_norm": 0.0030271231662482023,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 5703
+    },
+    {
+      "epoch": 0.15738591487116166,
+      "grad_norm": 0.002670741407200694,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 5704
+    },
+    {
+      "epoch": 0.15741350707222604,
+      "grad_norm": 0.0025286702439188957,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 5705
+    },
+    {
+      "epoch": 0.1574410992732904,
+      "grad_norm": 0.002894239965826273,
+      "learning_rate": 0.001,
+      "loss": 0.4041,
+      "step": 5706
+    },
+    {
+      "epoch": 0.15746869147435477,
+      "grad_norm": 0.0030300067737698555,
+      "learning_rate": 0.001,
+      "loss": 0.4297,
+      "step": 5707
+    },
+    {
+      "epoch": 0.15749628367541915,
+      "grad_norm": 0.0026801263447850943,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 5708
+    },
+    {
+      "epoch": 0.1575238758764835,
+      "grad_norm": 0.0029794182628393173,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 5709
+    },
+    {
+      "epoch": 0.15755146807754788,
+      "grad_norm": 0.0026555589865893126,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 5710
+    },
+    {
+      "epoch": 0.15757906027861224,
+      "grad_norm": 0.0025151160079985857,
+      "learning_rate": 0.001,
+      "loss": 0.3574,
+      "step": 5711
+    },
+    {
+      "epoch": 0.15760665247967662,
+      "grad_norm": 0.0026525617577135563,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 5712
+    },
+    {
+      "epoch": 0.157634244680741,
+      "grad_norm": 0.002047165296971798,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 5713
+    },
+    {
+      "epoch": 0.15766183688180535,
+      "grad_norm": 0.00422664824873209,
+      "learning_rate": 0.001,
+      "loss": 0.3657,
+      "step": 5714
+    },
+    {
+      "epoch": 0.15768942908286973,
+      "grad_norm": 0.005134572274982929,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 5715
+    },
+    {
+      "epoch": 0.15771702128393408,
+      "grad_norm": 0.004460239317268133,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 5716
+    },
+    {
+      "epoch": 0.15774461348499846,
+      "grad_norm": 0.0029927766881883144,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 5717
+    },
+    {
+      "epoch": 0.15777220568606284,
+      "grad_norm": 0.005971200298517942,
+      "learning_rate": 0.001,
+      "loss": 0.3673,
+      "step": 5718
+    },
+    {
+      "epoch": 0.1577997978871272,
+      "grad_norm": 0.0025469562970101833,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 5719
+    },
+    {
+      "epoch": 0.15782739008819158,
+      "grad_norm": 0.0036222515627741814,
+      "learning_rate": 0.001,
+      "loss": 0.3634,
+      "step": 5720
+    },
+    {
+      "epoch": 0.15785498228925593,
+      "grad_norm": 0.002309228293597698,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 5721
+    },
+    {
+      "epoch": 0.1578825744903203,
+      "grad_norm": 0.006437615491449833,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 5722
+    },
+    {
+      "epoch": 0.1579101666913847,
+      "grad_norm": 0.0027779482770711184,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 5723
+    },
+    {
+      "epoch": 0.15793775889244904,
+      "grad_norm": 0.006843385752290487,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 5724
+    },
+    {
+      "epoch": 0.15796535109351342,
+      "grad_norm": 0.005818149074912071,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 5725
+    },
+    {
+      "epoch": 0.15799294329457778,
+      "grad_norm": 0.004674671217799187,
+      "learning_rate": 0.001,
+      "loss": 0.4334,
+      "step": 5726
+    },
+    {
+      "epoch": 0.15802053549564216,
+      "grad_norm": 0.0025879840832203627,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 5727
+    },
+    {
+      "epoch": 0.15804812769670654,
+      "grad_norm": 0.0030769093427807093,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 5728
+    },
+    {
+      "epoch": 0.1580757198977709,
+      "grad_norm": 0.0031094071455299854,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 5729
+    },
+    {
+      "epoch": 0.15810331209883527,
+      "grad_norm": 0.0025110999122262,
+      "learning_rate": 0.001,
+      "loss": 0.4245,
+      "step": 5730
+    },
+    {
+      "epoch": 0.15813090429989962,
+      "grad_norm": 0.003311107400804758,
+      "learning_rate": 0.001,
+      "loss": 0.4133,
+      "step": 5731
+    },
+    {
+      "epoch": 0.158158496500964,
+      "grad_norm": 0.003340107621625066,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 5732
+    },
+    {
+      "epoch": 0.15818608870202838,
+      "grad_norm": 0.004575404338538647,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 5733
+    },
+    {
+      "epoch": 0.15821368090309273,
+      "grad_norm": 0.0022902588825672865,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 5734
+    },
+    {
+      "epoch": 0.15824127310415712,
+      "grad_norm": 0.0030541005544364452,
+      "learning_rate": 0.001,
+      "loss": 0.4236,
+      "step": 5735
+    },
+    {
+      "epoch": 0.15826886530522147,
+      "grad_norm": 0.002666509710252285,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 5736
+    },
+    {
+      "epoch": 0.15829645750628585,
+      "grad_norm": 0.0029707769863307476,
+      "learning_rate": 0.001,
+      "loss": 0.4407,
+      "step": 5737
+    },
+    {
+      "epoch": 0.15832404970735023,
+      "grad_norm": 0.0034186067059636116,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 5738
+    },
+    {
+      "epoch": 0.15835164190841458,
+      "grad_norm": 0.002819865709170699,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 5739
+    },
+    {
+      "epoch": 0.15837923410947896,
+      "grad_norm": 0.004903051070868969,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 5740
+    },
+    {
+      "epoch": 0.1584068263105433,
+      "grad_norm": 0.0036358917132019997,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 5741
+    },
+    {
+      "epoch": 0.1584344185116077,
+      "grad_norm": 0.004073919262737036,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 5742
+    },
+    {
+      "epoch": 0.15846201071267207,
+      "grad_norm": 0.0033164892811328173,
+      "learning_rate": 0.001,
+      "loss": 0.431,
+      "step": 5743
+    },
+    {
+      "epoch": 0.15848960291373643,
+      "grad_norm": 0.008852209895849228,
+      "learning_rate": 0.001,
+      "loss": 0.4249,
+      "step": 5744
+    },
+    {
+      "epoch": 0.1585171951148008,
+      "grad_norm": 0.004943115636706352,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 5745
+    },
+    {
+      "epoch": 0.15854478731586516,
+      "grad_norm": 0.0025044376961886883,
+      "learning_rate": 0.001,
+      "loss": 0.4301,
+      "step": 5746
+    },
+    {
+      "epoch": 0.15857237951692954,
+      "grad_norm": 0.0032009256538003683,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 5747
+    },
+    {
+      "epoch": 0.15859997171799392,
+      "grad_norm": 0.002845719689503312,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 5748
+    },
+    {
+      "epoch": 0.15862756391905827,
+      "grad_norm": 0.00325995241291821,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 5749
+    },
+    {
+      "epoch": 0.15865515612012265,
+      "grad_norm": 0.0026871482841670513,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 5750
+    },
+    {
+      "epoch": 0.158682748321187,
+      "grad_norm": 0.0027641262859106064,
+      "learning_rate": 0.001,
+      "loss": 0.3618,
+      "step": 5751
+    },
+    {
+      "epoch": 0.1587103405222514,
+      "grad_norm": 0.003129362827166915,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 5752
+    },
+    {
+      "epoch": 0.15873793272331577,
+      "grad_norm": 0.0031827734783291817,
+      "learning_rate": 0.001,
+      "loss": 0.3633,
+      "step": 5753
+    },
+    {
+      "epoch": 0.15876552492438012,
+      "grad_norm": 0.0046300506219267845,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 5754
+    },
+    {
+      "epoch": 0.1587931171254445,
+      "grad_norm": 0.00367173389531672,
+      "learning_rate": 0.001,
+      "loss": 0.4506,
+      "step": 5755
+    },
+    {
+      "epoch": 0.15882070932650885,
+      "grad_norm": 0.0031661444809287786,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 5756
+    },
+    {
+      "epoch": 0.15884830152757323,
+      "grad_norm": 0.002517246874049306,
+      "learning_rate": 0.001,
+      "loss": 0.366,
+      "step": 5757
+    },
+    {
+      "epoch": 0.1588758937286376,
+      "grad_norm": 0.0026858425699174404,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 5758
+    },
+    {
+      "epoch": 0.15890348592970197,
+      "grad_norm": 0.0026828250847756863,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 5759
+    },
+    {
+      "epoch": 0.15893107813076635,
+      "grad_norm": 0.002697891555726528,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 5760
+    },
+    {
+      "epoch": 0.1589586703318307,
+      "grad_norm": 0.004389958921819925,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 5761
+    },
+    {
+      "epoch": 0.15898626253289508,
+      "grad_norm": 0.003987609874457121,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 5762
+    },
+    {
+      "epoch": 0.15901385473395946,
+      "grad_norm": 0.003649795660749078,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 5763
+    },
+    {
+      "epoch": 0.1590414469350238,
+      "grad_norm": 0.0034654231276363134,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 5764
+    },
+    {
+      "epoch": 0.1590690391360882,
+      "grad_norm": 0.005718625150620937,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 5765
+    },
+    {
+      "epoch": 0.15909663133715254,
+      "grad_norm": 0.0025557433255016804,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 5766
+    },
+    {
+      "epoch": 0.15912422353821692,
+      "grad_norm": 0.002460428746417165,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 5767
+    },
+    {
+      "epoch": 0.1591518157392813,
+      "grad_norm": 0.007104699965566397,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 5768
+    },
+    {
+      "epoch": 0.15917940794034566,
+      "grad_norm": 0.006836418528109789,
+      "learning_rate": 0.001,
+      "loss": 0.3664,
+      "step": 5769
+    },
+    {
+      "epoch": 0.15920700014141004,
+      "grad_norm": 0.0034382655285298824,
+      "learning_rate": 0.001,
+      "loss": 0.4309,
+      "step": 5770
+    },
+    {
+      "epoch": 0.1592345923424744,
+      "grad_norm": 0.0040556564927101135,
+      "learning_rate": 0.001,
+      "loss": 0.3581,
+      "step": 5771
+    },
+    {
+      "epoch": 0.15926218454353877,
+      "grad_norm": 0.0038001069333404303,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 5772
+    },
+    {
+      "epoch": 0.15928977674460315,
+      "grad_norm": 0.004240202251821756,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 5773
+    },
+    {
+      "epoch": 0.1593173689456675,
+      "grad_norm": 0.004947451408952475,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 5774
+    },
+    {
+      "epoch": 0.15934496114673188,
+      "grad_norm": 0.002301145112141967,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 5775
+    },
+    {
+      "epoch": 0.15937255334779624,
+      "grad_norm": 0.003091468010097742,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 5776
+    },
+    {
+      "epoch": 0.15940014554886062,
+      "grad_norm": 0.004508181009441614,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 5777
+    },
+    {
+      "epoch": 0.159427737749925,
+      "grad_norm": 0.002182744676247239,
+      "learning_rate": 0.001,
+      "loss": 0.4129,
+      "step": 5778
+    },
+    {
+      "epoch": 0.15945532995098935,
+      "grad_norm": 0.00251766131259501,
+      "learning_rate": 0.001,
+      "loss": 0.4302,
+      "step": 5779
+    },
+    {
+      "epoch": 0.15948292215205373,
+      "grad_norm": 0.002740236232057214,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 5780
+    },
+    {
+      "epoch": 0.15951051435311808,
+      "grad_norm": 0.005189212504774332,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 5781
+    },
+    {
+      "epoch": 0.15953810655418246,
+      "grad_norm": 0.002994902664795518,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 5782
+    },
+    {
+      "epoch": 0.15956569875524684,
+      "grad_norm": 0.003750283271074295,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 5783
+    },
+    {
+      "epoch": 0.1595932909563112,
+      "grad_norm": 0.003004671772941947,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 5784
+    },
+    {
+      "epoch": 0.15962088315737558,
+      "grad_norm": 0.004584962036460638,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 5785
+    },
+    {
+      "epoch": 0.15964847535843993,
+      "grad_norm": 0.005403813440352678,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 5786
+    },
+    {
+      "epoch": 0.1596760675595043,
+      "grad_norm": 0.0032318695448338985,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 5787
+    },
+    {
+      "epoch": 0.15970365976056866,
+      "grad_norm": 0.002624890301376581,
+      "learning_rate": 0.001,
+      "loss": 0.369,
+      "step": 5788
+    },
+    {
+      "epoch": 0.15973125196163304,
+      "grad_norm": 0.004142098128795624,
+      "learning_rate": 0.001,
+      "loss": 0.3614,
+      "step": 5789
+    },
+    {
+      "epoch": 0.15975884416269742,
+      "grad_norm": 0.002787158126011491,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 5790
+    },
+    {
+      "epoch": 0.15978643636376177,
+      "grad_norm": 0.004009711556136608,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 5791
+    },
+    {
+      "epoch": 0.15981402856482615,
+      "grad_norm": 0.002381292637437582,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 5792
+    },
+    {
+      "epoch": 0.1598416207658905,
+      "grad_norm": 0.0021126163192093372,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 5793
+    },
+    {
+      "epoch": 0.1598692129669549,
+      "grad_norm": 0.0049142311327159405,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 5794
+    },
+    {
+      "epoch": 0.15989680516801927,
+      "grad_norm": 0.0029681632295250893,
+      "learning_rate": 0.001,
+      "loss": 0.3565,
+      "step": 5795
+    },
+    {
+      "epoch": 0.15992439736908362,
+      "grad_norm": 0.002932732691988349,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 5796
+    },
+    {
+      "epoch": 0.159951989570148,
+      "grad_norm": 0.002819716464728117,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 5797
+    },
+    {
+      "epoch": 0.15997958177121235,
+      "grad_norm": 0.0028946103993803263,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 5798
+    },
+    {
+      "epoch": 0.16000717397227673,
+      "grad_norm": 0.0024298951029777527,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 5799
+    },
+    {
+      "epoch": 0.16003476617334111,
+      "grad_norm": 0.002949990564957261,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 5800
+    },
+    {
+      "epoch": 0.16006235837440547,
+      "grad_norm": 0.003484488232061267,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 5801
+    },
+    {
+      "epoch": 0.16008995057546985,
+      "grad_norm": 0.004663324449211359,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 5802
+    },
+    {
+      "epoch": 0.1601175427765342,
+      "grad_norm": 0.003363480092957616,
+      "learning_rate": 0.001,
+      "loss": 0.4289,
+      "step": 5803
+    },
+    {
+      "epoch": 0.16014513497759858,
+      "grad_norm": 0.0023565501905977726,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 5804
+    },
+    {
+      "epoch": 0.16017272717866296,
+      "grad_norm": 0.002411758527159691,
+      "learning_rate": 0.001,
+      "loss": 0.4388,
+      "step": 5805
+    },
+    {
+      "epoch": 0.1602003193797273,
+      "grad_norm": 0.0026753805577754974,
+      "learning_rate": 0.001,
+      "loss": 0.4267,
+      "step": 5806
+    },
+    {
+      "epoch": 0.1602279115807917,
+      "grad_norm": 0.003161199390888214,
+      "learning_rate": 0.001,
+      "loss": 0.3725,
+      "step": 5807
+    },
+    {
+      "epoch": 0.16025550378185605,
+      "grad_norm": 0.0042272647842764854,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 5808
+    },
+    {
+      "epoch": 0.16028309598292043,
+      "grad_norm": 0.0023989747278392315,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 5809
+    },
+    {
+      "epoch": 0.1603106881839848,
+      "grad_norm": 0.003144536865875125,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 5810
+    },
+    {
+      "epoch": 0.16033828038504916,
+      "grad_norm": 0.002298231702297926,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 5811
+    },
+    {
+      "epoch": 0.16036587258611354,
+      "grad_norm": 0.005583870690315962,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 5812
+    },
+    {
+      "epoch": 0.1603934647871779,
+      "grad_norm": 0.002705441555008292,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 5813
+    },
+    {
+      "epoch": 0.16042105698824227,
+      "grad_norm": 0.0025329829659312963,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 5814
+    },
+    {
+      "epoch": 0.16044864918930665,
+      "grad_norm": 0.00348359951749444,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 5815
+    },
+    {
+      "epoch": 0.160476241390371,
+      "grad_norm": 0.005613440182060003,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 5816
+    },
+    {
+      "epoch": 0.16050383359143539,
+      "grad_norm": 0.005014996509999037,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 5817
+    },
+    {
+      "epoch": 0.16053142579249974,
+      "grad_norm": 0.003465597052127123,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 5818
+    },
+    {
+      "epoch": 0.16055901799356412,
+      "grad_norm": 0.003062979318201542,
+      "learning_rate": 0.001,
+      "loss": 0.3688,
+      "step": 5819
+    },
+    {
+      "epoch": 0.1605866101946285,
+      "grad_norm": 0.003151744371280074,
+      "learning_rate": 0.001,
+      "loss": 0.3744,
+      "step": 5820
+    },
+    {
+      "epoch": 0.16061420239569285,
+      "grad_norm": 0.0023994911462068558,
+      "learning_rate": 0.001,
+      "loss": 0.4317,
+      "step": 5821
+    },
+    {
+      "epoch": 0.16064179459675723,
+      "grad_norm": 0.002751079387962818,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 5822
+    },
+    {
+      "epoch": 0.16066938679782158,
+      "grad_norm": 0.0021729813888669014,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 5823
+    },
+    {
+      "epoch": 0.16069697899888596,
+      "grad_norm": 0.0027657474856823683,
+      "learning_rate": 0.001,
+      "loss": 0.4277,
+      "step": 5824
+    },
+    {
+      "epoch": 0.16072457119995034,
+      "grad_norm": 0.0026696647983044386,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 5825
+    },
+    {
+      "epoch": 0.1607521634010147,
+      "grad_norm": 0.0028585640247911215,
+      "learning_rate": 0.001,
+      "loss": 0.3727,
+      "step": 5826
+    },
+    {
+      "epoch": 0.16077975560207908,
+      "grad_norm": 0.003272457979619503,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 5827
+    },
+    {
+      "epoch": 0.16080734780314343,
+      "grad_norm": 0.004849962890148163,
+      "learning_rate": 0.001,
+      "loss": 0.3662,
+      "step": 5828
+    },
+    {
+      "epoch": 0.1608349400042078,
+      "grad_norm": 0.0022566032130271196,
+      "learning_rate": 0.001,
+      "loss": 0.4437,
+      "step": 5829
+    },
+    {
+      "epoch": 0.1608625322052722,
+      "grad_norm": 0.0026730254758149385,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 5830
+    },
+    {
+      "epoch": 0.16089012440633654,
+      "grad_norm": 0.005255574360489845,
+      "learning_rate": 0.001,
+      "loss": 0.4395,
+      "step": 5831
+    },
+    {
+      "epoch": 0.16091771660740092,
+      "grad_norm": 0.0031837450806051493,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 5832
+    },
+    {
+      "epoch": 0.16094530880846528,
+      "grad_norm": 0.0028844368644058704,
+      "learning_rate": 0.001,
+      "loss": 0.4413,
+      "step": 5833
+    },
+    {
+      "epoch": 0.16097290100952966,
+      "grad_norm": 0.0036064505111426115,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 5834
+    },
+    {
+      "epoch": 0.16100049321059404,
+      "grad_norm": 0.0031712886411696672,
+      "learning_rate": 0.001,
+      "loss": 0.3584,
+      "step": 5835
+    },
+    {
+      "epoch": 0.1610280854116584,
+      "grad_norm": 0.0029163220897316933,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 5836
+    },
+    {
+      "epoch": 0.16105567761272277,
+      "grad_norm": 0.002263015601783991,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 5837
+    },
+    {
+      "epoch": 0.16108326981378712,
+      "grad_norm": 0.0027008019387722015,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 5838
+    },
+    {
+      "epoch": 0.1611108620148515,
+      "grad_norm": 0.002291950862854719,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 5839
+    },
+    {
+      "epoch": 0.16113845421591588,
+      "grad_norm": 0.003169293748214841,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 5840
+    },
+    {
+      "epoch": 0.16116604641698024,
+      "grad_norm": 0.0032045808620750904,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 5841
+    },
+    {
+      "epoch": 0.16119363861804462,
+      "grad_norm": 0.00491705909371376,
+      "learning_rate": 0.001,
+      "loss": 0.3684,
+      "step": 5842
+    },
+    {
+      "epoch": 0.16122123081910897,
+      "grad_norm": 0.007085980381816626,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 5843
+    },
+    {
+      "epoch": 0.16124882302017335,
+      "grad_norm": 0.0026187379844486713,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 5844
+    },
+    {
+      "epoch": 0.16127641522123773,
+      "grad_norm": 0.0021291703451424837,
+      "learning_rate": 0.001,
+      "loss": 0.4321,
+      "step": 5845
+    },
+    {
+      "epoch": 0.16130400742230208,
+      "grad_norm": 0.0025390980299562216,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 5846
+    },
+    {
+      "epoch": 0.16133159962336646,
+      "grad_norm": 0.0024054215755313635,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 5847
+    },
+    {
+      "epoch": 0.16135919182443081,
+      "grad_norm": 0.0030334896873682737,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 5848
+    },
+    {
+      "epoch": 0.1613867840254952,
+      "grad_norm": 0.004912371281534433,
+      "learning_rate": 0.001,
+      "loss": 0.3735,
+      "step": 5849
+    },
+    {
+      "epoch": 0.16141437622655957,
+      "grad_norm": 0.00569628132507205,
+      "learning_rate": 0.001,
+      "loss": 0.4294,
+      "step": 5850
+    },
+    {
+      "epoch": 0.16144196842762393,
+      "grad_norm": 0.01722198724746704,
+      "learning_rate": 0.001,
+      "loss": 0.3729,
+      "step": 5851
+    },
+    {
+      "epoch": 0.1614695606286883,
+      "grad_norm": 0.00705372542142868,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 5852
+    },
+    {
+      "epoch": 0.16149715282975266,
+      "grad_norm": 0.0025785481557250023,
+      "learning_rate": 0.001,
+      "loss": 0.4384,
+      "step": 5853
+    },
+    {
+      "epoch": 0.16152474503081704,
+      "grad_norm": 0.0024269360583275557,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 5854
+    },
+    {
+      "epoch": 0.16155233723188142,
+      "grad_norm": 0.0034063782077282667,
+      "learning_rate": 0.001,
+      "loss": 0.4463,
+      "step": 5855
+    },
+    {
+      "epoch": 0.16157992943294577,
+      "grad_norm": 0.0034151726868003607,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 5856
+    },
+    {
+      "epoch": 0.16160752163401015,
+      "grad_norm": 0.0038089072331786156,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 5857
+    },
+    {
+      "epoch": 0.1616351138350745,
+      "grad_norm": 0.0021995375864207745,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 5858
+    },
+    {
+      "epoch": 0.1616627060361389,
+      "grad_norm": 0.004200818948447704,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 5859
+    },
+    {
+      "epoch": 0.16169029823720327,
+      "grad_norm": 0.002936551347374916,
+      "learning_rate": 0.001,
+      "loss": 0.3806,
+      "step": 5860
+    },
+    {
+      "epoch": 0.16171789043826762,
+      "grad_norm": 0.0025540448259562254,
+      "learning_rate": 0.001,
+      "loss": 0.4272,
+      "step": 5861
+    },
+    {
+      "epoch": 0.161745482639332,
+      "grad_norm": 0.0026053600013256073,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 5862
+    },
+    {
+      "epoch": 0.16177307484039635,
+      "grad_norm": 0.0023329800460487604,
+      "learning_rate": 0.001,
+      "loss": 0.3666,
+      "step": 5863
+    },
+    {
+      "epoch": 0.16180066704146073,
+      "grad_norm": 0.004357020370662212,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 5864
+    },
+    {
+      "epoch": 0.1618282592425251,
+      "grad_norm": 0.0025468000676482916,
+      "learning_rate": 0.001,
+      "loss": 0.4233,
+      "step": 5865
+    },
+    {
+      "epoch": 0.16185585144358947,
+      "grad_norm": 0.004209857899695635,
+      "learning_rate": 0.001,
+      "loss": 0.362,
+      "step": 5866
+    },
+    {
+      "epoch": 0.16188344364465385,
+      "grad_norm": 0.005030781961977482,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 5867
+    },
+    {
+      "epoch": 0.1619110358457182,
+      "grad_norm": 0.0026541538536548615,
+      "learning_rate": 0.001,
+      "loss": 0.4342,
+      "step": 5868
+    },
+    {
+      "epoch": 0.16193862804678258,
+      "grad_norm": 0.0038330042734742165,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 5869
+    },
+    {
+      "epoch": 0.16196622024784696,
+      "grad_norm": 0.004821475129574537,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 5870
+    },
+    {
+      "epoch": 0.1619938124489113,
+      "grad_norm": 0.0028450365643948317,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 5871
+    },
+    {
+      "epoch": 0.1620214046499757,
+      "grad_norm": 0.0027043556328862906,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 5872
+    },
+    {
+      "epoch": 0.16204899685104004,
+      "grad_norm": 0.0035783706698566675,
+      "learning_rate": 0.001,
+      "loss": 0.3574,
+      "step": 5873
+    },
+    {
+      "epoch": 0.16207658905210443,
+      "grad_norm": 0.0049888514913618565,
+      "learning_rate": 0.001,
+      "loss": 0.3756,
+      "step": 5874
+    },
+    {
+      "epoch": 0.1621041812531688,
+      "grad_norm": 0.0031105815432965755,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 5875
+    },
+    {
+      "epoch": 0.16213177345423316,
+      "grad_norm": 0.0034052070695906878,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 5876
+    },
+    {
+      "epoch": 0.16215936565529754,
+      "grad_norm": 0.009355590678751469,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 5877
+    },
+    {
+      "epoch": 0.1621869578563619,
+      "grad_norm": 0.0034844528418034315,
+      "learning_rate": 0.001,
+      "loss": 0.4489,
+      "step": 5878
+    },
+    {
+      "epoch": 0.16221455005742627,
+      "grad_norm": 0.0033237759489566088,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 5879
+    },
+    {
+      "epoch": 0.16224214225849065,
+      "grad_norm": 0.0030154725536704063,
+      "learning_rate": 0.001,
+      "loss": 0.3679,
+      "step": 5880
+    },
+    {
+      "epoch": 0.162269734459555,
+      "grad_norm": 0.00340478727594018,
+      "learning_rate": 0.001,
+      "loss": 0.3674,
+      "step": 5881
+    },
+    {
+      "epoch": 0.16229732666061938,
+      "grad_norm": 0.003332821885123849,
+      "learning_rate": 0.001,
+      "loss": 0.3668,
+      "step": 5882
+    },
+    {
+      "epoch": 0.16232491886168374,
+      "grad_norm": 0.00302005629055202,
+      "learning_rate": 0.001,
+      "loss": 0.4317,
+      "step": 5883
+    },
+    {
+      "epoch": 0.16235251106274812,
+      "grad_norm": 0.0027085014153271914,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 5884
+    },
+    {
+      "epoch": 0.16238010326381247,
+      "grad_norm": 0.0031871707178652287,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 5885
+    },
+    {
+      "epoch": 0.16240769546487685,
+      "grad_norm": 0.0026585953310132027,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 5886
+    },
+    {
+      "epoch": 0.16243528766594123,
+      "grad_norm": 0.0022077590692788363,
+      "learning_rate": 0.001,
+      "loss": 0.3791,
+      "step": 5887
+    },
+    {
+      "epoch": 0.16246287986700558,
+      "grad_norm": 0.0099159125238657,
+      "learning_rate": 0.001,
+      "loss": 0.4363,
+      "step": 5888
+    },
+    {
+      "epoch": 0.16249047206806996,
+      "grad_norm": 0.004561326466500759,
+      "learning_rate": 0.001,
+      "loss": 0.4033,
+      "step": 5889
+    },
+    {
+      "epoch": 0.16251806426913432,
+      "grad_norm": 0.004134157672524452,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 5890
+    },
+    {
+      "epoch": 0.1625456564701987,
+      "grad_norm": 0.0028744975570589304,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 5891
+    },
+    {
+      "epoch": 0.16257324867126308,
+      "grad_norm": 0.0063214050605893135,
+      "learning_rate": 0.001,
+      "loss": 0.4302,
+      "step": 5892
+    },
+    {
+      "epoch": 0.16260084087232743,
+      "grad_norm": 0.0033373129554092884,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 5893
+    },
+    {
+      "epoch": 0.1626284330733918,
+      "grad_norm": 0.007856364361941814,
+      "learning_rate": 0.001,
+      "loss": 0.3508,
+      "step": 5894
+    },
+    {
+      "epoch": 0.16265602527445616,
+      "grad_norm": 0.005610327236354351,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 5895
+    },
+    {
+      "epoch": 0.16268361747552054,
+      "grad_norm": 0.0030173116829246283,
+      "learning_rate": 0.001,
+      "loss": 0.4308,
+      "step": 5896
+    },
+    {
+      "epoch": 0.16271120967658492,
+      "grad_norm": 0.007099445443600416,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 5897
+    },
+    {
+      "epoch": 0.16273880187764928,
+      "grad_norm": 0.0066776215098798275,
+      "learning_rate": 0.001,
+      "loss": 0.425,
+      "step": 5898
+    },
+    {
+      "epoch": 0.16276639407871366,
+      "grad_norm": 0.0048042964190244675,
+      "learning_rate": 0.001,
+      "loss": 0.4283,
+      "step": 5899
+    },
+    {
+      "epoch": 0.162793986279778,
+      "grad_norm": 0.013856537640094757,
+      "learning_rate": 0.001,
+      "loss": 0.3666,
+      "step": 5900
+    },
+    {
+      "epoch": 0.1628215784808424,
+      "grad_norm": 0.00262752384878695,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 5901
+    },
+    {
+      "epoch": 0.16284917068190677,
+      "grad_norm": 0.004112104419618845,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 5902
+    },
+    {
+      "epoch": 0.16287676288297112,
+      "grad_norm": 0.0023975521326065063,
+      "learning_rate": 0.001,
+      "loss": 0.4522,
+      "step": 5903
+    },
+    {
+      "epoch": 0.1629043550840355,
+      "grad_norm": 0.003720303997397423,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 5904
+    },
+    {
+      "epoch": 0.16293194728509985,
+      "grad_norm": 0.0029375324957072735,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 5905
+    },
+    {
+      "epoch": 0.16295953948616423,
+      "grad_norm": 0.003167448565363884,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 5906
+    },
+    {
+      "epoch": 0.16298713168722861,
+      "grad_norm": 0.0032344404608011246,
+      "learning_rate": 0.001,
+      "loss": 0.4288,
+      "step": 5907
+    },
+    {
+      "epoch": 0.16301472388829297,
+      "grad_norm": 0.0028254149947315454,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 5908
+    },
+    {
+      "epoch": 0.16304231608935735,
+      "grad_norm": 0.0026103025302290916,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 5909
+    },
+    {
+      "epoch": 0.1630699082904217,
+      "grad_norm": 0.0032244473695755005,
+      "learning_rate": 0.001,
+      "loss": 0.4391,
+      "step": 5910
+    },
+    {
+      "epoch": 0.16309750049148608,
+      "grad_norm": 0.005498593673110008,
+      "learning_rate": 0.001,
+      "loss": 0.3777,
+      "step": 5911
+    },
+    {
+      "epoch": 0.16312509269255046,
+      "grad_norm": 0.002438473980873823,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 5912
+    },
+    {
+      "epoch": 0.1631526848936148,
+      "grad_norm": 0.002533574588596821,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 5913
+    },
+    {
+      "epoch": 0.1631802770946792,
+      "grad_norm": 0.0030868607573211193,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 5914
+    },
+    {
+      "epoch": 0.16320786929574355,
+      "grad_norm": 0.002708959858864546,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 5915
+    },
+    {
+      "epoch": 0.16323546149680793,
+      "grad_norm": 0.0037433947436511517,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 5916
+    },
+    {
+      "epoch": 0.1632630536978723,
+      "grad_norm": 0.0034228821750730276,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 5917
+    },
+    {
+      "epoch": 0.16329064589893666,
+      "grad_norm": 0.0021487257909029722,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 5918
+    },
+    {
+      "epoch": 0.16331823810000104,
+      "grad_norm": 0.008542514406144619,
+      "learning_rate": 0.001,
+      "loss": 0.3763,
+      "step": 5919
+    },
+    {
+      "epoch": 0.1633458303010654,
+      "grad_norm": 0.0028414383996278048,
+      "learning_rate": 0.001,
+      "loss": 0.3694,
+      "step": 5920
+    },
+    {
+      "epoch": 0.16337342250212977,
+      "grad_norm": 0.003626716323196888,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 5921
+    },
+    {
+      "epoch": 0.16340101470319415,
+      "grad_norm": 0.004335957579314709,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 5922
+    },
+    {
+      "epoch": 0.1634286069042585,
+      "grad_norm": 0.003027878934517503,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 5923
+    },
+    {
+      "epoch": 0.16345619910532289,
+      "grad_norm": 0.002936540637165308,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 5924
+    },
+    {
+      "epoch": 0.16348379130638724,
+      "grad_norm": 0.004033949691802263,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 5925
+    },
+    {
+      "epoch": 0.16351138350745162,
+      "grad_norm": 0.003175846766680479,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 5926
+    },
+    {
+      "epoch": 0.163538975708516,
+      "grad_norm": 0.002765644108876586,
+      "learning_rate": 0.001,
+      "loss": 0.4257,
+      "step": 5927
+    },
+    {
+      "epoch": 0.16356656790958035,
+      "grad_norm": 0.0026081749238073826,
+      "learning_rate": 0.001,
+      "loss": 0.46,
+      "step": 5928
+    },
+    {
+      "epoch": 0.16359416011064473,
+      "grad_norm": 0.002697083866223693,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 5929
+    },
+    {
+      "epoch": 0.16362175231170908,
+      "grad_norm": 0.0040725781582295895,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 5930
+    },
+    {
+      "epoch": 0.16364934451277346,
+      "grad_norm": 0.0018469118513166904,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 5931
+    },
+    {
+      "epoch": 0.16367693671383785,
+      "grad_norm": 0.0027053162921220064,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 5932
+    },
+    {
+      "epoch": 0.1637045289149022,
+      "grad_norm": 0.0021168517414480448,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 5933
+    },
+    {
+      "epoch": 0.16373212111596658,
+      "grad_norm": 0.002157708862796426,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 5934
+    },
+    {
+      "epoch": 0.16375971331703093,
+      "grad_norm": 0.00347045436501503,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 5935
+    },
+    {
+      "epoch": 0.1637873055180953,
+      "grad_norm": 0.0034281827975064516,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 5936
+    },
+    {
+      "epoch": 0.1638148977191597,
+      "grad_norm": 0.005607214290648699,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 5937
+    },
+    {
+      "epoch": 0.16384248992022404,
+      "grad_norm": 0.0018512718379497528,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 5938
+    },
+    {
+      "epoch": 0.16387008212128842,
+      "grad_norm": 0.005284006241708994,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 5939
+    },
+    {
+      "epoch": 0.16389767432235278,
+      "grad_norm": 0.006320610176771879,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 5940
+    },
+    {
+      "epoch": 0.16392526652341716,
+      "grad_norm": 0.0025526133831590414,
+      "learning_rate": 0.001,
+      "loss": 0.3816,
+      "step": 5941
+    },
+    {
+      "epoch": 0.16395285872448154,
+      "grad_norm": 0.0021038020495325327,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 5942
+    },
+    {
+      "epoch": 0.1639804509255459,
+      "grad_norm": 0.0026413672603666782,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 5943
+    },
+    {
+      "epoch": 0.16400804312661027,
+      "grad_norm": 0.004636452533304691,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 5944
+    },
+    {
+      "epoch": 0.16403563532767462,
+      "grad_norm": 0.005048688966780901,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 5945
+    },
+    {
+      "epoch": 0.164063227528739,
+      "grad_norm": 0.0032856205943971872,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 5946
+    },
+    {
+      "epoch": 0.16409081972980338,
+      "grad_norm": 0.0027474837843328714,
+      "learning_rate": 0.001,
+      "loss": 0.4642,
+      "step": 5947
+    },
+    {
+      "epoch": 0.16411841193086774,
+      "grad_norm": 0.004227146506309509,
+      "learning_rate": 0.001,
+      "loss": 0.3778,
+      "step": 5948
+    },
+    {
+      "epoch": 0.16414600413193212,
+      "grad_norm": 0.00409206748008728,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 5949
+    },
+    {
+      "epoch": 0.16417359633299647,
+      "grad_norm": 0.004566535819321871,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 5950
+    },
+    {
+      "epoch": 0.16420118853406085,
+      "grad_norm": 0.0042169406078755856,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 5951
+    },
+    {
+      "epoch": 0.16422878073512523,
+      "grad_norm": 0.0025692598428577185,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 5952
+    },
+    {
+      "epoch": 0.16425637293618958,
+      "grad_norm": 0.0040415204130113125,
+      "learning_rate": 0.001,
+      "loss": 0.4316,
+      "step": 5953
+    },
+    {
+      "epoch": 0.16428396513725396,
+      "grad_norm": 0.002447271952405572,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 5954
+    },
+    {
+      "epoch": 0.16431155733831831,
+      "grad_norm": 0.00370286637917161,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 5955
+    },
+    {
+      "epoch": 0.1643391495393827,
+      "grad_norm": 0.0034686720464378595,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 5956
+    },
+    {
+      "epoch": 0.16436674174044708,
+      "grad_norm": 0.002705411519855261,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 5957
+    },
+    {
+      "epoch": 0.16439433394151143,
+      "grad_norm": 0.003815694246441126,
+      "learning_rate": 0.001,
+      "loss": 0.4295,
+      "step": 5958
+    },
+    {
+      "epoch": 0.1644219261425758,
+      "grad_norm": 0.0033353432081639767,
+      "learning_rate": 0.001,
+      "loss": 0.4033,
+      "step": 5959
+    },
+    {
+      "epoch": 0.16444951834364016,
+      "grad_norm": 0.003151560202240944,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 5960
+    },
+    {
+      "epoch": 0.16447711054470454,
+      "grad_norm": 0.0059262774884700775,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 5961
+    },
+    {
+      "epoch": 0.16450470274576892,
+      "grad_norm": 0.0027054711245000362,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 5962
+    },
+    {
+      "epoch": 0.16453229494683327,
+      "grad_norm": 0.005763984750956297,
+      "learning_rate": 0.001,
+      "loss": 0.3304,
+      "step": 5963
+    },
+    {
+      "epoch": 0.16455988714789765,
+      "grad_norm": 0.002855742583051324,
+      "learning_rate": 0.001,
+      "loss": 0.3459,
+      "step": 5964
+    },
+    {
+      "epoch": 0.164587479348962,
+      "grad_norm": 0.003007008694112301,
+      "learning_rate": 0.001,
+      "loss": 0.425,
+      "step": 5965
+    },
+    {
+      "epoch": 0.1646150715500264,
+      "grad_norm": 0.0030072838999330997,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 5966
+    },
+    {
+      "epoch": 0.16464266375109077,
+      "grad_norm": 0.005015561822801828,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 5967
+    },
+    {
+      "epoch": 0.16467025595215512,
+      "grad_norm": 0.0044816480949521065,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 5968
+    },
+    {
+      "epoch": 0.1646978481532195,
+      "grad_norm": 0.0035232999362051487,
+      "learning_rate": 0.001,
+      "loss": 0.3555,
+      "step": 5969
+    },
+    {
+      "epoch": 0.16472544035428385,
+      "grad_norm": 0.0043287379667162895,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 5970
+    },
+    {
+      "epoch": 0.16475303255534823,
+      "grad_norm": 0.00282704783603549,
+      "learning_rate": 0.001,
+      "loss": 0.4321,
+      "step": 5971
+    },
+    {
+      "epoch": 0.1647806247564126,
+      "grad_norm": 0.004027531016618013,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 5972
+    },
+    {
+      "epoch": 0.16480821695747697,
+      "grad_norm": 0.011107796803116798,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 5973
+    },
+    {
+      "epoch": 0.16483580915854135,
+      "grad_norm": 0.004610290750861168,
+      "learning_rate": 0.001,
+      "loss": 0.4326,
+      "step": 5974
+    },
+    {
+      "epoch": 0.1648634013596057,
+      "grad_norm": 0.002399605233222246,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 5975
+    },
+    {
+      "epoch": 0.16489099356067008,
+      "grad_norm": 0.004088590387254953,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 5976
+    },
+    {
+      "epoch": 0.16491858576173443,
+      "grad_norm": 0.0037160133942961693,
+      "learning_rate": 0.001,
+      "loss": 0.3613,
+      "step": 5977
+    },
+    {
+      "epoch": 0.1649461779627988,
+      "grad_norm": 0.004553014412522316,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 5978
+    },
+    {
+      "epoch": 0.1649737701638632,
+      "grad_norm": 0.0035007258411496878,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 5979
+    },
+    {
+      "epoch": 0.16500136236492755,
+      "grad_norm": 0.0026380368508398533,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 5980
+    },
+    {
+      "epoch": 0.16502895456599193,
+      "grad_norm": 0.0043465979397296906,
+      "learning_rate": 0.001,
+      "loss": 0.3681,
+      "step": 5981
+    },
+    {
+      "epoch": 0.16505654676705628,
+      "grad_norm": 0.00366704142652452,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 5982
+    },
+    {
+      "epoch": 0.16508413896812066,
+      "grad_norm": 0.002497212029993534,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 5983
+    },
+    {
+      "epoch": 0.16511173116918504,
+      "grad_norm": 0.007907644845545292,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 5984
+    },
+    {
+      "epoch": 0.1651393233702494,
+      "grad_norm": 0.0024078672286123037,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 5985
+    },
+    {
+      "epoch": 0.16516691557131377,
+      "grad_norm": 0.003503212472423911,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 5986
+    },
+    {
+      "epoch": 0.16519450777237812,
+      "grad_norm": 0.00796839315444231,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 5987
+    },
+    {
+      "epoch": 0.1652220999734425,
+      "grad_norm": 0.007227026857435703,
+      "learning_rate": 0.001,
+      "loss": 0.367,
+      "step": 5988
+    },
+    {
+      "epoch": 0.16524969217450688,
+      "grad_norm": 0.0025876981671899557,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 5989
+    },
+    {
+      "epoch": 0.16527728437557124,
+      "grad_norm": 0.0023945660796016455,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 5990
+    },
+    {
+      "epoch": 0.16530487657663562,
+      "grad_norm": 0.0032521584071218967,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 5991
+    },
+    {
+      "epoch": 0.16533246877769997,
+      "grad_norm": 0.0025216443464159966,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 5992
+    },
+    {
+      "epoch": 0.16536006097876435,
+      "grad_norm": 0.0021180277690291405,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 5993
+    },
+    {
+      "epoch": 0.16538765317982873,
+      "grad_norm": 0.002108354354277253,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 5994
+    },
+    {
+      "epoch": 0.16541524538089308,
+      "grad_norm": 0.0022766771726310253,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 5995
+    },
+    {
+      "epoch": 0.16544283758195746,
+      "grad_norm": 0.0022483475040644407,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 5996
+    },
+    {
+      "epoch": 0.16547042978302182,
+      "grad_norm": 0.006487622391432524,
+      "learning_rate": 0.001,
+      "loss": 0.4283,
+      "step": 5997
+    },
+    {
+      "epoch": 0.1654980219840862,
+      "grad_norm": 0.00356247927993536,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 5998
+    },
+    {
+      "epoch": 0.16552561418515058,
+      "grad_norm": 0.002978307893499732,
+      "learning_rate": 0.001,
+      "loss": 0.3778,
+      "step": 5999
+    },
+    {
+      "epoch": 0.16555320638621493,
+      "grad_norm": 0.007124268915504217,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 6000
+    },
+    {
+      "epoch": 0.16555320638621493,
+      "eval_runtime": 24.3281,
+      "eval_samples_per_second": 1.315,
+      "eval_steps_per_second": 0.164,
+      "step": 6000
+    },
+    {
+      "epoch": 0.1655807985872793,
+      "grad_norm": 0.0031695945654064417,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 6001
+    },
+    {
+      "epoch": 0.16560839078834366,
+      "grad_norm": 0.0024594555143266916,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 6002
+    },
+    {
+      "epoch": 0.16563598298940804,
+      "grad_norm": 0.002371816663071513,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 6003
+    },
+    {
+      "epoch": 0.16566357519047242,
+      "grad_norm": 0.0034170527942478657,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 6004
+    },
+    {
+      "epoch": 0.16569116739153678,
+      "grad_norm": 0.0031782693695276976,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 6005
+    },
+    {
+      "epoch": 0.16571875959260116,
+      "grad_norm": 0.0026943848934024572,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 6006
+    },
+    {
+      "epoch": 0.1657463517936655,
+      "grad_norm": 0.003218788420781493,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 6007
+    },
+    {
+      "epoch": 0.1657739439947299,
+      "grad_norm": 0.0070941089652478695,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 6008
+    },
+    {
+      "epoch": 0.16580153619579427,
+      "grad_norm": 0.0025819384027272463,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 6009
+    },
+    {
+      "epoch": 0.16582912839685862,
+      "grad_norm": 0.003531514434143901,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 6010
+    },
+    {
+      "epoch": 0.165856720597923,
+      "grad_norm": 0.006012933328747749,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 6011
+    },
+    {
+      "epoch": 0.16588431279898735,
+      "grad_norm": 0.0036130910739302635,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 6012
+    },
+    {
+      "epoch": 0.16591190500005173,
+      "grad_norm": 0.0030107556376606226,
+      "learning_rate": 0.001,
+      "loss": 0.3775,
+      "step": 6013
+    },
+    {
+      "epoch": 0.16593949720111612,
+      "grad_norm": 0.0032329028472304344,
+      "learning_rate": 0.001,
+      "loss": 0.3742,
+      "step": 6014
+    },
+    {
+      "epoch": 0.16596708940218047,
+      "grad_norm": 0.002732783555984497,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 6015
+    },
+    {
+      "epoch": 0.16599468160324485,
+      "grad_norm": 0.004081232473254204,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 6016
+    },
+    {
+      "epoch": 0.1660222738043092,
+      "grad_norm": 0.1196216568350792,
+      "learning_rate": 0.001,
+      "loss": 0.3624,
+      "step": 6017
+    },
+    {
+      "epoch": 0.16604986600537358,
+      "grad_norm": 0.005355612374842167,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 6018
+    },
+    {
+      "epoch": 0.16607745820643796,
+      "grad_norm": 0.003674236126244068,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 6019
+    },
+    {
+      "epoch": 0.1661050504075023,
+      "grad_norm": 0.002634497359395027,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 6020
+    },
+    {
+      "epoch": 0.1661326426085667,
+      "grad_norm": 0.0034477454610168934,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 6021
+    },
+    {
+      "epoch": 0.16616023480963105,
+      "grad_norm": 0.006275518331676722,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 6022
+    },
+    {
+      "epoch": 0.16618782701069543,
+      "grad_norm": 0.0067554921843111515,
+      "learning_rate": 0.001,
+      "loss": 0.3807,
+      "step": 6023
+    },
+    {
+      "epoch": 0.1662154192117598,
+      "grad_norm": 0.0043805669993162155,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 6024
+    },
+    {
+      "epoch": 0.16624301141282416,
+      "grad_norm": 0.0032864839304238558,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 6025
+    },
+    {
+      "epoch": 0.16627060361388854,
+      "grad_norm": 0.0033326733391731977,
+      "learning_rate": 0.001,
+      "loss": 0.3709,
+      "step": 6026
+    },
+    {
+      "epoch": 0.1662981958149529,
+      "grad_norm": 0.00480731762945652,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 6027
+    },
+    {
+      "epoch": 0.16632578801601727,
+      "grad_norm": 0.0029019401408731937,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 6028
+    },
+    {
+      "epoch": 0.16635338021708165,
+      "grad_norm": 0.002836265368387103,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 6029
+    },
+    {
+      "epoch": 0.166380972418146,
+      "grad_norm": 0.002911055227741599,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 6030
+    },
+    {
+      "epoch": 0.1664085646192104,
+      "grad_norm": 0.002560385735705495,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 6031
+    },
+    {
+      "epoch": 0.16643615682027474,
+      "grad_norm": 0.003303447039797902,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 6032
+    },
+    {
+      "epoch": 0.16646374902133912,
+      "grad_norm": 0.0026523631531745195,
+      "learning_rate": 0.001,
+      "loss": 0.3821,
+      "step": 6033
+    },
+    {
+      "epoch": 0.1664913412224035,
+      "grad_norm": 0.0037518537137657404,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 6034
+    },
+    {
+      "epoch": 0.16651893342346785,
+      "grad_norm": 0.0027246454264968634,
+      "learning_rate": 0.001,
+      "loss": 0.4288,
+      "step": 6035
+    },
+    {
+      "epoch": 0.16654652562453223,
+      "grad_norm": 0.012342111207544804,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 6036
+    },
+    {
+      "epoch": 0.16657411782559658,
+      "grad_norm": 0.005223266314715147,
+      "learning_rate": 0.001,
+      "loss": 0.4332,
+      "step": 6037
+    },
+    {
+      "epoch": 0.16660171002666097,
+      "grad_norm": 0.003073178231716156,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 6038
+    },
+    {
+      "epoch": 0.16662930222772535,
+      "grad_norm": 0.004849819000810385,
+      "learning_rate": 0.001,
+      "loss": 0.4385,
+      "step": 6039
+    },
+    {
+      "epoch": 0.1666568944287897,
+      "grad_norm": 0.002936877077445388,
+      "learning_rate": 0.001,
+      "loss": 0.3801,
+      "step": 6040
+    },
+    {
+      "epoch": 0.16668448662985408,
+      "grad_norm": 0.002828566124662757,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 6041
+    },
+    {
+      "epoch": 0.16671207883091843,
+      "grad_norm": 0.0030576810240745544,
+      "learning_rate": 0.001,
+      "loss": 0.4031,
+      "step": 6042
+    },
+    {
+      "epoch": 0.1667396710319828,
+      "grad_norm": 0.0025680421385914087,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 6043
+    },
+    {
+      "epoch": 0.1667672632330472,
+      "grad_norm": 0.004450993146747351,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 6044
+    },
+    {
+      "epoch": 0.16679485543411154,
+      "grad_norm": 0.0038969400338828564,
+      "learning_rate": 0.001,
+      "loss": 0.4425,
+      "step": 6045
+    },
+    {
+      "epoch": 0.16682244763517592,
+      "grad_norm": 0.003842557780444622,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 6046
+    },
+    {
+      "epoch": 0.16685003983624028,
+      "grad_norm": 0.002472433727234602,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 6047
+    },
+    {
+      "epoch": 0.16687763203730466,
+      "grad_norm": 0.00535456370562315,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 6048
+    },
+    {
+      "epoch": 0.16690522423836904,
+      "grad_norm": 0.002608702052384615,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 6049
+    },
+    {
+      "epoch": 0.1669328164394334,
+      "grad_norm": 0.0029743602499365807,
+      "learning_rate": 0.001,
+      "loss": 0.3424,
+      "step": 6050
+    },
+    {
+      "epoch": 0.16696040864049777,
+      "grad_norm": 0.0025020604953169823,
+      "learning_rate": 0.001,
+      "loss": 0.3778,
+      "step": 6051
+    },
+    {
+      "epoch": 0.16698800084156212,
+      "grad_norm": 0.0027822526171803474,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 6052
+    },
+    {
+      "epoch": 0.1670155930426265,
+      "grad_norm": 0.0028931559063494205,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 6053
+    },
+    {
+      "epoch": 0.16704318524369088,
+      "grad_norm": 0.0022724780719727278,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 6054
+    },
+    {
+      "epoch": 0.16707077744475524,
+      "grad_norm": 0.004654435440897942,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 6055
+    },
+    {
+      "epoch": 0.16709836964581962,
+      "grad_norm": 0.002734872279688716,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 6056
+    },
+    {
+      "epoch": 0.16712596184688397,
+      "grad_norm": 0.0025022958870977163,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 6057
+    },
+    {
+      "epoch": 0.16715355404794835,
+      "grad_norm": 0.004890537820756435,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 6058
+    },
+    {
+      "epoch": 0.16718114624901273,
+      "grad_norm": 0.0023612927179783583,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 6059
+    },
+    {
+      "epoch": 0.16720873845007708,
+      "grad_norm": 0.002075622323900461,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 6060
+    },
+    {
+      "epoch": 0.16723633065114146,
+      "grad_norm": 0.004314557649195194,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 6061
+    },
+    {
+      "epoch": 0.16726392285220582,
+      "grad_norm": 0.0022840569727122784,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 6062
+    },
+    {
+      "epoch": 0.1672915150532702,
+      "grad_norm": 0.0027527734637260437,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 6063
+    },
+    {
+      "epoch": 0.16731910725433458,
+      "grad_norm": 0.0024163657799363136,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 6064
+    },
+    {
+      "epoch": 0.16734669945539893,
+      "grad_norm": 0.0026521470863372087,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 6065
+    },
+    {
+      "epoch": 0.1673742916564633,
+      "grad_norm": 0.00407033134251833,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 6066
+    },
+    {
+      "epoch": 0.16740188385752766,
+      "grad_norm": 0.0028122677467763424,
+      "learning_rate": 0.001,
+      "loss": 0.3722,
+      "step": 6067
+    },
+    {
+      "epoch": 0.16742947605859204,
+      "grad_norm": 0.0021444158628582954,
+      "learning_rate": 0.001,
+      "loss": 0.432,
+      "step": 6068
+    },
+    {
+      "epoch": 0.16745706825965642,
+      "grad_norm": 0.004711539018899202,
+      "learning_rate": 0.001,
+      "loss": 0.3673,
+      "step": 6069
+    },
+    {
+      "epoch": 0.16748466046072077,
+      "grad_norm": 0.005842708982527256,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 6070
+    },
+    {
+      "epoch": 0.16751225266178515,
+      "grad_norm": 0.005441974848508835,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 6071
+    },
+    {
+      "epoch": 0.1675398448628495,
+      "grad_norm": 0.005875125993043184,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 6072
+    },
+    {
+      "epoch": 0.1675674370639139,
+      "grad_norm": 0.002280894201248884,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 6073
+    },
+    {
+      "epoch": 0.16759502926497824,
+      "grad_norm": 0.0026473281905055046,
+      "learning_rate": 0.001,
+      "loss": 0.371,
+      "step": 6074
+    },
+    {
+      "epoch": 0.16762262146604262,
+      "grad_norm": 0.006329023279249668,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 6075
+    },
+    {
+      "epoch": 0.167650213667107,
+      "grad_norm": 0.0029098165687173605,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 6076
+    },
+    {
+      "epoch": 0.16767780586817135,
+      "grad_norm": 0.005095341708511114,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 6077
+    },
+    {
+      "epoch": 0.16770539806923573,
+      "grad_norm": 0.002626903122290969,
+      "learning_rate": 0.001,
+      "loss": 0.3767,
+      "step": 6078
+    },
+    {
+      "epoch": 0.1677329902703001,
+      "grad_norm": 0.006261000409722328,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 6079
+    },
+    {
+      "epoch": 0.16776058247136447,
+      "grad_norm": 0.0027065426111221313,
+      "learning_rate": 0.001,
+      "loss": 0.3699,
+      "step": 6080
+    },
+    {
+      "epoch": 0.16778817467242885,
+      "grad_norm": 0.0033599373418837786,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 6081
+    },
+    {
+      "epoch": 0.1678157668734932,
+      "grad_norm": 0.0021663676016032696,
+      "learning_rate": 0.001,
+      "loss": 0.3641,
+      "step": 6082
+    },
+    {
+      "epoch": 0.16784335907455758,
+      "grad_norm": 0.003881096374243498,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 6083
+    },
+    {
+      "epoch": 0.16787095127562193,
+      "grad_norm": 0.0030875105876475573,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 6084
+    },
+    {
+      "epoch": 0.1678985434766863,
+      "grad_norm": 0.003538950812071562,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 6085
+    },
+    {
+      "epoch": 0.1679261356777507,
+      "grad_norm": 0.003060020739212632,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 6086
+    },
+    {
+      "epoch": 0.16795372787881505,
+      "grad_norm": 0.0032102216500788927,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 6087
+    },
+    {
+      "epoch": 0.16798132007987943,
+      "grad_norm": 0.004532152321189642,
+      "learning_rate": 0.001,
+      "loss": 0.3611,
+      "step": 6088
+    },
+    {
+      "epoch": 0.16800891228094378,
+      "grad_norm": 0.002951999893411994,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 6089
+    },
+    {
+      "epoch": 0.16803650448200816,
+      "grad_norm": 0.002537887077778578,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 6090
+    },
+    {
+      "epoch": 0.16806409668307254,
+      "grad_norm": 0.003645245684310794,
+      "learning_rate": 0.001,
+      "loss": 0.4323,
+      "step": 6091
+    },
+    {
+      "epoch": 0.1680916888841369,
+      "grad_norm": 0.0038180460687726736,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 6092
+    },
+    {
+      "epoch": 0.16811928108520127,
+      "grad_norm": 0.002475427696481347,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 6093
+    },
+    {
+      "epoch": 0.16814687328626562,
+      "grad_norm": 0.017842723056674004,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 6094
+    },
+    {
+      "epoch": 0.16817446548733,
+      "grad_norm": 0.0030736385378986597,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 6095
+    },
+    {
+      "epoch": 0.16820205768839439,
+      "grad_norm": 0.004124650731682777,
+      "learning_rate": 0.001,
+      "loss": 0.424,
+      "step": 6096
+    },
+    {
+      "epoch": 0.16822964988945874,
+      "grad_norm": 0.002600730862468481,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 6097
+    },
+    {
+      "epoch": 0.16825724209052312,
+      "grad_norm": 0.003001442411914468,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 6098
+    },
+    {
+      "epoch": 0.16828483429158747,
+      "grad_norm": 0.002522877650335431,
+      "learning_rate": 0.001,
+      "loss": 0.4041,
+      "step": 6099
+    },
+    {
+      "epoch": 0.16831242649265185,
+      "grad_norm": 0.0038972371257841587,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 6100
+    },
+    {
+      "epoch": 0.16834001869371623,
+      "grad_norm": 0.0026751782279461622,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 6101
+    },
+    {
+      "epoch": 0.16836761089478058,
+      "grad_norm": 0.003704581642523408,
+      "learning_rate": 0.001,
+      "loss": 0.3573,
+      "step": 6102
+    },
+    {
+      "epoch": 0.16839520309584496,
+      "grad_norm": 0.00222760159522295,
+      "learning_rate": 0.001,
+      "loss": 0.4439,
+      "step": 6103
+    },
+    {
+      "epoch": 0.16842279529690932,
+      "grad_norm": 0.0037216665223240852,
+      "learning_rate": 0.001,
+      "loss": 0.3748,
+      "step": 6104
+    },
+    {
+      "epoch": 0.1684503874979737,
+      "grad_norm": 0.0030143833719193935,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 6105
+    },
+    {
+      "epoch": 0.16847797969903808,
+      "grad_norm": 0.0032768959645181894,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 6106
+    },
+    {
+      "epoch": 0.16850557190010243,
+      "grad_norm": 0.0028334499802440405,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 6107
+    },
+    {
+      "epoch": 0.1685331641011668,
+      "grad_norm": 0.004226780030876398,
+      "learning_rate": 0.001,
+      "loss": 0.354,
+      "step": 6108
+    },
+    {
+      "epoch": 0.16856075630223116,
+      "grad_norm": 0.0034058301243931055,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 6109
+    },
+    {
+      "epoch": 0.16858834850329554,
+      "grad_norm": 0.003791308030486107,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 6110
+    },
+    {
+      "epoch": 0.16861594070435992,
+      "grad_norm": 0.0050668418407440186,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 6111
+    },
+    {
+      "epoch": 0.16864353290542428,
+      "grad_norm": 0.01103867869824171,
+      "learning_rate": 0.001,
+      "loss": 0.3519,
+      "step": 6112
+    },
+    {
+      "epoch": 0.16867112510648866,
+      "grad_norm": 0.003474986646324396,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 6113
+    },
+    {
+      "epoch": 0.168698717307553,
+      "grad_norm": 0.0034254579804837704,
+      "learning_rate": 0.001,
+      "loss": 0.4403,
+      "step": 6114
+    },
+    {
+      "epoch": 0.1687263095086174,
+      "grad_norm": 0.0021024683956056833,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 6115
+    },
+    {
+      "epoch": 0.16875390170968177,
+      "grad_norm": 0.0043007307685911655,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 6116
+    },
+    {
+      "epoch": 0.16878149391074612,
+      "grad_norm": 0.004201627802103758,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 6117
+    },
+    {
+      "epoch": 0.1688090861118105,
+      "grad_norm": 0.006456395611166954,
+      "learning_rate": 0.001,
+      "loss": 0.364,
+      "step": 6118
+    },
+    {
+      "epoch": 0.16883667831287485,
+      "grad_norm": 0.006011181510984898,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 6119
+    },
+    {
+      "epoch": 0.16886427051393924,
+      "grad_norm": 0.0036535318940877914,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 6120
+    },
+    {
+      "epoch": 0.16889186271500362,
+      "grad_norm": 0.0032456032931804657,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 6121
+    },
+    {
+      "epoch": 0.16891945491606797,
+      "grad_norm": 0.0025099399499595165,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 6122
+    },
+    {
+      "epoch": 0.16894704711713235,
+      "grad_norm": 0.003860413795337081,
+      "learning_rate": 0.001,
+      "loss": 0.3662,
+      "step": 6123
+    },
+    {
+      "epoch": 0.1689746393181967,
+      "grad_norm": 0.004212185274809599,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 6124
+    },
+    {
+      "epoch": 0.16900223151926108,
+      "grad_norm": 0.0037831286899745464,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 6125
+    },
+    {
+      "epoch": 0.16902982372032546,
+      "grad_norm": 0.002503293799236417,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 6126
+    },
+    {
+      "epoch": 0.16905741592138981,
+      "grad_norm": 0.0032346874941140413,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 6127
+    },
+    {
+      "epoch": 0.1690850081224542,
+      "grad_norm": 0.0029911173041909933,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 6128
+    },
+    {
+      "epoch": 0.16911260032351855,
+      "grad_norm": 0.003199785714969039,
+      "learning_rate": 0.001,
+      "loss": 0.3726,
+      "step": 6129
+    },
+    {
+      "epoch": 0.16914019252458293,
+      "grad_norm": 0.0029822427313774824,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 6130
+    },
+    {
+      "epoch": 0.1691677847256473,
+      "grad_norm": 0.0030586915090680122,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 6131
+    },
+    {
+      "epoch": 0.16919537692671166,
+      "grad_norm": 0.0046739643439650536,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 6132
+    },
+    {
+      "epoch": 0.16922296912777604,
+      "grad_norm": 0.0033701432403177023,
+      "learning_rate": 0.001,
+      "loss": 0.3778,
+      "step": 6133
+    },
+    {
+      "epoch": 0.1692505613288404,
+      "grad_norm": 0.003320742631331086,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 6134
+    },
+    {
+      "epoch": 0.16927815352990477,
+      "grad_norm": 0.005310896318405867,
+      "learning_rate": 0.001,
+      "loss": 0.4297,
+      "step": 6135
+    },
+    {
+      "epoch": 0.16930574573096915,
+      "grad_norm": 0.003215103643015027,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 6136
+    },
+    {
+      "epoch": 0.1693333379320335,
+      "grad_norm": 0.002435446949675679,
+      "learning_rate": 0.001,
+      "loss": 0.4226,
+      "step": 6137
+    },
+    {
+      "epoch": 0.1693609301330979,
+      "grad_norm": 0.009125195443630219,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 6138
+    },
+    {
+      "epoch": 0.16938852233416224,
+      "grad_norm": 0.002583020832389593,
+      "learning_rate": 0.001,
+      "loss": 0.4324,
+      "step": 6139
+    },
+    {
+      "epoch": 0.16941611453522662,
+      "grad_norm": 0.003045174991711974,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 6140
+    },
+    {
+      "epoch": 0.169443706736291,
+      "grad_norm": 0.0020076248329132795,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 6141
+    },
+    {
+      "epoch": 0.16947129893735535,
+      "grad_norm": 0.003063742769882083,
+      "learning_rate": 0.001,
+      "loss": 0.4329,
+      "step": 6142
+    },
+    {
+      "epoch": 0.16949889113841973,
+      "grad_norm": 0.00650634104385972,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 6143
+    },
+    {
+      "epoch": 0.16952648333948409,
+      "grad_norm": 0.0031983335502445698,
+      "learning_rate": 0.001,
+      "loss": 0.4299,
+      "step": 6144
+    },
+    {
+      "epoch": 0.16955407554054847,
+      "grad_norm": 0.0037359180860221386,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 6145
+    },
+    {
+      "epoch": 0.16958166774161285,
+      "grad_norm": 0.003823197679594159,
+      "learning_rate": 0.001,
+      "loss": 0.3726,
+      "step": 6146
+    },
+    {
+      "epoch": 0.1696092599426772,
+      "grad_norm": 0.0030885476153343916,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 6147
+    },
+    {
+      "epoch": 0.16963685214374158,
+      "grad_norm": 0.005116578657180071,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 6148
+    },
+    {
+      "epoch": 0.16966444434480593,
+      "grad_norm": 0.003294056048616767,
+      "learning_rate": 0.001,
+      "loss": 0.428,
+      "step": 6149
+    },
+    {
+      "epoch": 0.1696920365458703,
+      "grad_norm": 0.002693452872335911,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 6150
+    },
+    {
+      "epoch": 0.1697196287469347,
+      "grad_norm": 0.0032077941577881575,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 6151
+    },
+    {
+      "epoch": 0.16974722094799904,
+      "grad_norm": 0.002029215916991234,
+      "learning_rate": 0.001,
+      "loss": 0.4398,
+      "step": 6152
+    },
+    {
+      "epoch": 0.16977481314906342,
+      "grad_norm": 0.002842904767021537,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 6153
+    },
+    {
+      "epoch": 0.16980240535012778,
+      "grad_norm": 0.004416587762534618,
+      "learning_rate": 0.001,
+      "loss": 0.3518,
+      "step": 6154
+    },
+    {
+      "epoch": 0.16982999755119216,
+      "grad_norm": 0.0031072101555764675,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 6155
+    },
+    {
+      "epoch": 0.16985758975225654,
+      "grad_norm": 0.006052012555301189,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 6156
+    },
+    {
+      "epoch": 0.1698851819533209,
+      "grad_norm": 0.012233547866344452,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 6157
+    },
+    {
+      "epoch": 0.16991277415438527,
+      "grad_norm": 0.004168129060417414,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 6158
+    },
+    {
+      "epoch": 0.16994036635544962,
+      "grad_norm": 0.0026695001870393753,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 6159
+    },
+    {
+      "epoch": 0.169967958556514,
+      "grad_norm": 0.0034213713370263577,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 6160
+    },
+    {
+      "epoch": 0.16999555075757838,
+      "grad_norm": 0.004756170324981213,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 6161
+    },
+    {
+      "epoch": 0.17002314295864274,
+      "grad_norm": 0.0028778575360774994,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 6162
+    },
+    {
+      "epoch": 0.17005073515970712,
+      "grad_norm": 0.003623120253905654,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 6163
+    },
+    {
+      "epoch": 0.17007832736077147,
+      "grad_norm": 0.0027707270346581936,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 6164
+    },
+    {
+      "epoch": 0.17010591956183585,
+      "grad_norm": 0.0021031824871897697,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 6165
+    },
+    {
+      "epoch": 0.1701335117629002,
+      "grad_norm": 0.0030407633166760206,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 6166
+    },
+    {
+      "epoch": 0.17016110396396458,
+      "grad_norm": 0.004170509055256844,
+      "learning_rate": 0.001,
+      "loss": 0.3621,
+      "step": 6167
+    },
+    {
+      "epoch": 0.17018869616502896,
+      "grad_norm": 0.0025970737915486097,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 6168
+    },
+    {
+      "epoch": 0.17021628836609332,
+      "grad_norm": 0.003734288504347205,
+      "learning_rate": 0.001,
+      "loss": 0.4398,
+      "step": 6169
+    },
+    {
+      "epoch": 0.1702438805671577,
+      "grad_norm": 0.00284449546597898,
+      "learning_rate": 0.001,
+      "loss": 0.4262,
+      "step": 6170
+    },
+    {
+      "epoch": 0.17027147276822205,
+      "grad_norm": 0.0026834469754248857,
+      "learning_rate": 0.001,
+      "loss": 0.3816,
+      "step": 6171
+    },
+    {
+      "epoch": 0.17029906496928643,
+      "grad_norm": 0.002561383182182908,
+      "learning_rate": 0.001,
+      "loss": 0.4185,
+      "step": 6172
+    },
+    {
+      "epoch": 0.1703266571703508,
+      "grad_norm": 0.008724918588995934,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 6173
+    },
+    {
+      "epoch": 0.17035424937141516,
+      "grad_norm": 0.004458330571651459,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 6174
+    },
+    {
+      "epoch": 0.17038184157247954,
+      "grad_norm": 0.004813158418983221,
+      "learning_rate": 0.001,
+      "loss": 0.4391,
+      "step": 6175
+    },
+    {
+      "epoch": 0.1704094337735439,
+      "grad_norm": 0.002779042813926935,
+      "learning_rate": 0.001,
+      "loss": 0.3619,
+      "step": 6176
+    },
+    {
+      "epoch": 0.17043702597460827,
+      "grad_norm": 0.004016479942947626,
+      "learning_rate": 0.001,
+      "loss": 0.3834,
+      "step": 6177
+    },
+    {
+      "epoch": 0.17046461817567266,
+      "grad_norm": 0.0036288578994572163,
+      "learning_rate": 0.001,
+      "loss": 0.4317,
+      "step": 6178
+    },
+    {
+      "epoch": 0.170492210376737,
+      "grad_norm": 0.003530343994498253,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 6179
+    },
+    {
+      "epoch": 0.1705198025778014,
+      "grad_norm": 0.005636625923216343,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 6180
+    },
+    {
+      "epoch": 0.17054739477886574,
+      "grad_norm": 0.004850632511079311,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 6181
+    },
+    {
+      "epoch": 0.17057498697993012,
+      "grad_norm": 0.0022970670834183693,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 6182
+    },
+    {
+      "epoch": 0.1706025791809945,
+      "grad_norm": 0.0032087513245642185,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 6183
+    },
+    {
+      "epoch": 0.17063017138205885,
+      "grad_norm": 0.0027670126873999834,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 6184
+    },
+    {
+      "epoch": 0.17065776358312323,
+      "grad_norm": 0.003269513137638569,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 6185
+    },
+    {
+      "epoch": 0.1706853557841876,
+      "grad_norm": 0.0030671104323118925,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 6186
+    },
+    {
+      "epoch": 0.17071294798525197,
+      "grad_norm": 0.002466262085363269,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 6187
+    },
+    {
+      "epoch": 0.17074054018631635,
+      "grad_norm": 0.0033186296932399273,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 6188
+    },
+    {
+      "epoch": 0.1707681323873807,
+      "grad_norm": 0.004772027488797903,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 6189
+    },
+    {
+      "epoch": 0.17079572458844508,
+      "grad_norm": 0.003858543001115322,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 6190
+    },
+    {
+      "epoch": 0.17082331678950943,
+      "grad_norm": 0.003998824395239353,
+      "learning_rate": 0.001,
+      "loss": 0.3572,
+      "step": 6191
+    },
+    {
+      "epoch": 0.1708509089905738,
+      "grad_norm": 0.0029393411241471767,
+      "learning_rate": 0.001,
+      "loss": 0.3608,
+      "step": 6192
+    },
+    {
+      "epoch": 0.1708785011916382,
+      "grad_norm": 0.002925847191363573,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 6193
+    },
+    {
+      "epoch": 0.17090609339270255,
+      "grad_norm": 0.006174467504024506,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 6194
+    },
+    {
+      "epoch": 0.17093368559376693,
+      "grad_norm": 0.003490880597382784,
+      "learning_rate": 0.001,
+      "loss": 0.4366,
+      "step": 6195
+    },
+    {
+      "epoch": 0.17096127779483128,
+      "grad_norm": 0.00242701661773026,
+      "learning_rate": 0.001,
+      "loss": 0.3756,
+      "step": 6196
+    },
+    {
+      "epoch": 0.17098886999589566,
+      "grad_norm": 0.0028819721192121506,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 6197
+    },
+    {
+      "epoch": 0.17101646219696004,
+      "grad_norm": 0.0025861901231110096,
+      "learning_rate": 0.001,
+      "loss": 0.4214,
+      "step": 6198
+    },
+    {
+      "epoch": 0.1710440543980244,
+      "grad_norm": 0.0020953891798853874,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 6199
+    },
+    {
+      "epoch": 0.17107164659908877,
+      "grad_norm": 0.0024096069391816854,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 6200
+    },
+    {
+      "epoch": 0.17109923880015313,
+      "grad_norm": 0.0030896812677383423,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 6201
+    },
+    {
+      "epoch": 0.1711268310012175,
+      "grad_norm": 0.0025605044793337584,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 6202
+    },
+    {
+      "epoch": 0.17115442320228189,
+      "grad_norm": 0.0032270110677927732,
+      "learning_rate": 0.001,
+      "loss": 0.4318,
+      "step": 6203
+    },
+    {
+      "epoch": 0.17118201540334624,
+      "grad_norm": 0.002061746781691909,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 6204
+    },
+    {
+      "epoch": 0.17120960760441062,
+      "grad_norm": 0.002208675490692258,
+      "learning_rate": 0.001,
+      "loss": 0.4531,
+      "step": 6205
+    },
+    {
+      "epoch": 0.17123719980547497,
+      "grad_norm": 0.0019300546264275908,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 6206
+    },
+    {
+      "epoch": 0.17126479200653935,
+      "grad_norm": 0.002864664187654853,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 6207
+    },
+    {
+      "epoch": 0.17129238420760373,
+      "grad_norm": 0.003815412288531661,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 6208
+    },
+    {
+      "epoch": 0.17131997640866808,
+      "grad_norm": 0.0021196226589381695,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 6209
+    },
+    {
+      "epoch": 0.17134756860973246,
+      "grad_norm": 0.0040994128212332726,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 6210
+    },
+    {
+      "epoch": 0.17137516081079682,
+      "grad_norm": 0.0021242250222712755,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 6211
+    },
+    {
+      "epoch": 0.1714027530118612,
+      "grad_norm": 0.0023374794982373714,
+      "learning_rate": 0.001,
+      "loss": 0.4438,
+      "step": 6212
+    },
+    {
+      "epoch": 0.17143034521292558,
+      "grad_norm": 0.004651496186852455,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 6213
+    },
+    {
+      "epoch": 0.17145793741398993,
+      "grad_norm": 0.0039611500687897205,
+      "learning_rate": 0.001,
+      "loss": 0.3808,
+      "step": 6214
+    },
+    {
+      "epoch": 0.1714855296150543,
+      "grad_norm": 0.008256969973444939,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 6215
+    },
+    {
+      "epoch": 0.17151312181611866,
+      "grad_norm": 0.010161311365664005,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 6216
+    },
+    {
+      "epoch": 0.17154071401718304,
+      "grad_norm": 0.010829792357981205,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 6217
+    },
+    {
+      "epoch": 0.17156830621824742,
+      "grad_norm": 0.0036437201779335737,
+      "learning_rate": 0.001,
+      "loss": 0.3786,
+      "step": 6218
+    },
+    {
+      "epoch": 0.17159589841931178,
+      "grad_norm": 0.0040598101913928986,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 6219
+    },
+    {
+      "epoch": 0.17162349062037616,
+      "grad_norm": 0.0032655976247042418,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 6220
+    },
+    {
+      "epoch": 0.1716510828214405,
+      "grad_norm": 0.0032314627896994352,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 6221
+    },
+    {
+      "epoch": 0.1716786750225049,
+      "grad_norm": 0.0044107455760240555,
+      "learning_rate": 0.001,
+      "loss": 0.3608,
+      "step": 6222
+    },
+    {
+      "epoch": 0.17170626722356927,
+      "grad_norm": 0.0073016672395169735,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 6223
+    },
+    {
+      "epoch": 0.17173385942463362,
+      "grad_norm": 0.0031636636704206467,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 6224
+    },
+    {
+      "epoch": 0.171761451625698,
+      "grad_norm": 0.0030360522214323282,
+      "learning_rate": 0.001,
+      "loss": 0.3553,
+      "step": 6225
+    },
+    {
+      "epoch": 0.17178904382676236,
+      "grad_norm": 0.0032519621308892965,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 6226
+    },
+    {
+      "epoch": 0.17181663602782674,
+      "grad_norm": 0.002763562835752964,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 6227
+    },
+    {
+      "epoch": 0.17184422822889112,
+      "grad_norm": 0.002963263774290681,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 6228
+    },
+    {
+      "epoch": 0.17187182042995547,
+      "grad_norm": 0.0156992357224226,
+      "learning_rate": 0.001,
+      "loss": 0.4353,
+      "step": 6229
+    },
+    {
+      "epoch": 0.17189941263101985,
+      "grad_norm": 0.009238180704414845,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 6230
+    },
+    {
+      "epoch": 0.1719270048320842,
+      "grad_norm": 0.003196639008820057,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 6231
+    },
+    {
+      "epoch": 0.17195459703314858,
+      "grad_norm": 0.004371176473796368,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 6232
+    },
+    {
+      "epoch": 0.17198218923421296,
+      "grad_norm": 0.004302786663174629,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 6233
+    },
+    {
+      "epoch": 0.17200978143527731,
+      "grad_norm": 0.0027876824606209993,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 6234
+    },
+    {
+      "epoch": 0.1720373736363417,
+      "grad_norm": 0.004181624855846167,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 6235
+    },
+    {
+      "epoch": 0.17206496583740605,
+      "grad_norm": 0.0028202803805470467,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 6236
+    },
+    {
+      "epoch": 0.17209255803847043,
+      "grad_norm": 0.002387853804975748,
+      "learning_rate": 0.001,
+      "loss": 0.4437,
+      "step": 6237
+    },
+    {
+      "epoch": 0.1721201502395348,
+      "grad_norm": 0.0061591207049787045,
+      "learning_rate": 0.001,
+      "loss": 0.3678,
+      "step": 6238
+    },
+    {
+      "epoch": 0.17214774244059916,
+      "grad_norm": 0.002573385601863265,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 6239
+    },
+    {
+      "epoch": 0.17217533464166354,
+      "grad_norm": 0.0026830765418708324,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 6240
+    },
+    {
+      "epoch": 0.1722029268427279,
+      "grad_norm": 0.005540953949093819,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 6241
+    },
+    {
+      "epoch": 0.17223051904379227,
+      "grad_norm": 0.0054961638525128365,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 6242
+    },
+    {
+      "epoch": 0.17225811124485665,
+      "grad_norm": 0.007950017228722572,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 6243
+    },
+    {
+      "epoch": 0.172285703445921,
+      "grad_norm": 0.005001327488571405,
+      "learning_rate": 0.001,
+      "loss": 0.4595,
+      "step": 6244
+    },
+    {
+      "epoch": 0.1723132956469854,
+      "grad_norm": 0.011175988242030144,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 6245
+    },
+    {
+      "epoch": 0.17234088784804974,
+      "grad_norm": 0.004182237666100264,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 6246
+    },
+    {
+      "epoch": 0.17236848004911412,
+      "grad_norm": 0.004050822462886572,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 6247
+    },
+    {
+      "epoch": 0.1723960722501785,
+      "grad_norm": 0.0025094212032854557,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 6248
+    },
+    {
+      "epoch": 0.17242366445124285,
+      "grad_norm": 0.0029853296000510454,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 6249
+    },
+    {
+      "epoch": 0.17245125665230723,
+      "grad_norm": 0.0035753315314650536,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 6250
+    },
+    {
+      "epoch": 0.17247884885337159,
+      "grad_norm": 0.0028684705030173063,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 6251
+    },
+    {
+      "epoch": 0.17250644105443597,
+      "grad_norm": 0.0019036736339330673,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 6252
+    },
+    {
+      "epoch": 0.17253403325550035,
+      "grad_norm": 0.0022934586741030216,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 6253
+    },
+    {
+      "epoch": 0.1725616254565647,
+      "grad_norm": 0.0033066177275031805,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 6254
+    },
+    {
+      "epoch": 0.17258921765762908,
+      "grad_norm": 0.004688805900514126,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 6255
+    },
+    {
+      "epoch": 0.17261680985869343,
+      "grad_norm": 0.0033551587257534266,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 6256
+    },
+    {
+      "epoch": 0.1726444020597578,
+      "grad_norm": 0.002868784125894308,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 6257
+    },
+    {
+      "epoch": 0.17267199426082216,
+      "grad_norm": 0.0017792254220694304,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 6258
+    },
+    {
+      "epoch": 0.17269958646188655,
+      "grad_norm": 0.0052290805615484715,
+      "learning_rate": 0.001,
+      "loss": 0.374,
+      "step": 6259
+    },
+    {
+      "epoch": 0.17272717866295093,
+      "grad_norm": 0.003975987434387207,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 6260
+    },
+    {
+      "epoch": 0.17275477086401528,
+      "grad_norm": 0.005119245033711195,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 6261
+    },
+    {
+      "epoch": 0.17278236306507966,
+      "grad_norm": 0.009563916362822056,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 6262
+    },
+    {
+      "epoch": 0.172809955266144,
+      "grad_norm": 0.006325340364128351,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 6263
+    },
+    {
+      "epoch": 0.1728375474672084,
+      "grad_norm": 0.00598013773560524,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 6264
+    },
+    {
+      "epoch": 0.17286513966827277,
+      "grad_norm": 0.0035727466456592083,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 6265
+    },
+    {
+      "epoch": 0.17289273186933712,
+      "grad_norm": 0.009252941235899925,
+      "learning_rate": 0.001,
+      "loss": 0.4447,
+      "step": 6266
+    },
+    {
+      "epoch": 0.1729203240704015,
+      "grad_norm": 0.0041562700644135475,
+      "learning_rate": 0.001,
+      "loss": 0.4291,
+      "step": 6267
+    },
+    {
+      "epoch": 0.17294791627146586,
+      "grad_norm": 0.0030006521847099066,
+      "learning_rate": 0.001,
+      "loss": 0.3489,
+      "step": 6268
+    },
+    {
+      "epoch": 0.17297550847253024,
+      "grad_norm": 0.002229264471679926,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 6269
+    },
+    {
+      "epoch": 0.17300310067359462,
+      "grad_norm": 0.004136356525123119,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 6270
+    },
+    {
+      "epoch": 0.17303069287465897,
+      "grad_norm": 0.005907285492867231,
+      "learning_rate": 0.001,
+      "loss": 0.3709,
+      "step": 6271
+    },
+    {
+      "epoch": 0.17305828507572335,
+      "grad_norm": 0.003040108596906066,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 6272
+    },
+    {
+      "epoch": 0.1730858772767877,
+      "grad_norm": 0.0026024484541267157,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 6273
+    },
+    {
+      "epoch": 0.17311346947785208,
+      "grad_norm": 0.002952686743810773,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 6274
+    },
+    {
+      "epoch": 0.17314106167891646,
+      "grad_norm": 0.004544638562947512,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 6275
+    },
+    {
+      "epoch": 0.17316865387998082,
+      "grad_norm": 0.004679075442254543,
+      "learning_rate": 0.001,
+      "loss": 0.3623,
+      "step": 6276
+    },
+    {
+      "epoch": 0.1731962460810452,
+      "grad_norm": 0.002047475427389145,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 6277
+    },
+    {
+      "epoch": 0.17322383828210955,
+      "grad_norm": 0.005213032476603985,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 6278
+    },
+    {
+      "epoch": 0.17325143048317393,
+      "grad_norm": 0.00405911635607481,
+      "learning_rate": 0.001,
+      "loss": 0.4133,
+      "step": 6279
+    },
+    {
+      "epoch": 0.1732790226842383,
+      "grad_norm": 0.002308881375938654,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 6280
+    },
+    {
+      "epoch": 0.17330661488530266,
+      "grad_norm": 0.007890172302722931,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 6281
+    },
+    {
+      "epoch": 0.17333420708636704,
+      "grad_norm": 0.0034512749407440424,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 6282
+    },
+    {
+      "epoch": 0.1733617992874314,
+      "grad_norm": 0.007768754381686449,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 6283
+    },
+    {
+      "epoch": 0.17338939148849578,
+      "grad_norm": 0.004081700462847948,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 6284
+    },
+    {
+      "epoch": 0.17341698368956016,
+      "grad_norm": 0.003280284348875284,
+      "learning_rate": 0.001,
+      "loss": 0.417,
+      "step": 6285
+    },
+    {
+      "epoch": 0.1734445758906245,
+      "grad_norm": 0.0031469378154724836,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 6286
+    },
+    {
+      "epoch": 0.1734721680916889,
+      "grad_norm": 0.002462326781824231,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 6287
+    },
+    {
+      "epoch": 0.17349976029275324,
+      "grad_norm": 0.0022118764463812113,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 6288
+    },
+    {
+      "epoch": 0.17352735249381762,
+      "grad_norm": 0.0026676817797124386,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 6289
+    },
+    {
+      "epoch": 0.173554944694882,
+      "grad_norm": 0.002649841830134392,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 6290
+    },
+    {
+      "epoch": 0.17358253689594635,
+      "grad_norm": 0.002229656558483839,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 6291
+    },
+    {
+      "epoch": 0.17361012909701073,
+      "grad_norm": 0.0036838806699961424,
+      "learning_rate": 0.001,
+      "loss": 0.4267,
+      "step": 6292
+    },
+    {
+      "epoch": 0.1736377212980751,
+      "grad_norm": 0.015469814650714397,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 6293
+    },
+    {
+      "epoch": 0.17366531349913947,
+      "grad_norm": 0.002747813006862998,
+      "learning_rate": 0.001,
+      "loss": 0.4272,
+      "step": 6294
+    },
+    {
+      "epoch": 0.17369290570020385,
+      "grad_norm": 0.0029808697290718555,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 6295
+    },
+    {
+      "epoch": 0.1737204979012682,
+      "grad_norm": 0.0032657973933964968,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 6296
+    },
+    {
+      "epoch": 0.17374809010233258,
+      "grad_norm": 0.005452743265777826,
+      "learning_rate": 0.001,
+      "loss": 0.4293,
+      "step": 6297
+    },
+    {
+      "epoch": 0.17377568230339693,
+      "grad_norm": 0.00484059751033783,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 6298
+    },
+    {
+      "epoch": 0.1738032745044613,
+      "grad_norm": 0.003019709372892976,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 6299
+    },
+    {
+      "epoch": 0.1738308667055257,
+      "grad_norm": 0.003449161071330309,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 6300
+    },
+    {
+      "epoch": 0.17385845890659005,
+      "grad_norm": 0.004382924642413855,
+      "learning_rate": 0.001,
+      "loss": 0.3602,
+      "step": 6301
+    },
+    {
+      "epoch": 0.17388605110765443,
+      "grad_norm": 0.00298452191054821,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 6302
+    },
+    {
+      "epoch": 0.17391364330871878,
+      "grad_norm": 0.002635273849591613,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 6303
+    },
+    {
+      "epoch": 0.17394123550978316,
+      "grad_norm": 0.0019398960284888744,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 6304
+    },
+    {
+      "epoch": 0.17396882771084754,
+      "grad_norm": 0.002571891061961651,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 6305
+    },
+    {
+      "epoch": 0.1739964199119119,
+      "grad_norm": 0.004911772906780243,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 6306
+    },
+    {
+      "epoch": 0.17402401211297627,
+      "grad_norm": 0.004217732232064009,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 6307
+    },
+    {
+      "epoch": 0.17405160431404063,
+      "grad_norm": 0.002490875544026494,
+      "learning_rate": 0.001,
+      "loss": 0.4339,
+      "step": 6308
+    },
+    {
+      "epoch": 0.174079196515105,
+      "grad_norm": 0.002398621989414096,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 6309
+    },
+    {
+      "epoch": 0.17410678871616939,
+      "grad_norm": 0.0025024251081049442,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 6310
+    },
+    {
+      "epoch": 0.17413438091723374,
+      "grad_norm": 0.003449508221819997,
+      "learning_rate": 0.001,
+      "loss": 0.3632,
+      "step": 6311
+    },
+    {
+      "epoch": 0.17416197311829812,
+      "grad_norm": 0.003372189588844776,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 6312
+    },
+    {
+      "epoch": 0.17418956531936247,
+      "grad_norm": 0.004607321694493294,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 6313
+    },
+    {
+      "epoch": 0.17421715752042685,
+      "grad_norm": 0.00431500980630517,
+      "learning_rate": 0.001,
+      "loss": 0.419,
+      "step": 6314
+    },
+    {
+      "epoch": 0.17424474972149123,
+      "grad_norm": 0.0045218681916594505,
+      "learning_rate": 0.001,
+      "loss": 0.339,
+      "step": 6315
+    },
+    {
+      "epoch": 0.17427234192255558,
+      "grad_norm": 0.005656406749039888,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 6316
+    },
+    {
+      "epoch": 0.17429993412361997,
+      "grad_norm": 0.0031431603711098433,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 6317
+    },
+    {
+      "epoch": 0.17432752632468432,
+      "grad_norm": 0.0022704950533807278,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 6318
+    },
+    {
+      "epoch": 0.1743551185257487,
+      "grad_norm": 0.003445026697590947,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 6319
+    },
+    {
+      "epoch": 0.17438271072681308,
+      "grad_norm": 0.002719600684940815,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 6320
+    },
+    {
+      "epoch": 0.17441030292787743,
+      "grad_norm": 0.0020257963333278894,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 6321
+    },
+    {
+      "epoch": 0.1744378951289418,
+      "grad_norm": 0.0022240818943828344,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 6322
+    },
+    {
+      "epoch": 0.17446548733000616,
+      "grad_norm": 0.002934374613687396,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 6323
+    },
+    {
+      "epoch": 0.17449307953107054,
+      "grad_norm": 0.006465549115091562,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 6324
+    },
+    {
+      "epoch": 0.17452067173213492,
+      "grad_norm": 0.005728098098188639,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 6325
+    },
+    {
+      "epoch": 0.17454826393319928,
+      "grad_norm": 0.002171339699998498,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 6326
+    },
+    {
+      "epoch": 0.17457585613426366,
+      "grad_norm": 0.0026950135361403227,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 6327
+    },
+    {
+      "epoch": 0.174603448335328,
+      "grad_norm": 0.006035445258021355,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 6328
+    },
+    {
+      "epoch": 0.1746310405363924,
+      "grad_norm": 0.0035918874200433493,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 6329
+    },
+    {
+      "epoch": 0.17465863273745677,
+      "grad_norm": 0.002868028124794364,
+      "learning_rate": 0.001,
+      "loss": 0.4279,
+      "step": 6330
+    },
+    {
+      "epoch": 0.17468622493852112,
+      "grad_norm": 0.00462915375828743,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 6331
+    },
+    {
+      "epoch": 0.1747138171395855,
+      "grad_norm": 0.0024865546729415655,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 6332
+    },
+    {
+      "epoch": 0.17474140934064986,
+      "grad_norm": 0.002470286563038826,
+      "learning_rate": 0.001,
+      "loss": 0.4129,
+      "step": 6333
+    },
+    {
+      "epoch": 0.17476900154171424,
+      "grad_norm": 0.003509331261739135,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 6334
+    },
+    {
+      "epoch": 0.17479659374277862,
+      "grad_norm": 0.0032379806507378817,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 6335
+    },
+    {
+      "epoch": 0.17482418594384297,
+      "grad_norm": 0.003039590548723936,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 6336
+    },
+    {
+      "epoch": 0.17485177814490735,
+      "grad_norm": 0.0033747265115380287,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 6337
+    },
+    {
+      "epoch": 0.1748793703459717,
+      "grad_norm": 0.005611809901893139,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 6338
+    },
+    {
+      "epoch": 0.17490696254703608,
+      "grad_norm": 0.002599924337118864,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 6339
+    },
+    {
+      "epoch": 0.17493455474810046,
+      "grad_norm": 0.0026452033780515194,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 6340
+    },
+    {
+      "epoch": 0.17496214694916482,
+      "grad_norm": 0.003480355953797698,
+      "learning_rate": 0.001,
+      "loss": 0.3805,
+      "step": 6341
+    },
+    {
+      "epoch": 0.1749897391502292,
+      "grad_norm": 0.0024892655201256275,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 6342
+    },
+    {
+      "epoch": 0.17501733135129355,
+      "grad_norm": 0.0024968746583908796,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 6343
+    },
+    {
+      "epoch": 0.17504492355235793,
+      "grad_norm": 0.00571107491850853,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 6344
+    },
+    {
+      "epoch": 0.1750725157534223,
+      "grad_norm": 0.0021744274999946356,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 6345
+    },
+    {
+      "epoch": 0.17510010795448666,
+      "grad_norm": 0.004079717211425304,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 6346
+    },
+    {
+      "epoch": 0.17512770015555104,
+      "grad_norm": 0.0027901085559278727,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 6347
+    },
+    {
+      "epoch": 0.1751552923566154,
+      "grad_norm": 0.0030881750863045454,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 6348
+    },
+    {
+      "epoch": 0.17518288455767977,
+      "grad_norm": 0.002648904686793685,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 6349
+    },
+    {
+      "epoch": 0.17521047675874415,
+      "grad_norm": 0.004519915673881769,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 6350
+    },
+    {
+      "epoch": 0.1752380689598085,
+      "grad_norm": 0.004600946791470051,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 6351
+    },
+    {
+      "epoch": 0.1752656611608729,
+      "grad_norm": 0.003022734308615327,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 6352
+    },
+    {
+      "epoch": 0.17529325336193724,
+      "grad_norm": 0.004520618822425604,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 6353
+    },
+    {
+      "epoch": 0.17532084556300162,
+      "grad_norm": 0.0052781859412789345,
+      "learning_rate": 0.001,
+      "loss": 0.3592,
+      "step": 6354
+    },
+    {
+      "epoch": 0.17534843776406597,
+      "grad_norm": 0.0028431678656488657,
+      "learning_rate": 0.001,
+      "loss": 0.4286,
+      "step": 6355
+    },
+    {
+      "epoch": 0.17537602996513035,
+      "grad_norm": 0.004005215130746365,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 6356
+    },
+    {
+      "epoch": 0.17540362216619473,
+      "grad_norm": 0.0038050967268645763,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 6357
+    },
+    {
+      "epoch": 0.1754312143672591,
+      "grad_norm": 0.01202372182160616,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 6358
+    },
+    {
+      "epoch": 0.17545880656832347,
+      "grad_norm": 0.0020817206241190434,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 6359
+    },
+    {
+      "epoch": 0.17548639876938782,
+      "grad_norm": 0.0022702061105519533,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 6360
+    },
+    {
+      "epoch": 0.1755139909704522,
+      "grad_norm": 0.002957484917715192,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 6361
+    },
+    {
+      "epoch": 0.17554158317151658,
+      "grad_norm": 0.002842365065589547,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 6362
+    },
+    {
+      "epoch": 0.17556917537258093,
+      "grad_norm": 0.002651175484061241,
+      "learning_rate": 0.001,
+      "loss": 0.448,
+      "step": 6363
+    },
+    {
+      "epoch": 0.1755967675736453,
+      "grad_norm": 0.0024952066596597433,
+      "learning_rate": 0.001,
+      "loss": 0.4306,
+      "step": 6364
+    },
+    {
+      "epoch": 0.17562435977470967,
+      "grad_norm": 0.0025687795132398605,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 6365
+    },
+    {
+      "epoch": 0.17565195197577405,
+      "grad_norm": 0.002511841943487525,
+      "learning_rate": 0.001,
+      "loss": 0.4229,
+      "step": 6366
+    },
+    {
+      "epoch": 0.17567954417683843,
+      "grad_norm": 0.00360535248182714,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 6367
+    },
+    {
+      "epoch": 0.17570713637790278,
+      "grad_norm": 0.002814951818436384,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 6368
+    },
+    {
+      "epoch": 0.17573472857896716,
+      "grad_norm": 0.003035743487998843,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 6369
+    },
+    {
+      "epoch": 0.1757623207800315,
+      "grad_norm": 0.0033583652693778276,
+      "learning_rate": 0.001,
+      "loss": 0.3715,
+      "step": 6370
+    },
+    {
+      "epoch": 0.1757899129810959,
+      "grad_norm": 0.0023555420339107513,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 6371
+    },
+    {
+      "epoch": 0.17581750518216027,
+      "grad_norm": 0.0036680963821709156,
+      "learning_rate": 0.001,
+      "loss": 0.4161,
+      "step": 6372
+    },
+    {
+      "epoch": 0.17584509738322462,
+      "grad_norm": 0.00232534552924335,
+      "learning_rate": 0.001,
+      "loss": 0.422,
+      "step": 6373
+    },
+    {
+      "epoch": 0.175872689584289,
+      "grad_norm": 0.0050485520623624325,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 6374
+    },
+    {
+      "epoch": 0.17590028178535336,
+      "grad_norm": 0.004004286136478186,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 6375
+    },
+    {
+      "epoch": 0.17592787398641774,
+      "grad_norm": 0.003267053049057722,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 6376
+    },
+    {
+      "epoch": 0.17595546618748212,
+      "grad_norm": 0.002155203837901354,
+      "learning_rate": 0.001,
+      "loss": 0.4274,
+      "step": 6377
+    },
+    {
+      "epoch": 0.17598305838854647,
+      "grad_norm": 0.030459165573120117,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 6378
+    },
+    {
+      "epoch": 0.17601065058961085,
+      "grad_norm": 0.007769315037876368,
+      "learning_rate": 0.001,
+      "loss": 0.3776,
+      "step": 6379
+    },
+    {
+      "epoch": 0.1760382427906752,
+      "grad_norm": 0.00399011978879571,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 6380
+    },
+    {
+      "epoch": 0.17606583499173958,
+      "grad_norm": 0.003226799890398979,
+      "learning_rate": 0.001,
+      "loss": 0.4185,
+      "step": 6381
+    },
+    {
+      "epoch": 0.17609342719280396,
+      "grad_norm": 0.005244703497737646,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 6382
+    },
+    {
+      "epoch": 0.17612101939386832,
+      "grad_norm": 0.00664431881159544,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 6383
+    },
+    {
+      "epoch": 0.1761486115949327,
+      "grad_norm": 0.003033186076208949,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 6384
+    },
+    {
+      "epoch": 0.17617620379599705,
+      "grad_norm": 0.0022742394357919693,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 6385
+    },
+    {
+      "epoch": 0.17620379599706143,
+      "grad_norm": 0.0029898136854171753,
+      "learning_rate": 0.001,
+      "loss": 0.4422,
+      "step": 6386
+    },
+    {
+      "epoch": 0.1762313881981258,
+      "grad_norm": 0.0022048363462090492,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 6387
+    },
+    {
+      "epoch": 0.17625898039919016,
+      "grad_norm": 0.0021618627943098545,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 6388
+    },
+    {
+      "epoch": 0.17628657260025454,
+      "grad_norm": 0.0029654805548489094,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 6389
+    },
+    {
+      "epoch": 0.1763141648013189,
+      "grad_norm": 0.0045374371111392975,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 6390
+    },
+    {
+      "epoch": 0.17634175700238328,
+      "grad_norm": 0.00429152837023139,
+      "learning_rate": 0.001,
+      "loss": 0.3708,
+      "step": 6391
+    },
+    {
+      "epoch": 0.17636934920344766,
+      "grad_norm": 0.004207782447338104,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 6392
+    },
+    {
+      "epoch": 0.176396941404512,
+      "grad_norm": 0.0025363874156028032,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 6393
+    },
+    {
+      "epoch": 0.1764245336055764,
+      "grad_norm": 0.004385404288768768,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 6394
+    },
+    {
+      "epoch": 0.17645212580664074,
+      "grad_norm": 0.0033825349528342485,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 6395
+    },
+    {
+      "epoch": 0.17647971800770512,
+      "grad_norm": 0.002470629056915641,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 6396
+    },
+    {
+      "epoch": 0.1765073102087695,
+      "grad_norm": 0.004038370680063963,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 6397
+    },
+    {
+      "epoch": 0.17653490240983385,
+      "grad_norm": 0.002262383932247758,
+      "learning_rate": 0.001,
+      "loss": 0.44,
+      "step": 6398
+    },
+    {
+      "epoch": 0.17656249461089824,
+      "grad_norm": 0.0023851392325013876,
+      "learning_rate": 0.001,
+      "loss": 0.3765,
+      "step": 6399
+    },
+    {
+      "epoch": 0.1765900868119626,
+      "grad_norm": 0.0039879628457129,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 6400
+    },
+    {
+      "epoch": 0.17661767901302697,
+      "grad_norm": 0.0036031249910593033,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 6401
+    },
+    {
+      "epoch": 0.17664527121409135,
+      "grad_norm": 0.0038508297875523567,
+      "learning_rate": 0.001,
+      "loss": 0.3642,
+      "step": 6402
+    },
+    {
+      "epoch": 0.1766728634151557,
+      "grad_norm": 0.0030231124255806208,
+      "learning_rate": 0.001,
+      "loss": 0.439,
+      "step": 6403
+    },
+    {
+      "epoch": 0.17670045561622008,
+      "grad_norm": 0.003226878121495247,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 6404
+    },
+    {
+      "epoch": 0.17672804781728443,
+      "grad_norm": 0.003266223007813096,
+      "learning_rate": 0.001,
+      "loss": 0.3595,
+      "step": 6405
+    },
+    {
+      "epoch": 0.17675564001834881,
+      "grad_norm": 0.003785705426707864,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 6406
+    },
+    {
+      "epoch": 0.1767832322194132,
+      "grad_norm": 0.0025163183454424143,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 6407
+    },
+    {
+      "epoch": 0.17681082442047755,
+      "grad_norm": 0.002644954714924097,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 6408
+    },
+    {
+      "epoch": 0.17683841662154193,
+      "grad_norm": 0.003993362188339233,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 6409
+    },
+    {
+      "epoch": 0.17686600882260628,
+      "grad_norm": 0.0027130364906042814,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 6410
+    },
+    {
+      "epoch": 0.17689360102367066,
+      "grad_norm": 0.005328681785613298,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 6411
+    },
+    {
+      "epoch": 0.17692119322473504,
+      "grad_norm": 0.0027323930989950895,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 6412
+    },
+    {
+      "epoch": 0.1769487854257994,
+      "grad_norm": 0.002667306223884225,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 6413
+    },
+    {
+      "epoch": 0.17697637762686377,
+      "grad_norm": 0.0038074366748332977,
+      "learning_rate": 0.001,
+      "loss": 0.4259,
+      "step": 6414
+    },
+    {
+      "epoch": 0.17700396982792813,
+      "grad_norm": 0.002215777989476919,
+      "learning_rate": 0.001,
+      "loss": 0.4414,
+      "step": 6415
+    },
+    {
+      "epoch": 0.1770315620289925,
+      "grad_norm": 0.002434387104585767,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 6416
+    },
+    {
+      "epoch": 0.1770591542300569,
+      "grad_norm": 0.0024976793210953474,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 6417
+    },
+    {
+      "epoch": 0.17708674643112124,
+      "grad_norm": 0.0029137188103049994,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 6418
+    },
+    {
+      "epoch": 0.17711433863218562,
+      "grad_norm": 0.002975389827042818,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 6419
+    },
+    {
+      "epoch": 0.17714193083324997,
+      "grad_norm": 0.002931037684902549,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 6420
+    },
+    {
+      "epoch": 0.17716952303431435,
+      "grad_norm": 0.004573407117277384,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 6421
+    },
+    {
+      "epoch": 0.17719711523537873,
+      "grad_norm": 0.003066561883315444,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 6422
+    },
+    {
+      "epoch": 0.17722470743644309,
+      "grad_norm": 0.003729255637153983,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 6423
+    },
+    {
+      "epoch": 0.17725229963750747,
+      "grad_norm": 0.0029258355498313904,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 6424
+    },
+    {
+      "epoch": 0.17727989183857182,
+      "grad_norm": 0.0034123340155929327,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 6425
+    },
+    {
+      "epoch": 0.1773074840396362,
+      "grad_norm": 0.0038247809279710054,
+      "learning_rate": 0.001,
+      "loss": 0.4374,
+      "step": 6426
+    },
+    {
+      "epoch": 0.17733507624070058,
+      "grad_norm": 0.003323949873447418,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 6427
+    },
+    {
+      "epoch": 0.17736266844176493,
+      "grad_norm": 0.01532609574496746,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 6428
+    },
+    {
+      "epoch": 0.1773902606428293,
+      "grad_norm": 0.0025842315517365932,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 6429
+    },
+    {
+      "epoch": 0.17741785284389366,
+      "grad_norm": 0.003932729829102755,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 6430
+    },
+    {
+      "epoch": 0.17744544504495804,
+      "grad_norm": 0.00350461108610034,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 6431
+    },
+    {
+      "epoch": 0.17747303724602242,
+      "grad_norm": 0.0030087281484156847,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 6432
+    },
+    {
+      "epoch": 0.17750062944708678,
+      "grad_norm": 0.006132917013019323,
+      "learning_rate": 0.001,
+      "loss": 0.354,
+      "step": 6433
+    },
+    {
+      "epoch": 0.17752822164815116,
+      "grad_norm": 0.0028762738220393658,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 6434
+    },
+    {
+      "epoch": 0.1775558138492155,
+      "grad_norm": 0.0023419486824423075,
+      "learning_rate": 0.001,
+      "loss": 0.376,
+      "step": 6435
+    },
+    {
+      "epoch": 0.1775834060502799,
+      "grad_norm": 0.0026978617534041405,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 6436
+    },
+    {
+      "epoch": 0.17761099825134427,
+      "grad_norm": 0.003220965852960944,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 6437
+    },
+    {
+      "epoch": 0.17763859045240862,
+      "grad_norm": 0.013487125746905804,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 6438
+    },
+    {
+      "epoch": 0.177666182653473,
+      "grad_norm": 0.0024956711567938328,
+      "learning_rate": 0.001,
+      "loss": 0.3742,
+      "step": 6439
+    },
+    {
+      "epoch": 0.17769377485453736,
+      "grad_norm": 0.002173262881115079,
+      "learning_rate": 0.001,
+      "loss": 0.4185,
+      "step": 6440
+    },
+    {
+      "epoch": 0.17772136705560174,
+      "grad_norm": 0.0020064099226146936,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 6441
+    },
+    {
+      "epoch": 0.17774895925666612,
+      "grad_norm": 0.002172301523387432,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 6442
+    },
+    {
+      "epoch": 0.17777655145773047,
+      "grad_norm": 0.0030487473122775555,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 6443
+    },
+    {
+      "epoch": 0.17780414365879485,
+      "grad_norm": 0.0025476363953202963,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 6444
+    },
+    {
+      "epoch": 0.1778317358598592,
+      "grad_norm": 0.002622386207804084,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 6445
+    },
+    {
+      "epoch": 0.17785932806092358,
+      "grad_norm": 0.00240062247030437,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 6446
+    },
+    {
+      "epoch": 0.17788692026198794,
+      "grad_norm": 0.0036130244843661785,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 6447
+    },
+    {
+      "epoch": 0.17791451246305232,
+      "grad_norm": 0.002018724335357547,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 6448
+    },
+    {
+      "epoch": 0.1779421046641167,
+      "grad_norm": 0.0024812505580484867,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 6449
+    },
+    {
+      "epoch": 0.17796969686518105,
+      "grad_norm": 0.002273386111482978,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 6450
+    },
+    {
+      "epoch": 0.17799728906624543,
+      "grad_norm": 0.0027766439598053694,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 6451
+    },
+    {
+      "epoch": 0.17802488126730978,
+      "grad_norm": 0.003901482792571187,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 6452
+    },
+    {
+      "epoch": 0.17805247346837416,
+      "grad_norm": 0.003208085196092725,
+      "learning_rate": 0.001,
+      "loss": 0.4277,
+      "step": 6453
+    },
+    {
+      "epoch": 0.17808006566943854,
+      "grad_norm": 0.0029840737115591764,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 6454
+    },
+    {
+      "epoch": 0.1781076578705029,
+      "grad_norm": 0.00228834873996675,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 6455
+    },
+    {
+      "epoch": 0.17813525007156727,
+      "grad_norm": 0.0035069435834884644,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 6456
+    },
+    {
+      "epoch": 0.17816284227263163,
+      "grad_norm": 0.0046169045381248,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 6457
+    },
+    {
+      "epoch": 0.178190434473696,
+      "grad_norm": 0.002904290799051523,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 6458
+    },
+    {
+      "epoch": 0.1782180266747604,
+      "grad_norm": 0.004634745419025421,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 6459
+    },
+    {
+      "epoch": 0.17824561887582474,
+      "grad_norm": 0.004808775614947081,
+      "learning_rate": 0.001,
+      "loss": 0.3519,
+      "step": 6460
+    },
+    {
+      "epoch": 0.17827321107688912,
+      "grad_norm": 0.005992877297103405,
+      "learning_rate": 0.001,
+      "loss": 0.3487,
+      "step": 6461
+    },
+    {
+      "epoch": 0.17830080327795347,
+      "grad_norm": 0.003151120152324438,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 6462
+    },
+    {
+      "epoch": 0.17832839547901785,
+      "grad_norm": 0.0020630250219255686,
+      "learning_rate": 0.001,
+      "loss": 0.4114,
+      "step": 6463
+    },
+    {
+      "epoch": 0.17835598768008223,
+      "grad_norm": 0.0031087875831872225,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 6464
+    },
+    {
+      "epoch": 0.1783835798811466,
+      "grad_norm": 0.001903343596495688,
+      "learning_rate": 0.001,
+      "loss": 0.4283,
+      "step": 6465
+    },
+    {
+      "epoch": 0.17841117208221097,
+      "grad_norm": 0.0029442012310028076,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 6466
+    },
+    {
+      "epoch": 0.17843876428327532,
+      "grad_norm": 0.003048170590773225,
+      "learning_rate": 0.001,
+      "loss": 0.3653,
+      "step": 6467
+    },
+    {
+      "epoch": 0.1784663564843397,
+      "grad_norm": 0.002289197174832225,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 6468
+    },
+    {
+      "epoch": 0.17849394868540408,
+      "grad_norm": 0.001974297221750021,
+      "learning_rate": 0.001,
+      "loss": 0.4418,
+      "step": 6469
+    },
+    {
+      "epoch": 0.17852154088646843,
+      "grad_norm": 0.0037421276792883873,
+      "learning_rate": 0.001,
+      "loss": 0.3631,
+      "step": 6470
+    },
+    {
+      "epoch": 0.1785491330875328,
+      "grad_norm": 0.0027026592288166285,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 6471
+    },
+    {
+      "epoch": 0.17857672528859717,
+      "grad_norm": 0.0024960192386060953,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 6472
+    },
+    {
+      "epoch": 0.17860431748966155,
+      "grad_norm": 0.0029041236266493797,
+      "learning_rate": 0.001,
+      "loss": 0.3639,
+      "step": 6473
+    },
+    {
+      "epoch": 0.17863190969072593,
+      "grad_norm": 0.003906071186065674,
+      "learning_rate": 0.001,
+      "loss": 0.4274,
+      "step": 6474
+    },
+    {
+      "epoch": 0.17865950189179028,
+      "grad_norm": 0.002730944426730275,
+      "learning_rate": 0.001,
+      "loss": 0.3775,
+      "step": 6475
+    },
+    {
+      "epoch": 0.17868709409285466,
+      "grad_norm": 0.002463391749188304,
+      "learning_rate": 0.001,
+      "loss": 0.4227,
+      "step": 6476
+    },
+    {
+      "epoch": 0.178714686293919,
+      "grad_norm": 0.003006384475156665,
+      "learning_rate": 0.001,
+      "loss": 0.3643,
+      "step": 6477
+    },
+    {
+      "epoch": 0.1787422784949834,
+      "grad_norm": 0.002436541486531496,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 6478
+    },
+    {
+      "epoch": 0.17876987069604777,
+      "grad_norm": 0.007043438032269478,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 6479
+    },
+    {
+      "epoch": 0.17879746289711212,
+      "grad_norm": 0.0045332033187150955,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 6480
+    },
+    {
+      "epoch": 0.1788250550981765,
+      "grad_norm": 0.006813266780227423,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 6481
+    },
+    {
+      "epoch": 0.17885264729924086,
+      "grad_norm": 0.003106794785708189,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 6482
+    },
+    {
+      "epoch": 0.17888023950030524,
+      "grad_norm": 0.003665417432785034,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 6483
+    },
+    {
+      "epoch": 0.17890783170136962,
+      "grad_norm": 0.002157231792807579,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 6484
+    },
+    {
+      "epoch": 0.17893542390243397,
+      "grad_norm": 0.0032816650345921516,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 6485
+    },
+    {
+      "epoch": 0.17896301610349835,
+      "grad_norm": 0.003755836049094796,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 6486
+    },
+    {
+      "epoch": 0.1789906083045627,
+      "grad_norm": 0.0028953254222869873,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 6487
+    },
+    {
+      "epoch": 0.17901820050562708,
+      "grad_norm": 0.004839729517698288,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 6488
+    },
+    {
+      "epoch": 0.17904579270669146,
+      "grad_norm": 0.003079955466091633,
+      "learning_rate": 0.001,
+      "loss": 0.4396,
+      "step": 6489
+    },
+    {
+      "epoch": 0.17907338490775582,
+      "grad_norm": 0.0024841863196343184,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 6490
+    },
+    {
+      "epoch": 0.1791009771088202,
+      "grad_norm": 0.0028867730870842934,
+      "learning_rate": 0.001,
+      "loss": 0.3779,
+      "step": 6491
+    },
+    {
+      "epoch": 0.17912856930988455,
+      "grad_norm": 0.0027325463015586138,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 6492
+    },
+    {
+      "epoch": 0.17915616151094893,
+      "grad_norm": 0.006061290390789509,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 6493
+    },
+    {
+      "epoch": 0.1791837537120133,
+      "grad_norm": 0.002831445774063468,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 6494
+    },
+    {
+      "epoch": 0.17921134591307766,
+      "grad_norm": 0.004074187949299812,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 6495
+    },
+    {
+      "epoch": 0.17923893811414204,
+      "grad_norm": 0.0037040761671960354,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 6496
+    },
+    {
+      "epoch": 0.1792665303152064,
+      "grad_norm": 0.0032522703986614943,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 6497
+    },
+    {
+      "epoch": 0.17929412251627078,
+      "grad_norm": 0.003379018511623144,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 6498
+    },
+    {
+      "epoch": 0.17932171471733516,
+      "grad_norm": 0.004454959649592638,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 6499
+    },
+    {
+      "epoch": 0.1793493069183995,
+      "grad_norm": 0.006603573448956013,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 6500
+    },
+    {
+      "epoch": 0.1793493069183995,
+      "eval_runtime": 25.1324,
+      "eval_samples_per_second": 1.273,
+      "eval_steps_per_second": 0.159,
+      "step": 6500
+    },
+    {
+      "epoch": 0.1793768991194639,
+      "grad_norm": 0.002346447203308344,
+      "learning_rate": 0.001,
+      "loss": 0.4272,
+      "step": 6501
+    },
+    {
+      "epoch": 0.17940449132052824,
+      "grad_norm": 0.004839559551328421,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 6502
+    },
+    {
+      "epoch": 0.17943208352159262,
+      "grad_norm": 0.0033710715360939503,
+      "learning_rate": 0.001,
+      "loss": 0.4238,
+      "step": 6503
+    },
+    {
+      "epoch": 0.179459675722657,
+      "grad_norm": 0.003859972581267357,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 6504
+    },
+    {
+      "epoch": 0.17948726792372136,
+      "grad_norm": 0.004193128552287817,
+      "learning_rate": 0.001,
+      "loss": 0.443,
+      "step": 6505
+    },
+    {
+      "epoch": 0.17951486012478574,
+      "grad_norm": 0.0029107420705258846,
+      "learning_rate": 0.001,
+      "loss": 0.4284,
+      "step": 6506
+    },
+    {
+      "epoch": 0.1795424523258501,
+      "grad_norm": 0.0028658658266067505,
+      "learning_rate": 0.001,
+      "loss": 0.4214,
+      "step": 6507
+    },
+    {
+      "epoch": 0.17957004452691447,
+      "grad_norm": 0.0021101958118379116,
+      "learning_rate": 0.001,
+      "loss": 0.4653,
+      "step": 6508
+    },
+    {
+      "epoch": 0.17959763672797885,
+      "grad_norm": 0.0034610512666404247,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 6509
+    },
+    {
+      "epoch": 0.1796252289290432,
+      "grad_norm": 0.004026256036013365,
+      "learning_rate": 0.001,
+      "loss": 0.3748,
+      "step": 6510
+    },
+    {
+      "epoch": 0.17965282113010758,
+      "grad_norm": 0.004078659228980541,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 6511
+    },
+    {
+      "epoch": 0.17968041333117193,
+      "grad_norm": 0.006540537811815739,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 6512
+    },
+    {
+      "epoch": 0.17970800553223631,
+      "grad_norm": 0.002590343588963151,
+      "learning_rate": 0.001,
+      "loss": 0.3711,
+      "step": 6513
+    },
+    {
+      "epoch": 0.1797355977333007,
+      "grad_norm": 0.005247985944151878,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 6514
+    },
+    {
+      "epoch": 0.17976318993436505,
+      "grad_norm": 0.0027244454249739647,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 6515
+    },
+    {
+      "epoch": 0.17979078213542943,
+      "grad_norm": 0.002999567426741123,
+      "learning_rate": 0.001,
+      "loss": 0.3676,
+      "step": 6516
+    },
+    {
+      "epoch": 0.17981837433649378,
+      "grad_norm": 0.002379565965384245,
+      "learning_rate": 0.001,
+      "loss": 0.3766,
+      "step": 6517
+    },
+    {
+      "epoch": 0.17984596653755816,
+      "grad_norm": 0.0031703212298452854,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 6518
+    },
+    {
+      "epoch": 0.17987355873862254,
+      "grad_norm": 0.005782125052064657,
+      "learning_rate": 0.001,
+      "loss": 0.4311,
+      "step": 6519
+    },
+    {
+      "epoch": 0.1799011509396869,
+      "grad_norm": 0.002240552334114909,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 6520
+    },
+    {
+      "epoch": 0.17992874314075127,
+      "grad_norm": 0.0027389421593397856,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 6521
+    },
+    {
+      "epoch": 0.17995633534181563,
+      "grad_norm": 0.0032193311490118504,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 6522
+    },
+    {
+      "epoch": 0.17998392754288,
+      "grad_norm": 0.002220897702500224,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 6523
+    },
+    {
+      "epoch": 0.1800115197439444,
+      "grad_norm": 0.0031423268374055624,
+      "learning_rate": 0.001,
+      "loss": 0.3548,
+      "step": 6524
+    },
+    {
+      "epoch": 0.18003911194500874,
+      "grad_norm": 0.0029932213947176933,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 6525
+    },
+    {
+      "epoch": 0.18006670414607312,
+      "grad_norm": 0.0042558046989142895,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 6526
+    },
+    {
+      "epoch": 0.18009429634713747,
+      "grad_norm": 0.006126770284026861,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 6527
+    },
+    {
+      "epoch": 0.18012188854820185,
+      "grad_norm": 0.0038677749689668417,
+      "learning_rate": 0.001,
+      "loss": 0.3591,
+      "step": 6528
+    },
+    {
+      "epoch": 0.18014948074926623,
+      "grad_norm": 0.004701892379671335,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 6529
+    },
+    {
+      "epoch": 0.18017707295033059,
+      "grad_norm": 0.002978444332256913,
+      "learning_rate": 0.001,
+      "loss": 0.3636,
+      "step": 6530
+    },
+    {
+      "epoch": 0.18020466515139497,
+      "grad_norm": 0.003622008254751563,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 6531
+    },
+    {
+      "epoch": 0.18023225735245932,
+      "grad_norm": 0.004150434397161007,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 6532
+    },
+    {
+      "epoch": 0.1802598495535237,
+      "grad_norm": 0.006446919869631529,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 6533
+    },
+    {
+      "epoch": 0.18028744175458808,
+      "grad_norm": 0.0027283939998596907,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 6534
+    },
+    {
+      "epoch": 0.18031503395565243,
+      "grad_norm": 0.0035466288682073355,
+      "learning_rate": 0.001,
+      "loss": 0.4308,
+      "step": 6535
+    },
+    {
+      "epoch": 0.1803426261567168,
+      "grad_norm": 0.0033509554341435432,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 6536
+    },
+    {
+      "epoch": 0.18037021835778116,
+      "grad_norm": 0.0052613504230976105,
+      "learning_rate": 0.001,
+      "loss": 0.3677,
+      "step": 6537
+    },
+    {
+      "epoch": 0.18039781055884554,
+      "grad_norm": 0.0029608176555484533,
+      "learning_rate": 0.001,
+      "loss": 0.4364,
+      "step": 6538
+    },
+    {
+      "epoch": 0.18042540275990993,
+      "grad_norm": 0.002105949679389596,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 6539
+    },
+    {
+      "epoch": 0.18045299496097428,
+      "grad_norm": 0.002690407680347562,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 6540
+    },
+    {
+      "epoch": 0.18048058716203866,
+      "grad_norm": 0.0024390683975070715,
+      "learning_rate": 0.001,
+      "loss": 0.4373,
+      "step": 6541
+    },
+    {
+      "epoch": 0.180508179363103,
+      "grad_norm": 0.0034981141798198223,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 6542
+    },
+    {
+      "epoch": 0.1805357715641674,
+      "grad_norm": 0.003039612900465727,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 6543
+    },
+    {
+      "epoch": 0.18056336376523174,
+      "grad_norm": 0.00274271029047668,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 6544
+    },
+    {
+      "epoch": 0.18059095596629612,
+      "grad_norm": 0.002432740991935134,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 6545
+    },
+    {
+      "epoch": 0.1806185481673605,
+      "grad_norm": 0.003491653362289071,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 6546
+    },
+    {
+      "epoch": 0.18064614036842486,
+      "grad_norm": 0.0032173325307667255,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 6547
+    },
+    {
+      "epoch": 0.18067373256948924,
+      "grad_norm": 0.0029747970402240753,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 6548
+    },
+    {
+      "epoch": 0.1807013247705536,
+      "grad_norm": 0.007667601108551025,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 6549
+    },
+    {
+      "epoch": 0.18072891697161797,
+      "grad_norm": 0.003929935861378908,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 6550
+    },
+    {
+      "epoch": 0.18075650917268235,
+      "grad_norm": 0.004057936370372772,
+      "learning_rate": 0.001,
+      "loss": 0.4289,
+      "step": 6551
+    },
+    {
+      "epoch": 0.1807841013737467,
+      "grad_norm": 0.003355666296556592,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 6552
+    },
+    {
+      "epoch": 0.18081169357481108,
+      "grad_norm": 0.0018296247581019998,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 6553
+    },
+    {
+      "epoch": 0.18083928577587544,
+      "grad_norm": 0.0028984597884118557,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 6554
+    },
+    {
+      "epoch": 0.18086687797693982,
+      "grad_norm": 0.004249675665050745,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 6555
+    },
+    {
+      "epoch": 0.1808944701780042,
+      "grad_norm": 0.002462320262566209,
+      "learning_rate": 0.001,
+      "loss": 0.4299,
+      "step": 6556
+    },
+    {
+      "epoch": 0.18092206237906855,
+      "grad_norm": 0.003266765736043453,
+      "learning_rate": 0.001,
+      "loss": 0.3725,
+      "step": 6557
+    },
+    {
+      "epoch": 0.18094965458013293,
+      "grad_norm": 0.002418224699795246,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 6558
+    },
+    {
+      "epoch": 0.18097724678119728,
+      "grad_norm": 0.002268604701384902,
+      "learning_rate": 0.001,
+      "loss": 0.4535,
+      "step": 6559
+    },
+    {
+      "epoch": 0.18100483898226166,
+      "grad_norm": 0.0030840514227747917,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 6560
+    },
+    {
+      "epoch": 0.18103243118332604,
+      "grad_norm": 0.003670538542792201,
+      "learning_rate": 0.001,
+      "loss": 0.3636,
+      "step": 6561
+    },
+    {
+      "epoch": 0.1810600233843904,
+      "grad_norm": 0.003579066600650549,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 6562
+    },
+    {
+      "epoch": 0.18108761558545478,
+      "grad_norm": 0.008917649276554585,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 6563
+    },
+    {
+      "epoch": 0.18111520778651913,
+      "grad_norm": 0.004259804729372263,
+      "learning_rate": 0.001,
+      "loss": 0.4289,
+      "step": 6564
+    },
+    {
+      "epoch": 0.1811427999875835,
+      "grad_norm": 0.0026127335149794817,
+      "learning_rate": 0.001,
+      "loss": 0.4319,
+      "step": 6565
+    },
+    {
+      "epoch": 0.1811703921886479,
+      "grad_norm": 0.00313868117518723,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 6566
+    },
+    {
+      "epoch": 0.18119798438971224,
+      "grad_norm": 0.002577871782705188,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 6567
+    },
+    {
+      "epoch": 0.18122557659077662,
+      "grad_norm": 0.0029946148861199617,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 6568
+    },
+    {
+      "epoch": 0.18125316879184097,
+      "grad_norm": 0.0025365573819726706,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 6569
+    },
+    {
+      "epoch": 0.18128076099290535,
+      "grad_norm": 0.002418767660856247,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 6570
+    },
+    {
+      "epoch": 0.18130835319396973,
+      "grad_norm": 0.003545231418684125,
+      "learning_rate": 0.001,
+      "loss": 0.3633,
+      "step": 6571
+    },
+    {
+      "epoch": 0.1813359453950341,
+      "grad_norm": 0.0031614559702575207,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 6572
+    },
+    {
+      "epoch": 0.18136353759609847,
+      "grad_norm": 0.004500371403992176,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 6573
+    },
+    {
+      "epoch": 0.18139112979716282,
+      "grad_norm": 0.004762687720358372,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 6574
+    },
+    {
+      "epoch": 0.1814187219982272,
+      "grad_norm": 0.0032746128272265196,
+      "learning_rate": 0.001,
+      "loss": 0.3721,
+      "step": 6575
+    },
+    {
+      "epoch": 0.18144631419929158,
+      "grad_norm": 0.002015149686485529,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 6576
+    },
+    {
+      "epoch": 0.18147390640035593,
+      "grad_norm": 0.003133730962872505,
+      "learning_rate": 0.001,
+      "loss": 0.3765,
+      "step": 6577
+    },
+    {
+      "epoch": 0.1815014986014203,
+      "grad_norm": 0.002634577453136444,
+      "learning_rate": 0.001,
+      "loss": 0.4255,
+      "step": 6578
+    },
+    {
+      "epoch": 0.18152909080248467,
+      "grad_norm": 0.002629198832437396,
+      "learning_rate": 0.001,
+      "loss": 0.3698,
+      "step": 6579
+    },
+    {
+      "epoch": 0.18155668300354905,
+      "grad_norm": 0.0025706025771796703,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 6580
+    },
+    {
+      "epoch": 0.18158427520461343,
+      "grad_norm": 0.0024760281667113304,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 6581
+    },
+    {
+      "epoch": 0.18161186740567778,
+      "grad_norm": 0.002749704523012042,
+      "learning_rate": 0.001,
+      "loss": 0.441,
+      "step": 6582
+    },
+    {
+      "epoch": 0.18163945960674216,
+      "grad_norm": 0.004137085285037756,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 6583
+    },
+    {
+      "epoch": 0.1816670518078065,
+      "grad_norm": 0.003452888922765851,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 6584
+    },
+    {
+      "epoch": 0.1816946440088709,
+      "grad_norm": 0.00280118640512228,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 6585
+    },
+    {
+      "epoch": 0.18172223620993527,
+      "grad_norm": 0.002511856146156788,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 6586
+    },
+    {
+      "epoch": 0.18174982841099963,
+      "grad_norm": 0.003266239771619439,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 6587
+    },
+    {
+      "epoch": 0.181777420612064,
+      "grad_norm": 0.004074828699231148,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 6588
+    },
+    {
+      "epoch": 0.18180501281312836,
+      "grad_norm": 0.0032096183858811855,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 6589
+    },
+    {
+      "epoch": 0.18183260501419274,
+      "grad_norm": 0.004038103390485048,
+      "learning_rate": 0.001,
+      "loss": 0.3757,
+      "step": 6590
+    },
+    {
+      "epoch": 0.18186019721525712,
+      "grad_norm": 0.002833410631865263,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 6591
+    },
+    {
+      "epoch": 0.18188778941632147,
+      "grad_norm": 0.0042181131429970264,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 6592
+    },
+    {
+      "epoch": 0.18191538161738585,
+      "grad_norm": 0.0023462646640837193,
+      "learning_rate": 0.001,
+      "loss": 0.4394,
+      "step": 6593
+    },
+    {
+      "epoch": 0.1819429738184502,
+      "grad_norm": 0.00263848970644176,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 6594
+    },
+    {
+      "epoch": 0.18197056601951458,
+      "grad_norm": 0.0036028139293193817,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 6595
+    },
+    {
+      "epoch": 0.18199815822057896,
+      "grad_norm": 0.0028711268678307533,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 6596
+    },
+    {
+      "epoch": 0.18202575042164332,
+      "grad_norm": 0.0030903166625648737,
+      "learning_rate": 0.001,
+      "loss": 0.4274,
+      "step": 6597
+    },
+    {
+      "epoch": 0.1820533426227077,
+      "grad_norm": 0.0024692462757229805,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 6598
+    },
+    {
+      "epoch": 0.18208093482377205,
+      "grad_norm": 0.0031899542082101107,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 6599
+    },
+    {
+      "epoch": 0.18210852702483643,
+      "grad_norm": 0.003478053957223892,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 6600
+    },
+    {
+      "epoch": 0.1821361192259008,
+      "grad_norm": 0.00227729300968349,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 6601
+    },
+    {
+      "epoch": 0.18216371142696516,
+      "grad_norm": 0.002498425543308258,
+      "learning_rate": 0.001,
+      "loss": 0.3593,
+      "step": 6602
+    },
+    {
+      "epoch": 0.18219130362802954,
+      "grad_norm": 0.0035188214387744665,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 6603
+    },
+    {
+      "epoch": 0.1822188958290939,
+      "grad_norm": 0.0029362791683524847,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 6604
+    },
+    {
+      "epoch": 0.18224648803015828,
+      "grad_norm": 0.0034354175440967083,
+      "learning_rate": 0.001,
+      "loss": 0.3582,
+      "step": 6605
+    },
+    {
+      "epoch": 0.18227408023122266,
+      "grad_norm": 0.0026759831234812737,
+      "learning_rate": 0.001,
+      "loss": 0.3607,
+      "step": 6606
+    },
+    {
+      "epoch": 0.182301672432287,
+      "grad_norm": 0.002816120395436883,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 6607
+    },
+    {
+      "epoch": 0.1823292646333514,
+      "grad_norm": 0.0035022348165512085,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 6608
+    },
+    {
+      "epoch": 0.18235685683441574,
+      "grad_norm": 0.0034114073496311903,
+      "learning_rate": 0.001,
+      "loss": 0.3654,
+      "step": 6609
+    },
+    {
+      "epoch": 0.18238444903548012,
+      "grad_norm": 0.0029557389207184315,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 6610
+    },
+    {
+      "epoch": 0.1824120412365445,
+      "grad_norm": 0.0028255372308194637,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 6611
+    },
+    {
+      "epoch": 0.18243963343760886,
+      "grad_norm": 0.003345831297338009,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 6612
+    },
+    {
+      "epoch": 0.18246722563867324,
+      "grad_norm": 0.0033113721292465925,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 6613
+    },
+    {
+      "epoch": 0.1824948178397376,
+      "grad_norm": 0.0022858104202896357,
+      "learning_rate": 0.001,
+      "loss": 0.4601,
+      "step": 6614
+    },
+    {
+      "epoch": 0.18252241004080197,
+      "grad_norm": 0.0037697155494242907,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 6615
+    },
+    {
+      "epoch": 0.18255000224186635,
+      "grad_norm": 0.003321718657389283,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 6616
+    },
+    {
+      "epoch": 0.1825775944429307,
+      "grad_norm": 0.00823116209357977,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 6617
+    },
+    {
+      "epoch": 0.18260518664399508,
+      "grad_norm": 0.009032117202877998,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 6618
+    },
+    {
+      "epoch": 0.18263277884505943,
+      "grad_norm": 0.012609437108039856,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 6619
+    },
+    {
+      "epoch": 0.18266037104612382,
+      "grad_norm": 0.0026686121709644794,
+      "learning_rate": 0.001,
+      "loss": 0.428,
+      "step": 6620
+    },
+    {
+      "epoch": 0.1826879632471882,
+      "grad_norm": 0.004215455148369074,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 6621
+    },
+    {
+      "epoch": 0.18271555544825255,
+      "grad_norm": 0.004420561250299215,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 6622
+    },
+    {
+      "epoch": 0.18274314764931693,
+      "grad_norm": 0.004728737287223339,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 6623
+    },
+    {
+      "epoch": 0.18277073985038128,
+      "grad_norm": 0.0077790976502001286,
+      "learning_rate": 0.001,
+      "loss": 0.4597,
+      "step": 6624
+    },
+    {
+      "epoch": 0.18279833205144566,
+      "grad_norm": 0.005704334005713463,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 6625
+    },
+    {
+      "epoch": 0.18282592425251004,
+      "grad_norm": 0.0057289209216833115,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 6626
+    },
+    {
+      "epoch": 0.1828535164535744,
+      "grad_norm": 0.0031269195023924112,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 6627
+    },
+    {
+      "epoch": 0.18288110865463877,
+      "grad_norm": 0.005130524281412363,
+      "learning_rate": 0.001,
+      "loss": 0.3633,
+      "step": 6628
+    },
+    {
+      "epoch": 0.18290870085570313,
+      "grad_norm": 0.003109897952526808,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 6629
+    },
+    {
+      "epoch": 0.1829362930567675,
+      "grad_norm": 0.003781312145292759,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 6630
+    },
+    {
+      "epoch": 0.1829638852578319,
+      "grad_norm": 0.002132704248651862,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 6631
+    },
+    {
+      "epoch": 0.18299147745889624,
+      "grad_norm": 0.0032692537643015385,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 6632
+    },
+    {
+      "epoch": 0.18301906965996062,
+      "grad_norm": 0.004832074046134949,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 6633
+    },
+    {
+      "epoch": 0.18304666186102497,
+      "grad_norm": 0.002927541034296155,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 6634
+    },
+    {
+      "epoch": 0.18307425406208935,
+      "grad_norm": 0.0030651381239295006,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 6635
+    },
+    {
+      "epoch": 0.1831018462631537,
+      "grad_norm": 0.005420647095888853,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 6636
+    },
+    {
+      "epoch": 0.1831294384642181,
+      "grad_norm": 0.0034076806623488665,
+      "learning_rate": 0.001,
+      "loss": 0.4476,
+      "step": 6637
+    },
+    {
+      "epoch": 0.18315703066528247,
+      "grad_norm": 0.0023421344812959433,
+      "learning_rate": 0.001,
+      "loss": 0.4285,
+      "step": 6638
+    },
+    {
+      "epoch": 0.18318462286634682,
+      "grad_norm": 0.0025167351122945547,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 6639
+    },
+    {
+      "epoch": 0.1832122150674112,
+      "grad_norm": 0.0035126416478306055,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 6640
+    },
+    {
+      "epoch": 0.18323980726847555,
+      "grad_norm": 0.002516387961804867,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 6641
+    },
+    {
+      "epoch": 0.18326739946953993,
+      "grad_norm": 0.0032357322052121162,
+      "learning_rate": 0.001,
+      "loss": 0.3613,
+      "step": 6642
+    },
+    {
+      "epoch": 0.1832949916706043,
+      "grad_norm": 0.0042640650644898415,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 6643
+    },
+    {
+      "epoch": 0.18332258387166867,
+      "grad_norm": 0.003650575876235962,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 6644
+    },
+    {
+      "epoch": 0.18335017607273305,
+      "grad_norm": 0.00383795821107924,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 6645
+    },
+    {
+      "epoch": 0.1833777682737974,
+      "grad_norm": 0.003271985799074173,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 6646
+    },
+    {
+      "epoch": 0.18340536047486178,
+      "grad_norm": 0.004597705788910389,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 6647
+    },
+    {
+      "epoch": 0.18343295267592616,
+      "grad_norm": 0.0030701288487762213,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 6648
+    },
+    {
+      "epoch": 0.1834605448769905,
+      "grad_norm": 0.002716483548283577,
+      "learning_rate": 0.001,
+      "loss": 0.3562,
+      "step": 6649
+    },
+    {
+      "epoch": 0.1834881370780549,
+      "grad_norm": 0.00313906604424119,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 6650
+    },
+    {
+      "epoch": 0.18351572927911924,
+      "grad_norm": 0.002156095113605261,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 6651
+    },
+    {
+      "epoch": 0.18354332148018362,
+      "grad_norm": 0.005598701070994139,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 6652
+    },
+    {
+      "epoch": 0.183570913681248,
+      "grad_norm": 0.002826511859893799,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 6653
+    },
+    {
+      "epoch": 0.18359850588231236,
+      "grad_norm": 0.004248370416462421,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 6654
+    },
+    {
+      "epoch": 0.18362609808337674,
+      "grad_norm": 0.002483610762283206,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 6655
+    },
+    {
+      "epoch": 0.1836536902844411,
+      "grad_norm": 0.003579988144338131,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 6656
+    },
+    {
+      "epoch": 0.18368128248550547,
+      "grad_norm": 0.002827717922627926,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 6657
+    },
+    {
+      "epoch": 0.18370887468656985,
+      "grad_norm": 0.001899820170365274,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 6658
+    },
+    {
+      "epoch": 0.1837364668876342,
+      "grad_norm": 0.0031157906632870436,
+      "learning_rate": 0.001,
+      "loss": 0.3807,
+      "step": 6659
+    },
+    {
+      "epoch": 0.18376405908869858,
+      "grad_norm": 0.0026786159723997116,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 6660
+    },
+    {
+      "epoch": 0.18379165128976294,
+      "grad_norm": 0.003711652010679245,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 6661
+    },
+    {
+      "epoch": 0.18381924349082732,
+      "grad_norm": 0.009293664246797562,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 6662
+    },
+    {
+      "epoch": 0.1838468356918917,
+      "grad_norm": 0.005516475066542625,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 6663
+    },
+    {
+      "epoch": 0.18387442789295605,
+      "grad_norm": 0.003258306998759508,
+      "learning_rate": 0.001,
+      "loss": 0.3742,
+      "step": 6664
+    },
+    {
+      "epoch": 0.18390202009402043,
+      "grad_norm": 0.0026423600502312183,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 6665
+    },
+    {
+      "epoch": 0.18392961229508478,
+      "grad_norm": 0.0038184835575520992,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 6666
+    },
+    {
+      "epoch": 0.18395720449614916,
+      "grad_norm": 0.007692532613873482,
+      "learning_rate": 0.001,
+      "loss": 0.3629,
+      "step": 6667
+    },
+    {
+      "epoch": 0.18398479669721354,
+      "grad_norm": 0.003459090832620859,
+      "learning_rate": 0.001,
+      "loss": 0.372,
+      "step": 6668
+    },
+    {
+      "epoch": 0.1840123888982779,
+      "grad_norm": 0.002234878484159708,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 6669
+    },
+    {
+      "epoch": 0.18403998109934228,
+      "grad_norm": 0.004345033783465624,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 6670
+    },
+    {
+      "epoch": 0.18406757330040663,
+      "grad_norm": 0.0024329267907887697,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 6671
+    },
+    {
+      "epoch": 0.184095165501471,
+      "grad_norm": 0.004201768897473812,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 6672
+    },
+    {
+      "epoch": 0.1841227577025354,
+      "grad_norm": 0.0029748522210866213,
+      "learning_rate": 0.001,
+      "loss": 0.3698,
+      "step": 6673
+    },
+    {
+      "epoch": 0.18415034990359974,
+      "grad_norm": 0.0033948393538594246,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 6674
+    },
+    {
+      "epoch": 0.18417794210466412,
+      "grad_norm": 0.004755482543259859,
+      "learning_rate": 0.001,
+      "loss": 0.3606,
+      "step": 6675
+    },
+    {
+      "epoch": 0.18420553430572847,
+      "grad_norm": 0.004203738644719124,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 6676
+    },
+    {
+      "epoch": 0.18423312650679285,
+      "grad_norm": 0.0028434142004698515,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 6677
+    },
+    {
+      "epoch": 0.18426071870785724,
+      "grad_norm": 0.0036475297529250383,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 6678
+    },
+    {
+      "epoch": 0.1842883109089216,
+      "grad_norm": 0.0032815190497785807,
+      "learning_rate": 0.001,
+      "loss": 0.3731,
+      "step": 6679
+    },
+    {
+      "epoch": 0.18431590310998597,
+      "grad_norm": 0.0025924514047801495,
+      "learning_rate": 0.001,
+      "loss": 0.4364,
+      "step": 6680
+    },
+    {
+      "epoch": 0.18434349531105032,
+      "grad_norm": 0.003826566506177187,
+      "learning_rate": 0.001,
+      "loss": 0.3723,
+      "step": 6681
+    },
+    {
+      "epoch": 0.1843710875121147,
+      "grad_norm": 0.002758833346888423,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 6682
+    },
+    {
+      "epoch": 0.18439867971317908,
+      "grad_norm": 0.003747826674953103,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 6683
+    },
+    {
+      "epoch": 0.18442627191424343,
+      "grad_norm": 0.005826046224683523,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 6684
+    },
+    {
+      "epoch": 0.18445386411530781,
+      "grad_norm": 0.002600571606308222,
+      "learning_rate": 0.001,
+      "loss": 0.4378,
+      "step": 6685
+    },
+    {
+      "epoch": 0.18448145631637217,
+      "grad_norm": 0.0024368218146264553,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 6686
+    },
+    {
+      "epoch": 0.18450904851743655,
+      "grad_norm": 0.0028827337082475424,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 6687
+    },
+    {
+      "epoch": 0.18453664071850093,
+      "grad_norm": 0.0031948471441864967,
+      "learning_rate": 0.001,
+      "loss": 0.3473,
+      "step": 6688
+    },
+    {
+      "epoch": 0.18456423291956528,
+      "grad_norm": 0.0033101870212703943,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 6689
+    },
+    {
+      "epoch": 0.18459182512062966,
+      "grad_norm": 0.00872339028865099,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 6690
+    },
+    {
+      "epoch": 0.184619417321694,
+      "grad_norm": 0.0024567311629652977,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 6691
+    },
+    {
+      "epoch": 0.1846470095227584,
+      "grad_norm": 0.0036271214485168457,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 6692
+    },
+    {
+      "epoch": 0.18467460172382277,
+      "grad_norm": 0.004172285553067923,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 6693
+    },
+    {
+      "epoch": 0.18470219392488713,
+      "grad_norm": 0.002208105055615306,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 6694
+    },
+    {
+      "epoch": 0.1847297861259515,
+      "grad_norm": 0.00242190295830369,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 6695
+    },
+    {
+      "epoch": 0.18475737832701586,
+      "grad_norm": 0.002299925545230508,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 6696
+    },
+    {
+      "epoch": 0.18478497052808024,
+      "grad_norm": 0.0044403825886547565,
+      "learning_rate": 0.001,
+      "loss": 0.3807,
+      "step": 6697
+    },
+    {
+      "epoch": 0.18481256272914462,
+      "grad_norm": 0.002557777799665928,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 6698
+    },
+    {
+      "epoch": 0.18484015493020897,
+      "grad_norm": 0.003170351032167673,
+      "learning_rate": 0.001,
+      "loss": 0.3567,
+      "step": 6699
+    },
+    {
+      "epoch": 0.18486774713127335,
+      "grad_norm": 0.00265632476657629,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 6700
+    },
+    {
+      "epoch": 0.1848953393323377,
+      "grad_norm": 0.00452793575823307,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 6701
+    },
+    {
+      "epoch": 0.18492293153340209,
+      "grad_norm": 0.003011427354067564,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 6702
+    },
+    {
+      "epoch": 0.18495052373446647,
+      "grad_norm": 0.0024044597521424294,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 6703
+    },
+    {
+      "epoch": 0.18497811593553082,
+      "grad_norm": 0.0035888811107724905,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 6704
+    },
+    {
+      "epoch": 0.1850057081365952,
+      "grad_norm": 0.002353749005123973,
+      "learning_rate": 0.001,
+      "loss": 0.4378,
+      "step": 6705
+    },
+    {
+      "epoch": 0.18503330033765955,
+      "grad_norm": 0.002476689638569951,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 6706
+    },
+    {
+      "epoch": 0.18506089253872393,
+      "grad_norm": 0.002863020868971944,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 6707
+    },
+    {
+      "epoch": 0.1850884847397883,
+      "grad_norm": 0.003525110660120845,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 6708
+    },
+    {
+      "epoch": 0.18511607694085266,
+      "grad_norm": 0.0031487741507589817,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 6709
+    },
+    {
+      "epoch": 0.18514366914191704,
+      "grad_norm": 0.0025411078240722418,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 6710
+    },
+    {
+      "epoch": 0.1851712613429814,
+      "grad_norm": 0.003697034902870655,
+      "learning_rate": 0.001,
+      "loss": 0.3566,
+      "step": 6711
+    },
+    {
+      "epoch": 0.18519885354404578,
+      "grad_norm": 0.0027804269921034575,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 6712
+    },
+    {
+      "epoch": 0.18522644574511016,
+      "grad_norm": 0.0025478231254965067,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 6713
+    },
+    {
+      "epoch": 0.1852540379461745,
+      "grad_norm": 0.005976776126772165,
+      "learning_rate": 0.001,
+      "loss": 0.4248,
+      "step": 6714
+    },
+    {
+      "epoch": 0.1852816301472389,
+      "grad_norm": 0.004181606695055962,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 6715
+    },
+    {
+      "epoch": 0.18530922234830324,
+      "grad_norm": 0.002260936889797449,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 6716
+    },
+    {
+      "epoch": 0.18533681454936762,
+      "grad_norm": 0.004087687470018864,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 6717
+    },
+    {
+      "epoch": 0.185364406750432,
+      "grad_norm": 0.004004262387752533,
+      "learning_rate": 0.001,
+      "loss": 0.3629,
+      "step": 6718
+    },
+    {
+      "epoch": 0.18539199895149636,
+      "grad_norm": 0.0027805129066109657,
+      "learning_rate": 0.001,
+      "loss": 0.3721,
+      "step": 6719
+    },
+    {
+      "epoch": 0.18541959115256074,
+      "grad_norm": 0.0028537947218865156,
+      "learning_rate": 0.001,
+      "loss": 0.4114,
+      "step": 6720
+    },
+    {
+      "epoch": 0.1854471833536251,
+      "grad_norm": 0.002309284871444106,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 6721
+    },
+    {
+      "epoch": 0.18547477555468947,
+      "grad_norm": 0.002640241291373968,
+      "learning_rate": 0.001,
+      "loss": 0.371,
+      "step": 6722
+    },
+    {
+      "epoch": 0.18550236775575385,
+      "grad_norm": 0.0032795467413961887,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 6723
+    },
+    {
+      "epoch": 0.1855299599568182,
+      "grad_norm": 0.0028754386585205793,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 6724
+    },
+    {
+      "epoch": 0.18555755215788258,
+      "grad_norm": 0.003849068423733115,
+      "learning_rate": 0.001,
+      "loss": 0.3489,
+      "step": 6725
+    },
+    {
+      "epoch": 0.18558514435894694,
+      "grad_norm": 0.0026304719503968954,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 6726
+    },
+    {
+      "epoch": 0.18561273656001132,
+      "grad_norm": 0.004114130511879921,
+      "learning_rate": 0.001,
+      "loss": 0.3534,
+      "step": 6727
+    },
+    {
+      "epoch": 0.1856403287610757,
+      "grad_norm": 0.0047980137169361115,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 6728
+    },
+    {
+      "epoch": 0.18566792096214005,
+      "grad_norm": 0.0033457186073064804,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 6729
+    },
+    {
+      "epoch": 0.18569551316320443,
+      "grad_norm": 0.0024908706545829773,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 6730
+    },
+    {
+      "epoch": 0.18572310536426878,
+      "grad_norm": 0.003977627027779818,
+      "learning_rate": 0.001,
+      "loss": 0.4041,
+      "step": 6731
+    },
+    {
+      "epoch": 0.18575069756533316,
+      "grad_norm": 0.002345160348340869,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 6732
+    },
+    {
+      "epoch": 0.18577828976639751,
+      "grad_norm": 0.0025549698621034622,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 6733
+    },
+    {
+      "epoch": 0.1858058819674619,
+      "grad_norm": 0.002513099228963256,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 6734
+    },
+    {
+      "epoch": 0.18583347416852627,
+      "grad_norm": 0.00270891678519547,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 6735
+    },
+    {
+      "epoch": 0.18586106636959063,
+      "grad_norm": 0.002633742755278945,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 6736
+    },
+    {
+      "epoch": 0.185888658570655,
+      "grad_norm": 0.0025300132110714912,
+      "learning_rate": 0.001,
+      "loss": 0.4546,
+      "step": 6737
+    },
+    {
+      "epoch": 0.18591625077171936,
+      "grad_norm": 0.002625576453283429,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 6738
+    },
+    {
+      "epoch": 0.18594384297278374,
+      "grad_norm": 0.002719470066949725,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 6739
+    },
+    {
+      "epoch": 0.18597143517384812,
+      "grad_norm": 0.002173792105168104,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 6740
+    },
+    {
+      "epoch": 0.18599902737491247,
+      "grad_norm": 0.0022414301056414843,
+      "learning_rate": 0.001,
+      "loss": 0.4405,
+      "step": 6741
+    },
+    {
+      "epoch": 0.18602661957597685,
+      "grad_norm": 0.0032112672924995422,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 6742
+    },
+    {
+      "epoch": 0.1860542117770412,
+      "grad_norm": 0.002975397277623415,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 6743
+    },
+    {
+      "epoch": 0.1860818039781056,
+      "grad_norm": 0.0024268808774650097,
+      "learning_rate": 0.001,
+      "loss": 0.3698,
+      "step": 6744
+    },
+    {
+      "epoch": 0.18610939617916997,
+      "grad_norm": 0.0031276950612664223,
+      "learning_rate": 0.001,
+      "loss": 0.4216,
+      "step": 6745
+    },
+    {
+      "epoch": 0.18613698838023432,
+      "grad_norm": 0.0024137012660503387,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 6746
+    },
+    {
+      "epoch": 0.1861645805812987,
+      "grad_norm": 0.002811797196045518,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 6747
+    },
+    {
+      "epoch": 0.18619217278236305,
+      "grad_norm": 0.004455705638974905,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 6748
+    },
+    {
+      "epoch": 0.18621976498342743,
+      "grad_norm": 0.0033044186420738697,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 6749
+    },
+    {
+      "epoch": 0.1862473571844918,
+      "grad_norm": 0.0033213391434401274,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 6750
+    },
+    {
+      "epoch": 0.18627494938555617,
+      "grad_norm": 0.002652913797646761,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 6751
+    },
+    {
+      "epoch": 0.18630254158662055,
+      "grad_norm": 0.003949091769754887,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 6752
+    },
+    {
+      "epoch": 0.1863301337876849,
+      "grad_norm": 0.002760807517915964,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 6753
+    },
+    {
+      "epoch": 0.18635772598874928,
+      "grad_norm": 0.0035468130372464657,
+      "learning_rate": 0.001,
+      "loss": 0.3776,
+      "step": 6754
+    },
+    {
+      "epoch": 0.18638531818981366,
+      "grad_norm": 0.005636704154312611,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 6755
+    },
+    {
+      "epoch": 0.186412910390878,
+      "grad_norm": 0.002743509830906987,
+      "learning_rate": 0.001,
+      "loss": 0.4324,
+      "step": 6756
+    },
+    {
+      "epoch": 0.1864405025919424,
+      "grad_norm": 0.0033133768010884523,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 6757
+    },
+    {
+      "epoch": 0.18646809479300674,
+      "grad_norm": 0.004021006636321545,
+      "learning_rate": 0.001,
+      "loss": 0.3698,
+      "step": 6758
+    },
+    {
+      "epoch": 0.18649568699407112,
+      "grad_norm": 0.0037046188954263926,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 6759
+    },
+    {
+      "epoch": 0.1865232791951355,
+      "grad_norm": 0.0019486859673634171,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 6760
+    },
+    {
+      "epoch": 0.18655087139619986,
+      "grad_norm": 0.005325522273778915,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 6761
+    },
+    {
+      "epoch": 0.18657846359726424,
+      "grad_norm": 0.003112213918939233,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 6762
+    },
+    {
+      "epoch": 0.1866060557983286,
+      "grad_norm": 0.003908544313162565,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 6763
+    },
+    {
+      "epoch": 0.18663364799939297,
+      "grad_norm": 0.0025811195373535156,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 6764
+    },
+    {
+      "epoch": 0.18666124020045735,
+      "grad_norm": 0.002601012121886015,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 6765
+    },
+    {
+      "epoch": 0.1866888324015217,
+      "grad_norm": 0.0023691104725003242,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 6766
+    },
+    {
+      "epoch": 0.18671642460258608,
+      "grad_norm": 0.0021417406387627125,
+      "learning_rate": 0.001,
+      "loss": 0.3635,
+      "step": 6767
+    },
+    {
+      "epoch": 0.18674401680365044,
+      "grad_norm": 0.0022483544889837503,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 6768
+    },
+    {
+      "epoch": 0.18677160900471482,
+      "grad_norm": 0.0032693545799702406,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 6769
+    },
+    {
+      "epoch": 0.1867992012057792,
+      "grad_norm": 0.002980321878567338,
+      "learning_rate": 0.001,
+      "loss": 0.4426,
+      "step": 6770
+    },
+    {
+      "epoch": 0.18682679340684355,
+      "grad_norm": 0.0027730639558285475,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 6771
+    },
+    {
+      "epoch": 0.18685438560790793,
+      "grad_norm": 0.002590013900771737,
+      "learning_rate": 0.001,
+      "loss": 0.3585,
+      "step": 6772
+    },
+    {
+      "epoch": 0.18688197780897228,
+      "grad_norm": 0.00434154225513339,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 6773
+    },
+    {
+      "epoch": 0.18690957001003666,
+      "grad_norm": 0.002608590293675661,
+      "learning_rate": 0.001,
+      "loss": 0.3641,
+      "step": 6774
+    },
+    {
+      "epoch": 0.18693716221110104,
+      "grad_norm": 0.002211198676377535,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 6775
+    },
+    {
+      "epoch": 0.1869647544121654,
+      "grad_norm": 0.002349904738366604,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 6776
+    },
+    {
+      "epoch": 0.18699234661322978,
+      "grad_norm": 0.002399194985628128,
+      "learning_rate": 0.001,
+      "loss": 0.4271,
+      "step": 6777
+    },
+    {
+      "epoch": 0.18701993881429413,
+      "grad_norm": 0.0026427030097693205,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 6778
+    },
+    {
+      "epoch": 0.1870475310153585,
+      "grad_norm": 0.00331774540245533,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 6779
+    },
+    {
+      "epoch": 0.1870751232164229,
+      "grad_norm": 0.003382153809070587,
+      "learning_rate": 0.001,
+      "loss": 0.4269,
+      "step": 6780
+    },
+    {
+      "epoch": 0.18710271541748724,
+      "grad_norm": 0.002654004842042923,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 6781
+    },
+    {
+      "epoch": 0.18713030761855162,
+      "grad_norm": 0.005028711166232824,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 6782
+    },
+    {
+      "epoch": 0.18715789981961597,
+      "grad_norm": 0.002557139378041029,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 6783
+    },
+    {
+      "epoch": 0.18718549202068036,
+      "grad_norm": 0.003317068563774228,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 6784
+    },
+    {
+      "epoch": 0.18721308422174474,
+      "grad_norm": 0.007066572085022926,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 6785
+    },
+    {
+      "epoch": 0.1872406764228091,
+      "grad_norm": 0.002671597758308053,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 6786
+    },
+    {
+      "epoch": 0.18726826862387347,
+      "grad_norm": 0.0031658501829952,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 6787
+    },
+    {
+      "epoch": 0.18729586082493782,
+      "grad_norm": 0.004446005914360285,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 6788
+    },
+    {
+      "epoch": 0.1873234530260022,
+      "grad_norm": 0.004098923876881599,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 6789
+    },
+    {
+      "epoch": 0.18735104522706658,
+      "grad_norm": 0.010191929526627064,
+      "learning_rate": 0.001,
+      "loss": 0.4261,
+      "step": 6790
+    },
+    {
+      "epoch": 0.18737863742813093,
+      "grad_norm": 0.0034303830470889807,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 6791
+    },
+    {
+      "epoch": 0.18740622962919531,
+      "grad_norm": 0.0028527514077723026,
+      "learning_rate": 0.001,
+      "loss": 0.4453,
+      "step": 6792
+    },
+    {
+      "epoch": 0.18743382183025967,
+      "grad_norm": 0.003259077901020646,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 6793
+    },
+    {
+      "epoch": 0.18746141403132405,
+      "grad_norm": 0.003400780726224184,
+      "learning_rate": 0.001,
+      "loss": 0.3775,
+      "step": 6794
+    },
+    {
+      "epoch": 0.18748900623238843,
+      "grad_norm": 0.0031762244179844856,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 6795
+    },
+    {
+      "epoch": 0.18751659843345278,
+      "grad_norm": 0.0025605822447687387,
+      "learning_rate": 0.001,
+      "loss": 0.4467,
+      "step": 6796
+    },
+    {
+      "epoch": 0.18754419063451716,
+      "grad_norm": 0.0029185418970882893,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 6797
+    },
+    {
+      "epoch": 0.1875717828355815,
+      "grad_norm": 0.0023754434660077095,
+      "learning_rate": 0.001,
+      "loss": 0.4238,
+      "step": 6798
+    },
+    {
+      "epoch": 0.1875993750366459,
+      "grad_norm": 0.002524258103221655,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 6799
+    },
+    {
+      "epoch": 0.18762696723771027,
+      "grad_norm": 0.0024060863070189953,
+      "learning_rate": 0.001,
+      "loss": 0.3629,
+      "step": 6800
+    },
+    {
+      "epoch": 0.18765455943877463,
+      "grad_norm": 0.0025536813773214817,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 6801
+    },
+    {
+      "epoch": 0.187682151639839,
+      "grad_norm": 0.0032576583325862885,
+      "learning_rate": 0.001,
+      "loss": 0.4294,
+      "step": 6802
+    },
+    {
+      "epoch": 0.18770974384090336,
+      "grad_norm": 0.004546053241938353,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 6803
+    },
+    {
+      "epoch": 0.18773733604196774,
+      "grad_norm": 0.0041918703354895115,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 6804
+    },
+    {
+      "epoch": 0.18776492824303212,
+      "grad_norm": 0.0028781883884221315,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 6805
+    },
+    {
+      "epoch": 0.18779252044409647,
+      "grad_norm": 0.0031794614624232054,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 6806
+    },
+    {
+      "epoch": 0.18782011264516085,
+      "grad_norm": 0.0023967409506440163,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 6807
+    },
+    {
+      "epoch": 0.1878477048462252,
+      "grad_norm": 0.0029673967510461807,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 6808
+    },
+    {
+      "epoch": 0.18787529704728959,
+      "grad_norm": 0.0028512347489595413,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 6809
+    },
+    {
+      "epoch": 0.18790288924835397,
+      "grad_norm": 0.0031782910227775574,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 6810
+    },
+    {
+      "epoch": 0.18793048144941832,
+      "grad_norm": 0.00338121154345572,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 6811
+    },
+    {
+      "epoch": 0.1879580736504827,
+      "grad_norm": 0.002484740223735571,
+      "learning_rate": 0.001,
+      "loss": 0.4319,
+      "step": 6812
+    },
+    {
+      "epoch": 0.18798566585154705,
+      "grad_norm": 0.0032946360297501087,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 6813
+    },
+    {
+      "epoch": 0.18801325805261143,
+      "grad_norm": 0.003030691295862198,
+      "learning_rate": 0.001,
+      "loss": 0.4301,
+      "step": 6814
+    },
+    {
+      "epoch": 0.1880408502536758,
+      "grad_norm": 0.004018679261207581,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 6815
+    },
+    {
+      "epoch": 0.18806844245474016,
+      "grad_norm": 0.004374852403998375,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 6816
+    },
+    {
+      "epoch": 0.18809603465580454,
+      "grad_norm": 0.003533913753926754,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 6817
+    },
+    {
+      "epoch": 0.1881236268568689,
+      "grad_norm": 0.0031644238624721766,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 6818
+    },
+    {
+      "epoch": 0.18815121905793328,
+      "grad_norm": 0.003066874807700515,
+      "learning_rate": 0.001,
+      "loss": 0.4544,
+      "step": 6819
+    },
+    {
+      "epoch": 0.18817881125899766,
+      "grad_norm": 0.0024850773625075817,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 6820
+    },
+    {
+      "epoch": 0.188206403460062,
+      "grad_norm": 0.002926696091890335,
+      "learning_rate": 0.001,
+      "loss": 0.3464,
+      "step": 6821
+    },
+    {
+      "epoch": 0.1882339956611264,
+      "grad_norm": 0.00351322372443974,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 6822
+    },
+    {
+      "epoch": 0.18826158786219074,
+      "grad_norm": 0.0029151032213121653,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 6823
+    },
+    {
+      "epoch": 0.18828918006325512,
+      "grad_norm": 0.004052076023072004,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 6824
+    },
+    {
+      "epoch": 0.18831677226431948,
+      "grad_norm": 0.00337521662004292,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 6825
+    },
+    {
+      "epoch": 0.18834436446538386,
+      "grad_norm": 0.003744855523109436,
+      "learning_rate": 0.001,
+      "loss": 0.3723,
+      "step": 6826
+    },
+    {
+      "epoch": 0.18837195666644824,
+      "grad_norm": 0.0029044130351394415,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 6827
+    },
+    {
+      "epoch": 0.1883995488675126,
+      "grad_norm": 0.003070064587518573,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 6828
+    },
+    {
+      "epoch": 0.18842714106857697,
+      "grad_norm": 0.0030712231528013945,
+      "learning_rate": 0.001,
+      "loss": 0.4087,
+      "step": 6829
+    },
+    {
+      "epoch": 0.18845473326964132,
+      "grad_norm": 0.0035601078998297453,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 6830
+    },
+    {
+      "epoch": 0.1884823254707057,
+      "grad_norm": 0.0034437361173331738,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 6831
+    },
+    {
+      "epoch": 0.18850991767177008,
+      "grad_norm": 0.003156115999445319,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 6832
+    },
+    {
+      "epoch": 0.18853750987283444,
+      "grad_norm": 0.0028527495451271534,
+      "learning_rate": 0.001,
+      "loss": 0.425,
+      "step": 6833
+    },
+    {
+      "epoch": 0.18856510207389882,
+      "grad_norm": 0.0044526634737849236,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 6834
+    },
+    {
+      "epoch": 0.18859269427496317,
+      "grad_norm": 0.006967521738260984,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 6835
+    },
+    {
+      "epoch": 0.18862028647602755,
+      "grad_norm": 0.0034451198298484087,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 6836
+    },
+    {
+      "epoch": 0.18864787867709193,
+      "grad_norm": 0.004877714905887842,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 6837
+    },
+    {
+      "epoch": 0.18867547087815628,
+      "grad_norm": 0.00429321825504303,
+      "learning_rate": 0.001,
+      "loss": 0.3752,
+      "step": 6838
+    },
+    {
+      "epoch": 0.18870306307922066,
+      "grad_norm": 0.0028502768836915493,
+      "learning_rate": 0.001,
+      "loss": 0.3603,
+      "step": 6839
+    },
+    {
+      "epoch": 0.18873065528028501,
+      "grad_norm": 0.0023765196092426777,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 6840
+    },
+    {
+      "epoch": 0.1887582474813494,
+      "grad_norm": 0.0028856396675109863,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 6841
+    },
+    {
+      "epoch": 0.18878583968241378,
+      "grad_norm": 0.004223294090479612,
+      "learning_rate": 0.001,
+      "loss": 0.4277,
+      "step": 6842
+    },
+    {
+      "epoch": 0.18881343188347813,
+      "grad_norm": 0.003582458943128586,
+      "learning_rate": 0.001,
+      "loss": 0.3699,
+      "step": 6843
+    },
+    {
+      "epoch": 0.1888410240845425,
+      "grad_norm": 0.004069041460752487,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 6844
+    },
+    {
+      "epoch": 0.18886861628560686,
+      "grad_norm": 0.004946923349052668,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 6845
+    },
+    {
+      "epoch": 0.18889620848667124,
+      "grad_norm": 0.002650062320753932,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 6846
+    },
+    {
+      "epoch": 0.18892380068773562,
+      "grad_norm": 0.002152357017621398,
+      "learning_rate": 0.001,
+      "loss": 0.452,
+      "step": 6847
+    },
+    {
+      "epoch": 0.18895139288879997,
+      "grad_norm": 0.0030690866988152266,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 6848
+    },
+    {
+      "epoch": 0.18897898508986435,
+      "grad_norm": 0.0022116086911410093,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 6849
+    },
+    {
+      "epoch": 0.1890065772909287,
+      "grad_norm": 0.0023306317161768675,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 6850
+    },
+    {
+      "epoch": 0.1890341694919931,
+      "grad_norm": 0.002590792253613472,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 6851
+    },
+    {
+      "epoch": 0.18906176169305747,
+      "grad_norm": 0.005286765284836292,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 6852
+    },
+    {
+      "epoch": 0.18908935389412182,
+      "grad_norm": 0.006548671051859856,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 6853
+    },
+    {
+      "epoch": 0.1891169460951862,
+      "grad_norm": 0.002917979843914509,
+      "learning_rate": 0.001,
+      "loss": 0.4222,
+      "step": 6854
+    },
+    {
+      "epoch": 0.18914453829625055,
+      "grad_norm": 0.002655414631590247,
+      "learning_rate": 0.001,
+      "loss": 0.3637,
+      "step": 6855
+    },
+    {
+      "epoch": 0.18917213049731493,
+      "grad_norm": 0.003176578553393483,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 6856
+    },
+    {
+      "epoch": 0.1891997226983793,
+      "grad_norm": 0.0026494793128222227,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 6857
+    },
+    {
+      "epoch": 0.18922731489944367,
+      "grad_norm": 0.0027524055913090706,
+      "learning_rate": 0.001,
+      "loss": 0.4352,
+      "step": 6858
+    },
+    {
+      "epoch": 0.18925490710050805,
+      "grad_norm": 0.0025463078636676073,
+      "learning_rate": 0.001,
+      "loss": 0.4272,
+      "step": 6859
+    },
+    {
+      "epoch": 0.1892824993015724,
+      "grad_norm": 0.011269212700426579,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 6860
+    },
+    {
+      "epoch": 0.18931009150263678,
+      "grad_norm": 0.003725286340340972,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 6861
+    },
+    {
+      "epoch": 0.18933768370370116,
+      "grad_norm": 0.0022669502068310976,
+      "learning_rate": 0.001,
+      "loss": 0.4321,
+      "step": 6862
+    },
+    {
+      "epoch": 0.1893652759047655,
+      "grad_norm": 0.0022833060938864946,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 6863
+    },
+    {
+      "epoch": 0.1893928681058299,
+      "grad_norm": 0.0027603027410805225,
+      "learning_rate": 0.001,
+      "loss": 0.4338,
+      "step": 6864
+    },
+    {
+      "epoch": 0.18942046030689424,
+      "grad_norm": 0.00340470764786005,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 6865
+    },
+    {
+      "epoch": 0.18944805250795863,
+      "grad_norm": 0.003243909915909171,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 6866
+    },
+    {
+      "epoch": 0.189475644709023,
+      "grad_norm": 0.003641802351921797,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 6867
+    },
+    {
+      "epoch": 0.18950323691008736,
+      "grad_norm": 0.00207584910094738,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 6868
+    },
+    {
+      "epoch": 0.18953082911115174,
+      "grad_norm": 0.0021775367204099894,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 6869
+    },
+    {
+      "epoch": 0.1895584213122161,
+      "grad_norm": 0.0023639260325580835,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 6870
+    },
+    {
+      "epoch": 0.18958601351328047,
+      "grad_norm": 0.0024233164731413126,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 6871
+    },
+    {
+      "epoch": 0.18961360571434485,
+      "grad_norm": 0.0024376865476369858,
+      "learning_rate": 0.001,
+      "loss": 0.4345,
+      "step": 6872
+    },
+    {
+      "epoch": 0.1896411979154092,
+      "grad_norm": 0.004064356442540884,
+      "learning_rate": 0.001,
+      "loss": 0.4221,
+      "step": 6873
+    },
+    {
+      "epoch": 0.18966879011647358,
+      "grad_norm": 0.0038676555268466473,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 6874
+    },
+    {
+      "epoch": 0.18969638231753794,
+      "grad_norm": 0.002870697295293212,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 6875
+    },
+    {
+      "epoch": 0.18972397451860232,
+      "grad_norm": 0.005860270466655493,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 6876
+    },
+    {
+      "epoch": 0.1897515667196667,
+      "grad_norm": 0.0024599696043878794,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 6877
+    },
+    {
+      "epoch": 0.18977915892073105,
+      "grad_norm": 0.002083956263959408,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 6878
+    },
+    {
+      "epoch": 0.18980675112179543,
+      "grad_norm": 0.0021180741023272276,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 6879
+    },
+    {
+      "epoch": 0.18983434332285978,
+      "grad_norm": 0.0028299293480813503,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 6880
+    },
+    {
+      "epoch": 0.18986193552392416,
+      "grad_norm": 0.009525451809167862,
+      "learning_rate": 0.001,
+      "loss": 0.362,
+      "step": 6881
+    },
+    {
+      "epoch": 0.18988952772498854,
+      "grad_norm": 0.0036288495175540447,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 6882
+    },
+    {
+      "epoch": 0.1899171199260529,
+      "grad_norm": 0.0025045403745025396,
+      "learning_rate": 0.001,
+      "loss": 0.4469,
+      "step": 6883
+    },
+    {
+      "epoch": 0.18994471212711728,
+      "grad_norm": 0.006380067672580481,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 6884
+    },
+    {
+      "epoch": 0.18997230432818163,
+      "grad_norm": 0.0035281891468912363,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 6885
+    },
+    {
+      "epoch": 0.189999896529246,
+      "grad_norm": 0.012035722844302654,
+      "learning_rate": 0.001,
+      "loss": 0.4396,
+      "step": 6886
+    },
+    {
+      "epoch": 0.1900274887303104,
+      "grad_norm": 0.004180245101451874,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 6887
+    },
+    {
+      "epoch": 0.19005508093137474,
+      "grad_norm": 0.002991945017129183,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 6888
+    },
+    {
+      "epoch": 0.19008267313243912,
+      "grad_norm": 0.002699353964999318,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 6889
+    },
+    {
+      "epoch": 0.19011026533350348,
+      "grad_norm": 0.009517242200672626,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 6890
+    },
+    {
+      "epoch": 0.19013785753456786,
+      "grad_norm": 0.002533604623749852,
+      "learning_rate": 0.001,
+      "loss": 0.419,
+      "step": 6891
+    },
+    {
+      "epoch": 0.19016544973563224,
+      "grad_norm": 0.0020676185376942158,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 6892
+    },
+    {
+      "epoch": 0.1901930419366966,
+      "grad_norm": 0.0027616149745881557,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 6893
+    },
+    {
+      "epoch": 0.19022063413776097,
+      "grad_norm": 0.0025242832489311695,
+      "learning_rate": 0.001,
+      "loss": 0.3775,
+      "step": 6894
+    },
+    {
+      "epoch": 0.19024822633882532,
+      "grad_norm": 0.005067579913884401,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 6895
+    },
+    {
+      "epoch": 0.1902758185398897,
+      "grad_norm": 0.005015381146222353,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 6896
+    },
+    {
+      "epoch": 0.19030341074095408,
+      "grad_norm": 0.010791119188070297,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 6897
+    },
+    {
+      "epoch": 0.19033100294201843,
+      "grad_norm": 0.00212705135345459,
+      "learning_rate": 0.001,
+      "loss": 0.4522,
+      "step": 6898
+    },
+    {
+      "epoch": 0.19035859514308281,
+      "grad_norm": 0.0018394197104498744,
+      "learning_rate": 0.001,
+      "loss": 0.4393,
+      "step": 6899
+    },
+    {
+      "epoch": 0.19038618734414717,
+      "grad_norm": 0.002272665733471513,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 6900
+    },
+    {
+      "epoch": 0.19041377954521155,
+      "grad_norm": 0.002068854635581374,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 6901
+    },
+    {
+      "epoch": 0.19044137174627593,
+      "grad_norm": 0.003429130418226123,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 6902
+    },
+    {
+      "epoch": 0.19046896394734028,
+      "grad_norm": 0.006805672310292721,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 6903
+    },
+    {
+      "epoch": 0.19049655614840466,
+      "grad_norm": 0.002861461602151394,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 6904
+    },
+    {
+      "epoch": 0.190524148349469,
+      "grad_norm": 0.002513586077839136,
+      "learning_rate": 0.001,
+      "loss": 0.4403,
+      "step": 6905
+    },
+    {
+      "epoch": 0.1905517405505334,
+      "grad_norm": 0.003469777060672641,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 6906
+    },
+    {
+      "epoch": 0.19057933275159777,
+      "grad_norm": 0.003417365485802293,
+      "learning_rate": 0.001,
+      "loss": 0.3806,
+      "step": 6907
+    },
+    {
+      "epoch": 0.19060692495266213,
+      "grad_norm": 0.0020143757574260235,
+      "learning_rate": 0.001,
+      "loss": 0.4343,
+      "step": 6908
+    },
+    {
+      "epoch": 0.1906345171537265,
+      "grad_norm": 0.002093646442517638,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 6909
+    },
+    {
+      "epoch": 0.19066210935479086,
+      "grad_norm": 0.001849993597716093,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 6910
+    },
+    {
+      "epoch": 0.19068970155585524,
+      "grad_norm": 0.002308469032868743,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 6911
+    },
+    {
+      "epoch": 0.19071729375691962,
+      "grad_norm": 0.00342297344468534,
+      "learning_rate": 0.001,
+      "loss": 0.3559,
+      "step": 6912
+    },
+    {
+      "epoch": 0.19074488595798397,
+      "grad_norm": 0.002367036882787943,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 6913
+    },
+    {
+      "epoch": 0.19077247815904835,
+      "grad_norm": 0.003705595852807164,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 6914
+    },
+    {
+      "epoch": 0.1908000703601127,
+      "grad_norm": 0.0037233191542327404,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 6915
+    },
+    {
+      "epoch": 0.19082766256117709,
+      "grad_norm": 0.0024186784867197275,
+      "learning_rate": 0.001,
+      "loss": 0.4397,
+      "step": 6916
+    },
+    {
+      "epoch": 0.19085525476224144,
+      "grad_norm": 0.0020412628073245287,
+      "learning_rate": 0.001,
+      "loss": 0.4353,
+      "step": 6917
+    },
+    {
+      "epoch": 0.19088284696330582,
+      "grad_norm": 0.002167558530345559,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 6918
+    },
+    {
+      "epoch": 0.1909104391643702,
+      "grad_norm": 0.002329624257981777,
+      "learning_rate": 0.001,
+      "loss": 0.4321,
+      "step": 6919
+    },
+    {
+      "epoch": 0.19093803136543455,
+      "grad_norm": 0.0029148284811526537,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 6920
+    },
+    {
+      "epoch": 0.19096562356649893,
+      "grad_norm": 0.002269695047289133,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 6921
+    },
+    {
+      "epoch": 0.19099321576756328,
+      "grad_norm": 0.002682054415345192,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 6922
+    },
+    {
+      "epoch": 0.19102080796862766,
+      "grad_norm": 0.0024830945767462254,
+      "learning_rate": 0.001,
+      "loss": 0.4511,
+      "step": 6923
+    },
+    {
+      "epoch": 0.19104840016969205,
+      "grad_norm": 0.003123142756521702,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 6924
+    },
+    {
+      "epoch": 0.1910759923707564,
+      "grad_norm": 0.004371596500277519,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 6925
+    },
+    {
+      "epoch": 0.19110358457182078,
+      "grad_norm": 0.0042950925417244434,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 6926
+    },
+    {
+      "epoch": 0.19113117677288513,
+      "grad_norm": 0.0022360682487487793,
+      "learning_rate": 0.001,
+      "loss": 0.4616,
+      "step": 6927
+    },
+    {
+      "epoch": 0.1911587689739495,
+      "grad_norm": 0.0026784553192555904,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 6928
+    },
+    {
+      "epoch": 0.1911863611750139,
+      "grad_norm": 0.003034649882465601,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 6929
+    },
+    {
+      "epoch": 0.19121395337607824,
+      "grad_norm": 0.006058567203581333,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 6930
+    },
+    {
+      "epoch": 0.19124154557714262,
+      "grad_norm": 0.003582082223147154,
+      "learning_rate": 0.001,
+      "loss": 0.3453,
+      "step": 6931
+    },
+    {
+      "epoch": 0.19126913777820698,
+      "grad_norm": 0.002336689503863454,
+      "learning_rate": 0.001,
+      "loss": 0.4432,
+      "step": 6932
+    },
+    {
+      "epoch": 0.19129672997927136,
+      "grad_norm": 0.0033224106300622225,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 6933
+    },
+    {
+      "epoch": 0.19132432218033574,
+      "grad_norm": 0.005102561321109533,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 6934
+    },
+    {
+      "epoch": 0.1913519143814001,
+      "grad_norm": 0.0050558787770569324,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 6935
+    },
+    {
+      "epoch": 0.19137950658246447,
+      "grad_norm": 0.007568387780338526,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 6936
+    },
+    {
+      "epoch": 0.19140709878352882,
+      "grad_norm": 0.003738366300240159,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 6937
+    },
+    {
+      "epoch": 0.1914346909845932,
+      "grad_norm": 0.0034411989618092775,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 6938
+    },
+    {
+      "epoch": 0.19146228318565758,
+      "grad_norm": 0.0027622180059552193,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 6939
+    },
+    {
+      "epoch": 0.19148987538672194,
+      "grad_norm": 0.0026892200112342834,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 6940
+    },
+    {
+      "epoch": 0.19151746758778632,
+      "grad_norm": 0.002897805068641901,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 6941
+    },
+    {
+      "epoch": 0.19154505978885067,
+      "grad_norm": 0.0027761433739215136,
+      "learning_rate": 0.001,
+      "loss": 0.3846,
+      "step": 6942
+    },
+    {
+      "epoch": 0.19157265198991505,
+      "grad_norm": 0.0034833746030926704,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 6943
+    },
+    {
+      "epoch": 0.19160024419097943,
+      "grad_norm": 0.004619366955012083,
+      "learning_rate": 0.001,
+      "loss": 0.4471,
+      "step": 6944
+    },
+    {
+      "epoch": 0.19162783639204378,
+      "grad_norm": 0.003163927001878619,
+      "learning_rate": 0.001,
+      "loss": 0.4286,
+      "step": 6945
+    },
+    {
+      "epoch": 0.19165542859310816,
+      "grad_norm": 0.005678548477590084,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 6946
+    },
+    {
+      "epoch": 0.19168302079417252,
+      "grad_norm": 0.0026377500034868717,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 6947
+    },
+    {
+      "epoch": 0.1917106129952369,
+      "grad_norm": 0.006021553184837103,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 6948
+    },
+    {
+      "epoch": 0.19173820519630128,
+      "grad_norm": 0.0024780158419162035,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 6949
+    },
+    {
+      "epoch": 0.19176579739736563,
+      "grad_norm": 0.002402723301202059,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 6950
+    },
+    {
+      "epoch": 0.19179338959843,
+      "grad_norm": 0.003152204444631934,
+      "learning_rate": 0.001,
+      "loss": 0.351,
+      "step": 6951
+    },
+    {
+      "epoch": 0.19182098179949436,
+      "grad_norm": 0.003795337863266468,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 6952
+    },
+    {
+      "epoch": 0.19184857400055874,
+      "grad_norm": 0.002820979803800583,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 6953
+    },
+    {
+      "epoch": 0.19187616620162312,
+      "grad_norm": 0.003103316528722644,
+      "learning_rate": 0.001,
+      "loss": 0.4087,
+      "step": 6954
+    },
+    {
+      "epoch": 0.19190375840268747,
+      "grad_norm": 0.002751655410975218,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 6955
+    },
+    {
+      "epoch": 0.19193135060375185,
+      "grad_norm": 0.002747628604993224,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 6956
+    },
+    {
+      "epoch": 0.1919589428048162,
+      "grad_norm": 0.0033904961310327053,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 6957
+    },
+    {
+      "epoch": 0.1919865350058806,
+      "grad_norm": 0.002641551662236452,
+      "learning_rate": 0.001,
+      "loss": 0.3573,
+      "step": 6958
+    },
+    {
+      "epoch": 0.19201412720694497,
+      "grad_norm": 0.0025419299490749836,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 6959
+    },
+    {
+      "epoch": 0.19204171940800932,
+      "grad_norm": 0.0028294960502535105,
+      "learning_rate": 0.001,
+      "loss": 0.3585,
+      "step": 6960
+    },
+    {
+      "epoch": 0.1920693116090737,
+      "grad_norm": 0.0031680443789809942,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 6961
+    },
+    {
+      "epoch": 0.19209690381013805,
+      "grad_norm": 0.0024578890297561884,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 6962
+    },
+    {
+      "epoch": 0.19212449601120243,
+      "grad_norm": 0.002299124840646982,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 6963
+    },
+    {
+      "epoch": 0.19215208821226681,
+      "grad_norm": 0.0031422816682606936,
+      "learning_rate": 0.001,
+      "loss": 0.4339,
+      "step": 6964
+    },
+    {
+      "epoch": 0.19217968041333117,
+      "grad_norm": 0.004103971645236015,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 6965
+    },
+    {
+      "epoch": 0.19220727261439555,
+      "grad_norm": 0.002797805005684495,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 6966
+    },
+    {
+      "epoch": 0.1922348648154599,
+      "grad_norm": 0.006955363787710667,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 6967
+    },
+    {
+      "epoch": 0.19226245701652428,
+      "grad_norm": 0.0027699500788003206,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 6968
+    },
+    {
+      "epoch": 0.19229004921758866,
+      "grad_norm": 0.006645936518907547,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 6969
+    },
+    {
+      "epoch": 0.192317641418653,
+      "grad_norm": 0.003216095734387636,
+      "learning_rate": 0.001,
+      "loss": 0.4473,
+      "step": 6970
+    },
+    {
+      "epoch": 0.1923452336197174,
+      "grad_norm": 0.002484220312908292,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 6971
+    },
+    {
+      "epoch": 0.19237282582078175,
+      "grad_norm": 0.0034046086948364973,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 6972
+    },
+    {
+      "epoch": 0.19240041802184613,
+      "grad_norm": 0.0025209994055330753,
+      "learning_rate": 0.001,
+      "loss": 0.4586,
+      "step": 6973
+    },
+    {
+      "epoch": 0.1924280102229105,
+      "grad_norm": 0.004154059570282698,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 6974
+    },
+    {
+      "epoch": 0.19245560242397486,
+      "grad_norm": 0.003831770271062851,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 6975
+    },
+    {
+      "epoch": 0.19248319462503924,
+      "grad_norm": 0.005726317409425974,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 6976
+    },
+    {
+      "epoch": 0.1925107868261036,
+      "grad_norm": 0.0032162668649107218,
+      "learning_rate": 0.001,
+      "loss": 0.3735,
+      "step": 6977
+    },
+    {
+      "epoch": 0.19253837902716797,
+      "grad_norm": 0.0027887041214853525,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 6978
+    },
+    {
+      "epoch": 0.19256597122823235,
+      "grad_norm": 0.0031014943961054087,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 6979
+    },
+    {
+      "epoch": 0.1925935634292967,
+      "grad_norm": 0.0026730517856776714,
+      "learning_rate": 0.001,
+      "loss": 0.4275,
+      "step": 6980
+    },
+    {
+      "epoch": 0.19262115563036109,
+      "grad_norm": 0.002030472969636321,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 6981
+    },
+    {
+      "epoch": 0.19264874783142544,
+      "grad_norm": 0.0030617276206612587,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 6982
+    },
+    {
+      "epoch": 0.19267634003248982,
+      "grad_norm": 0.0025087720714509487,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 6983
+    },
+    {
+      "epoch": 0.1927039322335542,
+      "grad_norm": 0.0020648448262363672,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 6984
+    },
+    {
+      "epoch": 0.19273152443461855,
+      "grad_norm": 0.002378196455538273,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 6985
+    },
+    {
+      "epoch": 0.19275911663568293,
+      "grad_norm": 0.0027894238010048866,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 6986
+    },
+    {
+      "epoch": 0.19278670883674728,
+      "grad_norm": 0.002425672486424446,
+      "learning_rate": 0.001,
+      "loss": 0.3512,
+      "step": 6987
+    },
+    {
+      "epoch": 0.19281430103781166,
+      "grad_norm": 0.0021580031607300043,
+      "learning_rate": 0.001,
+      "loss": 0.4396,
+      "step": 6988
+    },
+    {
+      "epoch": 0.19284189323887604,
+      "grad_norm": 0.0041187843307852745,
+      "learning_rate": 0.001,
+      "loss": 0.3748,
+      "step": 6989
+    },
+    {
+      "epoch": 0.1928694854399404,
+      "grad_norm": 0.0023956988006830215,
+      "learning_rate": 0.001,
+      "loss": 0.4488,
+      "step": 6990
+    },
+    {
+      "epoch": 0.19289707764100478,
+      "grad_norm": 0.0022000286262482405,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 6991
+    },
+    {
+      "epoch": 0.19292466984206913,
+      "grad_norm": 0.0037109351251274347,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 6992
+    },
+    {
+      "epoch": 0.1929522620431335,
+      "grad_norm": 0.0028746852185577154,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 6993
+    },
+    {
+      "epoch": 0.1929798542441979,
+      "grad_norm": 0.0026120489928871393,
+      "learning_rate": 0.001,
+      "loss": 0.3846,
+      "step": 6994
+    },
+    {
+      "epoch": 0.19300744644526224,
+      "grad_norm": 0.0025121879298239946,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 6995
+    },
+    {
+      "epoch": 0.19303503864632662,
+      "grad_norm": 0.0026690547820180655,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 6996
+    },
+    {
+      "epoch": 0.19306263084739098,
+      "grad_norm": 0.0021126086357980967,
+      "learning_rate": 0.001,
+      "loss": 0.3714,
+      "step": 6997
+    },
+    {
+      "epoch": 0.19309022304845536,
+      "grad_norm": 0.007225458975881338,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 6998
+    },
+    {
+      "epoch": 0.19311781524951974,
+      "grad_norm": 0.002150959335267544,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 6999
+    },
+    {
+      "epoch": 0.1931454074505841,
+      "grad_norm": 0.002550938166677952,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 7000
+    },
+    {
+      "epoch": 0.1931454074505841,
+      "eval_runtime": 24.8881,
+      "eval_samples_per_second": 1.286,
+      "eval_steps_per_second": 0.161,
+      "step": 7000
+    },
+    {
+      "epoch": 0.19317299965164847,
+      "grad_norm": 0.00321765523403883,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 7001
+    },
+    {
+      "epoch": 0.19320059185271282,
+      "grad_norm": 0.0028067470993846655,
+      "learning_rate": 0.001,
+      "loss": 0.4288,
+      "step": 7002
+    },
+    {
+      "epoch": 0.1932281840537772,
+      "grad_norm": 0.002428458072245121,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 7003
+    },
+    {
+      "epoch": 0.19325577625484158,
+      "grad_norm": 0.003469350514933467,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 7004
+    },
+    {
+      "epoch": 0.19328336845590594,
+      "grad_norm": 0.0026082920376211405,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 7005
+    },
+    {
+      "epoch": 0.19331096065697032,
+      "grad_norm": 0.002355298027396202,
+      "learning_rate": 0.001,
+      "loss": 0.4367,
+      "step": 7006
+    },
+    {
+      "epoch": 0.19333855285803467,
+      "grad_norm": 0.002677205018699169,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 7007
+    },
+    {
+      "epoch": 0.19336614505909905,
+      "grad_norm": 0.002955394797027111,
+      "learning_rate": 0.001,
+      "loss": 0.4261,
+      "step": 7008
+    },
+    {
+      "epoch": 0.19339373726016343,
+      "grad_norm": 0.0033979052677750587,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 7009
+    },
+    {
+      "epoch": 0.19342132946122778,
+      "grad_norm": 0.004769494291394949,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 7010
+    },
+    {
+      "epoch": 0.19344892166229216,
+      "grad_norm": 0.003045122604817152,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 7011
+    },
+    {
+      "epoch": 0.19347651386335651,
+      "grad_norm": 0.00237339548766613,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 7012
+    },
+    {
+      "epoch": 0.1935041060644209,
+      "grad_norm": 0.002918932121247053,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 7013
+    },
+    {
+      "epoch": 0.19353169826548525,
+      "grad_norm": 0.003531453665345907,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 7014
+    },
+    {
+      "epoch": 0.19355929046654963,
+      "grad_norm": 0.002957909367978573,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 7015
+    },
+    {
+      "epoch": 0.193586882667614,
+      "grad_norm": 0.002795727690681815,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 7016
+    },
+    {
+      "epoch": 0.19361447486867836,
+      "grad_norm": 0.002575729275122285,
+      "learning_rate": 0.001,
+      "loss": 0.3677,
+      "step": 7017
+    },
+    {
+      "epoch": 0.19364206706974274,
+      "grad_norm": 0.011985518038272858,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 7018
+    },
+    {
+      "epoch": 0.1936696592708071,
+      "grad_norm": 0.005119143985211849,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 7019
+    },
+    {
+      "epoch": 0.19369725147187147,
+      "grad_norm": 0.002983895130455494,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 7020
+    },
+    {
+      "epoch": 0.19372484367293585,
+      "grad_norm": 0.0021343736443668604,
+      "learning_rate": 0.001,
+      "loss": 0.4559,
+      "step": 7021
+    },
+    {
+      "epoch": 0.1937524358740002,
+      "grad_norm": 0.0032467253040522337,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 7022
+    },
+    {
+      "epoch": 0.1937800280750646,
+      "grad_norm": 0.0023335155565291643,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 7023
+    },
+    {
+      "epoch": 0.19380762027612894,
+      "grad_norm": 0.002503189956769347,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 7024
+    },
+    {
+      "epoch": 0.19383521247719332,
+      "grad_norm": 0.002620559884235263,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 7025
+    },
+    {
+      "epoch": 0.1938628046782577,
+      "grad_norm": 0.002399984747171402,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 7026
+    },
+    {
+      "epoch": 0.19389039687932205,
+      "grad_norm": 0.0024000145494937897,
+      "learning_rate": 0.001,
+      "loss": 0.4327,
+      "step": 7027
+    },
+    {
+      "epoch": 0.19391798908038643,
+      "grad_norm": 0.0024858317337930202,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 7028
+    },
+    {
+      "epoch": 0.19394558128145079,
+      "grad_norm": 0.0028986751567572355,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 7029
+    },
+    {
+      "epoch": 0.19397317348251517,
+      "grad_norm": 0.0020534794311970472,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 7030
+    },
+    {
+      "epoch": 0.19400076568357955,
+      "grad_norm": 0.0021818815730512142,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 7031
+    },
+    {
+      "epoch": 0.1940283578846439,
+      "grad_norm": 0.003896314650774002,
+      "learning_rate": 0.001,
+      "loss": 0.3739,
+      "step": 7032
+    },
+    {
+      "epoch": 0.19405595008570828,
+      "grad_norm": 0.003106046002358198,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 7033
+    },
+    {
+      "epoch": 0.19408354228677263,
+      "grad_norm": 0.003942525014281273,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 7034
+    },
+    {
+      "epoch": 0.194111134487837,
+      "grad_norm": 0.006781655363738537,
+      "learning_rate": 0.001,
+      "loss": 0.3489,
+      "step": 7035
+    },
+    {
+      "epoch": 0.1941387266889014,
+      "grad_norm": 0.0025264392606914043,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 7036
+    },
+    {
+      "epoch": 0.19416631888996574,
+      "grad_norm": 0.0023680292069911957,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 7037
+    },
+    {
+      "epoch": 0.19419391109103012,
+      "grad_norm": 0.0024297498166561127,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 7038
+    },
+    {
+      "epoch": 0.19422150329209448,
+      "grad_norm": 0.0028899710159748793,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 7039
+    },
+    {
+      "epoch": 0.19424909549315886,
+      "grad_norm": 0.00262429122813046,
+      "learning_rate": 0.001,
+      "loss": 0.4411,
+      "step": 7040
+    },
+    {
+      "epoch": 0.19427668769422324,
+      "grad_norm": 0.0034140751231461763,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 7041
+    },
+    {
+      "epoch": 0.1943042798952876,
+      "grad_norm": 0.003651238512247801,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 7042
+    },
+    {
+      "epoch": 0.19433187209635197,
+      "grad_norm": 0.0027614810969680548,
+      "learning_rate": 0.001,
+      "loss": 0.3686,
+      "step": 7043
+    },
+    {
+      "epoch": 0.19435946429741632,
+      "grad_norm": 0.0027164684142917395,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 7044
+    },
+    {
+      "epoch": 0.1943870564984807,
+      "grad_norm": 0.0031832915265113115,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 7045
+    },
+    {
+      "epoch": 0.19441464869954508,
+      "grad_norm": 0.0033719497732818127,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 7046
+    },
+    {
+      "epoch": 0.19444224090060944,
+      "grad_norm": 0.002569133648648858,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 7047
+    },
+    {
+      "epoch": 0.19446983310167382,
+      "grad_norm": 0.0024267893750220537,
+      "learning_rate": 0.001,
+      "loss": 0.3761,
+      "step": 7048
+    },
+    {
+      "epoch": 0.19449742530273817,
+      "grad_norm": 0.002893506083637476,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 7049
+    },
+    {
+      "epoch": 0.19452501750380255,
+      "grad_norm": 0.002697143005207181,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 7050
+    },
+    {
+      "epoch": 0.19455260970486693,
+      "grad_norm": 0.0032814224250614643,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 7051
+    },
+    {
+      "epoch": 0.19458020190593128,
+      "grad_norm": 0.002781797433272004,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 7052
+    },
+    {
+      "epoch": 0.19460779410699566,
+      "grad_norm": 0.003478818805888295,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 7053
+    },
+    {
+      "epoch": 0.19463538630806002,
+      "grad_norm": 0.0027507366612553596,
+      "learning_rate": 0.001,
+      "loss": 0.3745,
+      "step": 7054
+    },
+    {
+      "epoch": 0.1946629785091244,
+      "grad_norm": 0.0034008813090622425,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 7055
+    },
+    {
+      "epoch": 0.19469057071018878,
+      "grad_norm": 0.0030989109072834253,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 7056
+    },
+    {
+      "epoch": 0.19471816291125313,
+      "grad_norm": 0.003060347633436322,
+      "learning_rate": 0.001,
+      "loss": 0.3682,
+      "step": 7057
+    },
+    {
+      "epoch": 0.1947457551123175,
+      "grad_norm": 0.0040310220792889595,
+      "learning_rate": 0.001,
+      "loss": 0.3729,
+      "step": 7058
+    },
+    {
+      "epoch": 0.19477334731338186,
+      "grad_norm": 0.0034857376012951136,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 7059
+    },
+    {
+      "epoch": 0.19480093951444624,
+      "grad_norm": 0.0030391844920814037,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 7060
+    },
+    {
+      "epoch": 0.19482853171551062,
+      "grad_norm": 0.0030424713622778654,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 7061
+    },
+    {
+      "epoch": 0.19485612391657497,
+      "grad_norm": 0.0027415102813392878,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 7062
+    },
+    {
+      "epoch": 0.19488371611763936,
+      "grad_norm": 0.002916666679084301,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 7063
+    },
+    {
+      "epoch": 0.1949113083187037,
+      "grad_norm": 0.0023576219100505114,
+      "learning_rate": 0.001,
+      "loss": 0.4512,
+      "step": 7064
+    },
+    {
+      "epoch": 0.1949389005197681,
+      "grad_norm": 0.002573948120698333,
+      "learning_rate": 0.001,
+      "loss": 0.3731,
+      "step": 7065
+    },
+    {
+      "epoch": 0.19496649272083247,
+      "grad_norm": 0.004156920593231916,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 7066
+    },
+    {
+      "epoch": 0.19499408492189682,
+      "grad_norm": 0.0037690969184041023,
+      "learning_rate": 0.001,
+      "loss": 0.3681,
+      "step": 7067
+    },
+    {
+      "epoch": 0.1950216771229612,
+      "grad_norm": 0.0031203448306769133,
+      "learning_rate": 0.001,
+      "loss": 0.3632,
+      "step": 7068
+    },
+    {
+      "epoch": 0.19504926932402555,
+      "grad_norm": 0.002718303119763732,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 7069
+    },
+    {
+      "epoch": 0.19507686152508993,
+      "grad_norm": 0.002880721352994442,
+      "learning_rate": 0.001,
+      "loss": 0.3749,
+      "step": 7070
+    },
+    {
+      "epoch": 0.19510445372615431,
+      "grad_norm": 0.0021646865643560886,
+      "learning_rate": 0.001,
+      "loss": 0.3722,
+      "step": 7071
+    },
+    {
+      "epoch": 0.19513204592721867,
+      "grad_norm": 0.004858906380832195,
+      "learning_rate": 0.001,
+      "loss": 0.3744,
+      "step": 7072
+    },
+    {
+      "epoch": 0.19515963812828305,
+      "grad_norm": 0.006143823266029358,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 7073
+    },
+    {
+      "epoch": 0.1951872303293474,
+      "grad_norm": 0.002867503557354212,
+      "learning_rate": 0.001,
+      "loss": 0.3752,
+      "step": 7074
+    },
+    {
+      "epoch": 0.19521482253041178,
+      "grad_norm": 0.0057819196954369545,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 7075
+    },
+    {
+      "epoch": 0.19524241473147616,
+      "grad_norm": 0.003106197575107217,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 7076
+    },
+    {
+      "epoch": 0.1952700069325405,
+      "grad_norm": 0.0021443332079797983,
+      "learning_rate": 0.001,
+      "loss": 0.3717,
+      "step": 7077
+    },
+    {
+      "epoch": 0.1952975991336049,
+      "grad_norm": 0.003420140827074647,
+      "learning_rate": 0.001,
+      "loss": 0.3775,
+      "step": 7078
+    },
+    {
+      "epoch": 0.19532519133466925,
+      "grad_norm": 0.0017785170348361135,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 7079
+    },
+    {
+      "epoch": 0.19535278353573363,
+      "grad_norm": 0.003068318823352456,
+      "learning_rate": 0.001,
+      "loss": 0.3768,
+      "step": 7080
+    },
+    {
+      "epoch": 0.195380375736798,
+      "grad_norm": 0.0028662248514592648,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 7081
+    },
+    {
+      "epoch": 0.19540796793786236,
+      "grad_norm": 0.002816071966663003,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 7082
+    },
+    {
+      "epoch": 0.19543556013892674,
+      "grad_norm": 0.003606053302064538,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 7083
+    },
+    {
+      "epoch": 0.1954631523399911,
+      "grad_norm": 0.002966247033327818,
+      "learning_rate": 0.001,
+      "loss": 0.352,
+      "step": 7084
+    },
+    {
+      "epoch": 0.19549074454105547,
+      "grad_norm": 0.0033741393126547337,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 7085
+    },
+    {
+      "epoch": 0.19551833674211985,
+      "grad_norm": 0.0050892336294054985,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 7086
+    },
+    {
+      "epoch": 0.1955459289431842,
+      "grad_norm": 0.0030292568262666464,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 7087
+    },
+    {
+      "epoch": 0.19557352114424859,
+      "grad_norm": 0.0051439255475997925,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 7088
+    },
+    {
+      "epoch": 0.19560111334531294,
+      "grad_norm": 0.0027856396045535803,
+      "learning_rate": 0.001,
+      "loss": 0.45,
+      "step": 7089
+    },
+    {
+      "epoch": 0.19562870554637732,
+      "grad_norm": 0.0030972841195762157,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 7090
+    },
+    {
+      "epoch": 0.1956562977474417,
+      "grad_norm": 0.003249438712373376,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 7091
+    },
+    {
+      "epoch": 0.19568388994850605,
+      "grad_norm": 0.0027454618830233812,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 7092
+    },
+    {
+      "epoch": 0.19571148214957043,
+      "grad_norm": 0.0027419314719736576,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 7093
+    },
+    {
+      "epoch": 0.19573907435063478,
+      "grad_norm": 0.0027157592121511698,
+      "learning_rate": 0.001,
+      "loss": 0.3786,
+      "step": 7094
+    },
+    {
+      "epoch": 0.19576666655169916,
+      "grad_norm": 0.006076755467802286,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 7095
+    },
+    {
+      "epoch": 0.19579425875276354,
+      "grad_norm": 0.0034049993846565485,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 7096
+    },
+    {
+      "epoch": 0.1958218509538279,
+      "grad_norm": 0.004467803984880447,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 7097
+    },
+    {
+      "epoch": 0.19584944315489228,
+      "grad_norm": 0.0059601617977023125,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 7098
+    },
+    {
+      "epoch": 0.19587703535595663,
+      "grad_norm": 0.0025327398907393217,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 7099
+    },
+    {
+      "epoch": 0.195904627557021,
+      "grad_norm": 0.002435739152133465,
+      "learning_rate": 0.001,
+      "loss": 0.3729,
+      "step": 7100
+    },
+    {
+      "epoch": 0.1959322197580854,
+      "grad_norm": 0.002827326999977231,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 7101
+    },
+    {
+      "epoch": 0.19595981195914974,
+      "grad_norm": 0.003259490942582488,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 7102
+    },
+    {
+      "epoch": 0.19598740416021412,
+      "grad_norm": 0.0026045122649520636,
+      "learning_rate": 0.001,
+      "loss": 0.3684,
+      "step": 7103
+    },
+    {
+      "epoch": 0.19601499636127848,
+      "grad_norm": 0.0032783085480332375,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 7104
+    },
+    {
+      "epoch": 0.19604258856234286,
+      "grad_norm": 0.004205625504255295,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 7105
+    },
+    {
+      "epoch": 0.1960701807634072,
+      "grad_norm": 0.004856889136135578,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 7106
+    },
+    {
+      "epoch": 0.1960977729644716,
+      "grad_norm": 0.005419073160737753,
+      "learning_rate": 0.001,
+      "loss": 0.4515,
+      "step": 7107
+    },
+    {
+      "epoch": 0.19612536516553597,
+      "grad_norm": 0.002779455156996846,
+      "learning_rate": 0.001,
+      "loss": 0.3753,
+      "step": 7108
+    },
+    {
+      "epoch": 0.19615295736660032,
+      "grad_norm": 0.005004864651709795,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 7109
+    },
+    {
+      "epoch": 0.1961805495676647,
+      "grad_norm": 0.0031100206542760134,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 7110
+    },
+    {
+      "epoch": 0.19620814176872906,
+      "grad_norm": 0.002827596152201295,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 7111
+    },
+    {
+      "epoch": 0.19623573396979344,
+      "grad_norm": 0.003027317812666297,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 7112
+    },
+    {
+      "epoch": 0.19626332617085782,
+      "grad_norm": 0.004092982038855553,
+      "learning_rate": 0.001,
+      "loss": 0.3499,
+      "step": 7113
+    },
+    {
+      "epoch": 0.19629091837192217,
+      "grad_norm": 0.0034322801511734724,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 7114
+    },
+    {
+      "epoch": 0.19631851057298655,
+      "grad_norm": 0.0031399535946547985,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 7115
+    },
+    {
+      "epoch": 0.1963461027740509,
+      "grad_norm": 0.004002594854682684,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 7116
+    },
+    {
+      "epoch": 0.19637369497511528,
+      "grad_norm": 0.0028446640353649855,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 7117
+    },
+    {
+      "epoch": 0.19640128717617966,
+      "grad_norm": 0.0026851436123251915,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 7118
+    },
+    {
+      "epoch": 0.19642887937724401,
+      "grad_norm": 0.004198794718831778,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 7119
+    },
+    {
+      "epoch": 0.1964564715783084,
+      "grad_norm": 0.0026877266354858875,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 7120
+    },
+    {
+      "epoch": 0.19648406377937275,
+      "grad_norm": 0.0036120673175901175,
+      "learning_rate": 0.001,
+      "loss": 0.4328,
+      "step": 7121
+    },
+    {
+      "epoch": 0.19651165598043713,
+      "grad_norm": 0.004937993362545967,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 7122
+    },
+    {
+      "epoch": 0.1965392481815015,
+      "grad_norm": 0.0036189809907227755,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 7123
+    },
+    {
+      "epoch": 0.19656684038256586,
+      "grad_norm": 0.002798637840896845,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 7124
+    },
+    {
+      "epoch": 0.19659443258363024,
+      "grad_norm": 0.005139973945915699,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 7125
+    },
+    {
+      "epoch": 0.1966220247846946,
+      "grad_norm": 0.006130583584308624,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 7126
+    },
+    {
+      "epoch": 0.19664961698575897,
+      "grad_norm": 0.005488388240337372,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 7127
+    },
+    {
+      "epoch": 0.19667720918682335,
+      "grad_norm": 0.004060831852257252,
+      "learning_rate": 0.001,
+      "loss": 0.4456,
+      "step": 7128
+    },
+    {
+      "epoch": 0.1967048013878877,
+      "grad_norm": 0.0029782892670482397,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 7129
+    },
+    {
+      "epoch": 0.1967323935889521,
+      "grad_norm": 0.0036948262713849545,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 7130
+    },
+    {
+      "epoch": 0.19675998579001644,
+      "grad_norm": 0.004071654751896858,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 7131
+    },
+    {
+      "epoch": 0.19678757799108082,
+      "grad_norm": 0.0027887504547834396,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 7132
+    },
+    {
+      "epoch": 0.1968151701921452,
+      "grad_norm": 0.003311131615191698,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 7133
+    },
+    {
+      "epoch": 0.19684276239320955,
+      "grad_norm": 0.002101517515257001,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 7134
+    },
+    {
+      "epoch": 0.19687035459427393,
+      "grad_norm": 0.002868801588192582,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 7135
+    },
+    {
+      "epoch": 0.19689794679533829,
+      "grad_norm": 0.003476998768746853,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 7136
+    },
+    {
+      "epoch": 0.19692553899640267,
+      "grad_norm": 0.0028729683253914118,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 7137
+    },
+    {
+      "epoch": 0.19695313119746705,
+      "grad_norm": 0.0023318929597735405,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 7138
+    },
+    {
+      "epoch": 0.1969807233985314,
+      "grad_norm": 0.0027352285105735064,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 7139
+    },
+    {
+      "epoch": 0.19700831559959578,
+      "grad_norm": 0.006381817627698183,
+      "learning_rate": 0.001,
+      "loss": 0.3775,
+      "step": 7140
+    },
+    {
+      "epoch": 0.19703590780066013,
+      "grad_norm": 0.0036294516175985336,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 7141
+    },
+    {
+      "epoch": 0.1970635000017245,
+      "grad_norm": 0.0023575150407850742,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 7142
+    },
+    {
+      "epoch": 0.1970910922027889,
+      "grad_norm": 0.0027103947941213846,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 7143
+    },
+    {
+      "epoch": 0.19711868440385324,
+      "grad_norm": 0.003010998945683241,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 7144
+    },
+    {
+      "epoch": 0.19714627660491763,
+      "grad_norm": 0.0021617074962705374,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 7145
+    },
+    {
+      "epoch": 0.19717386880598198,
+      "grad_norm": 0.008347420953214169,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 7146
+    },
+    {
+      "epoch": 0.19720146100704636,
+      "grad_norm": 0.002271142089739442,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 7147
+    },
+    {
+      "epoch": 0.19722905320811074,
+      "grad_norm": 0.005453981924802065,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 7148
+    },
+    {
+      "epoch": 0.1972566454091751,
+      "grad_norm": 0.003245376283302903,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 7149
+    },
+    {
+      "epoch": 0.19728423761023947,
+      "grad_norm": 0.0026226479094475508,
+      "learning_rate": 0.001,
+      "loss": 0.3647,
+      "step": 7150
+    },
+    {
+      "epoch": 0.19731182981130382,
+      "grad_norm": 0.003077802946791053,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 7151
+    },
+    {
+      "epoch": 0.1973394220123682,
+      "grad_norm": 0.0025755036622285843,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 7152
+    },
+    {
+      "epoch": 0.19736701421343258,
+      "grad_norm": 0.0020703410264104605,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 7153
+    },
+    {
+      "epoch": 0.19739460641449694,
+      "grad_norm": 0.0022496734745800495,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 7154
+    },
+    {
+      "epoch": 0.19742219861556132,
+      "grad_norm": 0.004225686192512512,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 7155
+    },
+    {
+      "epoch": 0.19744979081662567,
+      "grad_norm": 0.0025360104627907276,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 7156
+    },
+    {
+      "epoch": 0.19747738301769005,
+      "grad_norm": 0.00297233066521585,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 7157
+    },
+    {
+      "epoch": 0.19750497521875443,
+      "grad_norm": 0.0028523015789687634,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 7158
+    },
+    {
+      "epoch": 0.19753256741981878,
+      "grad_norm": 0.002735694171860814,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 7159
+    },
+    {
+      "epoch": 0.19756015962088316,
+      "grad_norm": 0.005362628027796745,
+      "learning_rate": 0.001,
+      "loss": 0.4041,
+      "step": 7160
+    },
+    {
+      "epoch": 0.19758775182194752,
+      "grad_norm": 0.0036659168545156717,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 7161
+    },
+    {
+      "epoch": 0.1976153440230119,
+      "grad_norm": 0.0036525116302073,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 7162
+    },
+    {
+      "epoch": 0.19764293622407628,
+      "grad_norm": 0.002731502987444401,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 7163
+    },
+    {
+      "epoch": 0.19767052842514063,
+      "grad_norm": 0.006773337721824646,
+      "learning_rate": 0.001,
+      "loss": 0.4441,
+      "step": 7164
+    },
+    {
+      "epoch": 0.197698120626205,
+      "grad_norm": 0.0025534844025969505,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 7165
+    },
+    {
+      "epoch": 0.19772571282726936,
+      "grad_norm": 0.003402253147214651,
+      "learning_rate": 0.001,
+      "loss": 0.3757,
+      "step": 7166
+    },
+    {
+      "epoch": 0.19775330502833374,
+      "grad_norm": 0.003048243233934045,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 7167
+    },
+    {
+      "epoch": 0.19778089722939812,
+      "grad_norm": 0.004743688274174929,
+      "learning_rate": 0.001,
+      "loss": 0.3636,
+      "step": 7168
+    },
+    {
+      "epoch": 0.19780848943046248,
+      "grad_norm": 0.0041982936672866344,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 7169
+    },
+    {
+      "epoch": 0.19783608163152686,
+      "grad_norm": 0.0023896710481494665,
+      "learning_rate": 0.001,
+      "loss": 0.4214,
+      "step": 7170
+    },
+    {
+      "epoch": 0.1978636738325912,
+      "grad_norm": 0.0028549651615321636,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 7171
+    },
+    {
+      "epoch": 0.1978912660336556,
+      "grad_norm": 0.002721426310017705,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 7172
+    },
+    {
+      "epoch": 0.19791885823471997,
+      "grad_norm": 0.0029002949595451355,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 7173
+    },
+    {
+      "epoch": 0.19794645043578432,
+      "grad_norm": 0.00278780166991055,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 7174
+    },
+    {
+      "epoch": 0.1979740426368487,
+      "grad_norm": 0.0019508522236719728,
+      "learning_rate": 0.001,
+      "loss": 0.3739,
+      "step": 7175
+    },
+    {
+      "epoch": 0.19800163483791305,
+      "grad_norm": 0.002591001568362117,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 7176
+    },
+    {
+      "epoch": 0.19802922703897743,
+      "grad_norm": 0.00309981987811625,
+      "learning_rate": 0.001,
+      "loss": 0.3692,
+      "step": 7177
+    },
+    {
+      "epoch": 0.19805681924004181,
+      "grad_norm": 0.003413395956158638,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 7178
+    },
+    {
+      "epoch": 0.19808441144110617,
+      "grad_norm": 0.0031371319200843573,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 7179
+    },
+    {
+      "epoch": 0.19811200364217055,
+      "grad_norm": 0.0040471418760716915,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 7180
+    },
+    {
+      "epoch": 0.1981395958432349,
+      "grad_norm": 0.002860469277948141,
+      "learning_rate": 0.001,
+      "loss": 0.4273,
+      "step": 7181
+    },
+    {
+      "epoch": 0.19816718804429928,
+      "grad_norm": 0.0035744935739785433,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 7182
+    },
+    {
+      "epoch": 0.19819478024536366,
+      "grad_norm": 0.0021571393590420485,
+      "learning_rate": 0.001,
+      "loss": 0.4033,
+      "step": 7183
+    },
+    {
+      "epoch": 0.198222372446428,
+      "grad_norm": 0.0027684883680194616,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 7184
+    },
+    {
+      "epoch": 0.1982499646474924,
+      "grad_norm": 0.004502817988395691,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 7185
+    },
+    {
+      "epoch": 0.19827755684855675,
+      "grad_norm": 0.0026800809428095818,
+      "learning_rate": 0.001,
+      "loss": 0.4417,
+      "step": 7186
+    },
+    {
+      "epoch": 0.19830514904962113,
+      "grad_norm": 0.004524301737546921,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 7187
+    },
+    {
+      "epoch": 0.1983327412506855,
+      "grad_norm": 0.002851566532626748,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 7188
+    },
+    {
+      "epoch": 0.19836033345174986,
+      "grad_norm": 0.004874248988926411,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 7189
+    },
+    {
+      "epoch": 0.19838792565281424,
+      "grad_norm": 0.0025999664794653654,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 7190
+    },
+    {
+      "epoch": 0.1984155178538786,
+      "grad_norm": 0.0053477040491998196,
+      "learning_rate": 0.001,
+      "loss": 0.4298,
+      "step": 7191
+    },
+    {
+      "epoch": 0.19844311005494297,
+      "grad_norm": 0.002580095548182726,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 7192
+    },
+    {
+      "epoch": 0.19847070225600735,
+      "grad_norm": 0.006665955297648907,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 7193
+    },
+    {
+      "epoch": 0.1984982944570717,
+      "grad_norm": 0.002255852334201336,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 7194
+    },
+    {
+      "epoch": 0.19852588665813609,
+      "grad_norm": 0.0027929407078772783,
+      "learning_rate": 0.001,
+      "loss": 0.3832,
+      "step": 7195
+    },
+    {
+      "epoch": 0.19855347885920044,
+      "grad_norm": 0.002101206686347723,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 7196
+    },
+    {
+      "epoch": 0.19858107106026482,
+      "grad_norm": 0.0027232288848608732,
+      "learning_rate": 0.001,
+      "loss": 0.435,
+      "step": 7197
+    },
+    {
+      "epoch": 0.1986086632613292,
+      "grad_norm": 0.007063394878059626,
+      "learning_rate": 0.001,
+      "loss": 0.385,
+      "step": 7198
+    },
+    {
+      "epoch": 0.19863625546239355,
+      "grad_norm": 0.004317095037549734,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 7199
+    },
+    {
+      "epoch": 0.19866384766345793,
+      "grad_norm": 0.0018634272273629904,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 7200
+    },
+    {
+      "epoch": 0.19869143986452228,
+      "grad_norm": 0.0029656426049768925,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 7201
+    },
+    {
+      "epoch": 0.19871903206558666,
+      "grad_norm": 0.0018411249620839953,
+      "learning_rate": 0.001,
+      "loss": 0.44,
+      "step": 7202
+    },
+    {
+      "epoch": 0.19874662426665102,
+      "grad_norm": 0.0027455666568130255,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 7203
+    },
+    {
+      "epoch": 0.1987742164677154,
+      "grad_norm": 0.003438533516600728,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 7204
+    },
+    {
+      "epoch": 0.19880180866877978,
+      "grad_norm": 0.0023013537283986807,
+      "learning_rate": 0.001,
+      "loss": 0.4364,
+      "step": 7205
+    },
+    {
+      "epoch": 0.19882940086984413,
+      "grad_norm": 0.003259762190282345,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 7206
+    },
+    {
+      "epoch": 0.1988569930709085,
+      "grad_norm": 0.002609924878925085,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 7207
+    },
+    {
+      "epoch": 0.19888458527197286,
+      "grad_norm": 0.002380332676693797,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 7208
+    },
+    {
+      "epoch": 0.19891217747303724,
+      "grad_norm": 0.0030413048807531595,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 7209
+    },
+    {
+      "epoch": 0.19893976967410162,
+      "grad_norm": 0.0022222718689590693,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 7210
+    },
+    {
+      "epoch": 0.19896736187516598,
+      "grad_norm": 0.0030871161725372076,
+      "learning_rate": 0.001,
+      "loss": 0.4214,
+      "step": 7211
+    },
+    {
+      "epoch": 0.19899495407623036,
+      "grad_norm": 0.005348892416805029,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 7212
+    },
+    {
+      "epoch": 0.1990225462772947,
+      "grad_norm": 0.005420180968940258,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 7213
+    },
+    {
+      "epoch": 0.1990501384783591,
+      "grad_norm": 0.003726621624082327,
+      "learning_rate": 0.001,
+      "loss": 0.3649,
+      "step": 7214
+    },
+    {
+      "epoch": 0.19907773067942347,
+      "grad_norm": 0.0031097896862775087,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 7215
+    },
+    {
+      "epoch": 0.19910532288048782,
+      "grad_norm": 0.0029584530275315046,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 7216
+    },
+    {
+      "epoch": 0.1991329150815522,
+      "grad_norm": 0.0022907305974513292,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 7217
+    },
+    {
+      "epoch": 0.19916050728261656,
+      "grad_norm": 0.0031007654033601284,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 7218
+    },
+    {
+      "epoch": 0.19918809948368094,
+      "grad_norm": 0.0021812678314745426,
+      "learning_rate": 0.001,
+      "loss": 0.4274,
+      "step": 7219
+    },
+    {
+      "epoch": 0.19921569168474532,
+      "grad_norm": 0.0023966538719832897,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 7220
+    },
+    {
+      "epoch": 0.19924328388580967,
+      "grad_norm": 0.0031528030522167683,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 7221
+    },
+    {
+      "epoch": 0.19927087608687405,
+      "grad_norm": 0.0022123432718217373,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 7222
+    },
+    {
+      "epoch": 0.1992984682879384,
+      "grad_norm": 0.0031831758096814156,
+      "learning_rate": 0.001,
+      "loss": 0.3719,
+      "step": 7223
+    },
+    {
+      "epoch": 0.19932606048900278,
+      "grad_norm": 0.002463659970089793,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 7224
+    },
+    {
+      "epoch": 0.19935365269006716,
+      "grad_norm": 0.0026505514979362488,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 7225
+    },
+    {
+      "epoch": 0.19938124489113151,
+      "grad_norm": 0.0024356083013117313,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 7226
+    },
+    {
+      "epoch": 0.1994088370921959,
+      "grad_norm": 0.009376121684908867,
+      "learning_rate": 0.001,
+      "loss": 0.3801,
+      "step": 7227
+    },
+    {
+      "epoch": 0.19943642929326025,
+      "grad_norm": 0.002203370677307248,
+      "learning_rate": 0.001,
+      "loss": 0.4149,
+      "step": 7228
+    },
+    {
+      "epoch": 0.19946402149432463,
+      "grad_norm": 0.0029552599880844355,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 7229
+    },
+    {
+      "epoch": 0.199491613695389,
+      "grad_norm": 0.0028194712940603495,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 7230
+    },
+    {
+      "epoch": 0.19951920589645336,
+      "grad_norm": 0.0036550972145050764,
+      "learning_rate": 0.001,
+      "loss": 0.4238,
+      "step": 7231
+    },
+    {
+      "epoch": 0.19954679809751774,
+      "grad_norm": 0.002419488737359643,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 7232
+    },
+    {
+      "epoch": 0.1995743902985821,
+      "grad_norm": 0.002697533695027232,
+      "learning_rate": 0.001,
+      "loss": 0.4295,
+      "step": 7233
+    },
+    {
+      "epoch": 0.19960198249964647,
+      "grad_norm": 0.0025205162819474936,
+      "learning_rate": 0.001,
+      "loss": 0.3649,
+      "step": 7234
+    },
+    {
+      "epoch": 0.19962957470071085,
+      "grad_norm": 0.002819119254127145,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 7235
+    },
+    {
+      "epoch": 0.1996571669017752,
+      "grad_norm": 0.002348159672692418,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 7236
+    },
+    {
+      "epoch": 0.1996847591028396,
+      "grad_norm": 0.002327647991478443,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 7237
+    },
+    {
+      "epoch": 0.19971235130390394,
+      "grad_norm": 0.003467888105660677,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 7238
+    },
+    {
+      "epoch": 0.19973994350496832,
+      "grad_norm": 0.002749866805970669,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 7239
+    },
+    {
+      "epoch": 0.1997675357060327,
+      "grad_norm": 0.002804868621751666,
+      "learning_rate": 0.001,
+      "loss": 0.4616,
+      "step": 7240
+    },
+    {
+      "epoch": 0.19979512790709705,
+      "grad_norm": 0.0025544483214616776,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 7241
+    },
+    {
+      "epoch": 0.19982272010816143,
+      "grad_norm": 0.004032348282635212,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 7242
+    },
+    {
+      "epoch": 0.19985031230922579,
+      "grad_norm": 0.0025702861603349447,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 7243
+    },
+    {
+      "epoch": 0.19987790451029017,
+      "grad_norm": 0.0029868450947105885,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 7244
+    },
+    {
+      "epoch": 0.19990549671135455,
+      "grad_norm": 0.0034900156315416098,
+      "learning_rate": 0.001,
+      "loss": 0.3749,
+      "step": 7245
+    },
+    {
+      "epoch": 0.1999330889124189,
+      "grad_norm": 0.004184171557426453,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 7246
+    },
+    {
+      "epoch": 0.19996068111348328,
+      "grad_norm": 0.003094634972512722,
+      "learning_rate": 0.001,
+      "loss": 0.3703,
+      "step": 7247
+    },
+    {
+      "epoch": 0.19998827331454763,
+      "grad_norm": 0.004218010231852531,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 7248
+    },
+    {
+      "epoch": 0.200015865515612,
+      "grad_norm": 0.002849703887477517,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 7249
+    },
+    {
+      "epoch": 0.2000434577166764,
+      "grad_norm": 0.002104248385876417,
+      "learning_rate": 0.001,
+      "loss": 0.4376,
+      "step": 7250
+    },
+    {
+      "epoch": 0.20007104991774075,
+      "grad_norm": 0.0028338278643786907,
+      "learning_rate": 0.001,
+      "loss": 0.4377,
+      "step": 7251
+    },
+    {
+      "epoch": 0.20009864211880513,
+      "grad_norm": 0.0070443847216665745,
+      "learning_rate": 0.001,
+      "loss": 0.3685,
+      "step": 7252
+    },
+    {
+      "epoch": 0.20012623431986948,
+      "grad_norm": 0.002888858551159501,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 7253
+    },
+    {
+      "epoch": 0.20015382652093386,
+      "grad_norm": 0.0023948305752128363,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 7254
+    },
+    {
+      "epoch": 0.20018141872199824,
+      "grad_norm": 0.0023272959515452385,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 7255
+    },
+    {
+      "epoch": 0.2002090109230626,
+      "grad_norm": 0.0038700769655406475,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 7256
+    },
+    {
+      "epoch": 0.20023660312412697,
+      "grad_norm": 0.002915510907769203,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 7257
+    },
+    {
+      "epoch": 0.20026419532519132,
+      "grad_norm": 0.0024133019614964724,
+      "learning_rate": 0.001,
+      "loss": 0.4259,
+      "step": 7258
+    },
+    {
+      "epoch": 0.2002917875262557,
+      "grad_norm": 0.002601010724902153,
+      "learning_rate": 0.001,
+      "loss": 0.4087,
+      "step": 7259
+    },
+    {
+      "epoch": 0.20031937972732008,
+      "grad_norm": 0.004203200805932283,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 7260
+    },
+    {
+      "epoch": 0.20034697192838444,
+      "grad_norm": 0.0025508387479931116,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 7261
+    },
+    {
+      "epoch": 0.20037456412944882,
+      "grad_norm": 0.0040724920108914375,
+      "learning_rate": 0.001,
+      "loss": 0.4209,
+      "step": 7262
+    },
+    {
+      "epoch": 0.20040215633051317,
+      "grad_norm": 0.003506725886836648,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 7263
+    },
+    {
+      "epoch": 0.20042974853157755,
+      "grad_norm": 0.003731567645445466,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 7264
+    },
+    {
+      "epoch": 0.20045734073264193,
+      "grad_norm": 0.0050995261408388615,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 7265
+    },
+    {
+      "epoch": 0.20048493293370628,
+      "grad_norm": 0.00341955223120749,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 7266
+    },
+    {
+      "epoch": 0.20051252513477066,
+      "grad_norm": 0.003466901136562228,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 7267
+    },
+    {
+      "epoch": 0.20054011733583502,
+      "grad_norm": 0.0026405456010252237,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 7268
+    },
+    {
+      "epoch": 0.2005677095368994,
+      "grad_norm": 0.002517679473385215,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 7269
+    },
+    {
+      "epoch": 0.20059530173796378,
+      "grad_norm": 0.002916789846494794,
+      "learning_rate": 0.001,
+      "loss": 0.4504,
+      "step": 7270
+    },
+    {
+      "epoch": 0.20062289393902813,
+      "grad_norm": 0.0029924395494163036,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 7271
+    },
+    {
+      "epoch": 0.2006504861400925,
+      "grad_norm": 0.004070238210260868,
+      "learning_rate": 0.001,
+      "loss": 0.4133,
+      "step": 7272
+    },
+    {
+      "epoch": 0.20067807834115686,
+      "grad_norm": 0.002408135449513793,
+      "learning_rate": 0.001,
+      "loss": 0.4481,
+      "step": 7273
+    },
+    {
+      "epoch": 0.20070567054222124,
+      "grad_norm": 0.005004410166293383,
+      "learning_rate": 0.001,
+      "loss": 0.4515,
+      "step": 7274
+    },
+    {
+      "epoch": 0.20073326274328562,
+      "grad_norm": 0.002634943462908268,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 7275
+    },
+    {
+      "epoch": 0.20076085494434998,
+      "grad_norm": 0.0034329029731452465,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 7276
+    },
+    {
+      "epoch": 0.20078844714541436,
+      "grad_norm": 0.003150471020489931,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 7277
+    },
+    {
+      "epoch": 0.2008160393464787,
+      "grad_norm": 0.0030349427834153175,
+      "learning_rate": 0.001,
+      "loss": 0.3582,
+      "step": 7278
+    },
+    {
+      "epoch": 0.2008436315475431,
+      "grad_norm": 0.0024701536167412996,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 7279
+    },
+    {
+      "epoch": 0.20087122374860747,
+      "grad_norm": 0.0022775549441576004,
+      "learning_rate": 0.001,
+      "loss": 0.4324,
+      "step": 7280
+    },
+    {
+      "epoch": 0.20089881594967182,
+      "grad_norm": 0.0037725751753896475,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 7281
+    },
+    {
+      "epoch": 0.2009264081507362,
+      "grad_norm": 0.0029782191850245,
+      "learning_rate": 0.001,
+      "loss": 0.4382,
+      "step": 7282
+    },
+    {
+      "epoch": 0.20095400035180055,
+      "grad_norm": 0.0028811590746045113,
+      "learning_rate": 0.001,
+      "loss": 0.3666,
+      "step": 7283
+    },
+    {
+      "epoch": 0.20098159255286493,
+      "grad_norm": 0.0027202339842915535,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 7284
+    },
+    {
+      "epoch": 0.20100918475392932,
+      "grad_norm": 0.0026378172915428877,
+      "learning_rate": 0.001,
+      "loss": 0.3846,
+      "step": 7285
+    },
+    {
+      "epoch": 0.20103677695499367,
+      "grad_norm": 0.0031549364794045687,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 7286
+    },
+    {
+      "epoch": 0.20106436915605805,
+      "grad_norm": 0.0036540657747536898,
+      "learning_rate": 0.001,
+      "loss": 0.3454,
+      "step": 7287
+    },
+    {
+      "epoch": 0.2010919613571224,
+      "grad_norm": 0.0027373498305678368,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 7288
+    },
+    {
+      "epoch": 0.20111955355818678,
+      "grad_norm": 0.0032555065117776394,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 7289
+    },
+    {
+      "epoch": 0.20114714575925116,
+      "grad_norm": 0.0048676906153559685,
+      "learning_rate": 0.001,
+      "loss": 0.3526,
+      "step": 7290
+    },
+    {
+      "epoch": 0.20117473796031551,
+      "grad_norm": 0.0023562663700431585,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 7291
+    },
+    {
+      "epoch": 0.2012023301613799,
+      "grad_norm": 0.00272022164426744,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 7292
+    },
+    {
+      "epoch": 0.20122992236244425,
+      "grad_norm": 0.002916688099503517,
+      "learning_rate": 0.001,
+      "loss": 0.3676,
+      "step": 7293
+    },
+    {
+      "epoch": 0.20125751456350863,
+      "grad_norm": 0.003989284858107567,
+      "learning_rate": 0.001,
+      "loss": 0.3717,
+      "step": 7294
+    },
+    {
+      "epoch": 0.20128510676457298,
+      "grad_norm": 0.0029025361873209476,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 7295
+    },
+    {
+      "epoch": 0.20131269896563736,
+      "grad_norm": 0.003632181789726019,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 7296
+    },
+    {
+      "epoch": 0.20134029116670174,
+      "grad_norm": 0.004145762883126736,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 7297
+    },
+    {
+      "epoch": 0.2013678833677661,
+      "grad_norm": 0.0024826403241604567,
+      "learning_rate": 0.001,
+      "loss": 0.4606,
+      "step": 7298
+    },
+    {
+      "epoch": 0.20139547556883047,
+      "grad_norm": 0.002547920448705554,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 7299
+    },
+    {
+      "epoch": 0.20142306776989483,
+      "grad_norm": 0.0030286472756415606,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 7300
+    },
+    {
+      "epoch": 0.2014506599709592,
+      "grad_norm": 0.008298000320792198,
+      "learning_rate": 0.001,
+      "loss": 0.3692,
+      "step": 7301
+    },
+    {
+      "epoch": 0.2014782521720236,
+      "grad_norm": 0.003152182325720787,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 7302
+    },
+    {
+      "epoch": 0.20150584437308794,
+      "grad_norm": 0.002809209283441305,
+      "learning_rate": 0.001,
+      "loss": 0.4033,
+      "step": 7303
+    },
+    {
+      "epoch": 0.20153343657415232,
+      "grad_norm": 0.004826799966394901,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 7304
+    },
+    {
+      "epoch": 0.20156102877521667,
+      "grad_norm": 0.0035292392130941153,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 7305
+    },
+    {
+      "epoch": 0.20158862097628105,
+      "grad_norm": 0.002209904370829463,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 7306
+    },
+    {
+      "epoch": 0.20161621317734543,
+      "grad_norm": 0.002997803036123514,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 7307
+    },
+    {
+      "epoch": 0.20164380537840979,
+      "grad_norm": 0.004406254272907972,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 7308
+    },
+    {
+      "epoch": 0.20167139757947417,
+      "grad_norm": 0.0043180338107049465,
+      "learning_rate": 0.001,
+      "loss": 0.4384,
+      "step": 7309
+    },
+    {
+      "epoch": 0.20169898978053852,
+      "grad_norm": 0.07319663465023041,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 7310
+    },
+    {
+      "epoch": 0.2017265819816029,
+      "grad_norm": 0.003474163357168436,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 7311
+    },
+    {
+      "epoch": 0.20175417418266728,
+      "grad_norm": 0.005750832613557577,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 7312
+    },
+    {
+      "epoch": 0.20178176638373163,
+      "grad_norm": 0.0026583108119666576,
+      "learning_rate": 0.001,
+      "loss": 0.4371,
+      "step": 7313
+    },
+    {
+      "epoch": 0.201809358584796,
+      "grad_norm": 0.004087739158421755,
+      "learning_rate": 0.001,
+      "loss": 0.4412,
+      "step": 7314
+    },
+    {
+      "epoch": 0.20183695078586036,
+      "grad_norm": 0.0027278910856693983,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 7315
+    },
+    {
+      "epoch": 0.20186454298692474,
+      "grad_norm": 0.002498122164979577,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 7316
+    },
+    {
+      "epoch": 0.20189213518798912,
+      "grad_norm": 0.004829999525099993,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 7317
+    },
+    {
+      "epoch": 0.20191972738905348,
+      "grad_norm": 0.0025493940338492393,
+      "learning_rate": 0.001,
+      "loss": 0.4411,
+      "step": 7318
+    },
+    {
+      "epoch": 0.20194731959011786,
+      "grad_norm": 0.0027906931936740875,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 7319
+    },
+    {
+      "epoch": 0.2019749117911822,
+      "grad_norm": 0.0034205715637654066,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 7320
+    },
+    {
+      "epoch": 0.2020025039922466,
+      "grad_norm": 0.0030028745532035828,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 7321
+    },
+    {
+      "epoch": 0.20203009619331097,
+      "grad_norm": 0.006040586158633232,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 7322
+    },
+    {
+      "epoch": 0.20205768839437532,
+      "grad_norm": 0.0025682656560093164,
+      "learning_rate": 0.001,
+      "loss": 0.4298,
+      "step": 7323
+    },
+    {
+      "epoch": 0.2020852805954397,
+      "grad_norm": 0.003085241885855794,
+      "learning_rate": 0.001,
+      "loss": 0.44,
+      "step": 7324
+    },
+    {
+      "epoch": 0.20211287279650406,
+      "grad_norm": 0.0035471986047923565,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 7325
+    },
+    {
+      "epoch": 0.20214046499756844,
+      "grad_norm": 0.0031859648879617453,
+      "learning_rate": 0.001,
+      "loss": 0.4442,
+      "step": 7326
+    },
+    {
+      "epoch": 0.20216805719863282,
+      "grad_norm": 0.003315818263217807,
+      "learning_rate": 0.001,
+      "loss": 0.4262,
+      "step": 7327
+    },
+    {
+      "epoch": 0.20219564939969717,
+      "grad_norm": 0.004785994999110699,
+      "learning_rate": 0.001,
+      "loss": 0.346,
+      "step": 7328
+    },
+    {
+      "epoch": 0.20222324160076155,
+      "grad_norm": 0.004568551201373339,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 7329
+    },
+    {
+      "epoch": 0.2022508338018259,
+      "grad_norm": 0.00264363712631166,
+      "learning_rate": 0.001,
+      "loss": 0.4275,
+      "step": 7330
+    },
+    {
+      "epoch": 0.20227842600289028,
+      "grad_norm": 0.003292328678071499,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 7331
+    },
+    {
+      "epoch": 0.20230601820395466,
+      "grad_norm": 0.004438953939825296,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 7332
+    },
+    {
+      "epoch": 0.20233361040501902,
+      "grad_norm": 0.0032826552633196115,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 7333
+    },
+    {
+      "epoch": 0.2023612026060834,
+      "grad_norm": 0.03612243011593819,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 7334
+    },
+    {
+      "epoch": 0.20238879480714775,
+      "grad_norm": 0.01141429878771305,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 7335
+    },
+    {
+      "epoch": 0.20241638700821213,
+      "grad_norm": 0.0034477144945412874,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 7336
+    },
+    {
+      "epoch": 0.2024439792092765,
+      "grad_norm": 0.007367158308625221,
+      "learning_rate": 0.001,
+      "loss": 0.37,
+      "step": 7337
+    },
+    {
+      "epoch": 0.20247157141034086,
+      "grad_norm": 0.0035362287890166044,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 7338
+    },
+    {
+      "epoch": 0.20249916361140524,
+      "grad_norm": 0.002065723529085517,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 7339
+    },
+    {
+      "epoch": 0.2025267558124696,
+      "grad_norm": 0.004578362684696913,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 7340
+    },
+    {
+      "epoch": 0.20255434801353397,
+      "grad_norm": 0.002685924293473363,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 7341
+    },
+    {
+      "epoch": 0.20258194021459835,
+      "grad_norm": 0.0038794514257460833,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 7342
+    },
+    {
+      "epoch": 0.2026095324156627,
+      "grad_norm": 0.002912055002525449,
+      "learning_rate": 0.001,
+      "loss": 0.4266,
+      "step": 7343
+    },
+    {
+      "epoch": 0.2026371246167271,
+      "grad_norm": 0.003003375604748726,
+      "learning_rate": 0.001,
+      "loss": 0.376,
+      "step": 7344
+    },
+    {
+      "epoch": 0.20266471681779144,
+      "grad_norm": 0.003948207478970289,
+      "learning_rate": 0.001,
+      "loss": 0.4227,
+      "step": 7345
+    },
+    {
+      "epoch": 0.20269230901885582,
+      "grad_norm": 0.0033744163811206818,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 7346
+    },
+    {
+      "epoch": 0.2027199012199202,
+      "grad_norm": 0.004838820081204176,
+      "learning_rate": 0.001,
+      "loss": 0.4642,
+      "step": 7347
+    },
+    {
+      "epoch": 0.20274749342098455,
+      "grad_norm": 0.0024067736230790615,
+      "learning_rate": 0.001,
+      "loss": 0.3415,
+      "step": 7348
+    },
+    {
+      "epoch": 0.20277508562204893,
+      "grad_norm": 0.0045340536162257195,
+      "learning_rate": 0.001,
+      "loss": 0.3561,
+      "step": 7349
+    },
+    {
+      "epoch": 0.2028026778231133,
+      "grad_norm": 0.003165213158354163,
+      "learning_rate": 0.001,
+      "loss": 0.372,
+      "step": 7350
+    },
+    {
+      "epoch": 0.20283027002417767,
+      "grad_norm": 0.004642155487090349,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 7351
+    },
+    {
+      "epoch": 0.20285786222524205,
+      "grad_norm": 0.005825064145028591,
+      "learning_rate": 0.001,
+      "loss": 0.4276,
+      "step": 7352
+    },
+    {
+      "epoch": 0.2028854544263064,
+      "grad_norm": 0.002787059871479869,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 7353
+    },
+    {
+      "epoch": 0.20291304662737078,
+      "grad_norm": 0.004571388475596905,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 7354
+    },
+    {
+      "epoch": 0.20294063882843513,
+      "grad_norm": 0.0032351436093449593,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 7355
+    },
+    {
+      "epoch": 0.2029682310294995,
+      "grad_norm": 0.0041768900118768215,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 7356
+    },
+    {
+      "epoch": 0.2029958232305639,
+      "grad_norm": 0.0028741308487951756,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 7357
+    },
+    {
+      "epoch": 0.20302341543162825,
+      "grad_norm": 0.00324125774204731,
+      "learning_rate": 0.001,
+      "loss": 0.3766,
+      "step": 7358
+    },
+    {
+      "epoch": 0.20305100763269263,
+      "grad_norm": 0.003516893368214369,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 7359
+    },
+    {
+      "epoch": 0.20307859983375698,
+      "grad_norm": 0.0020891185849905014,
+      "learning_rate": 0.001,
+      "loss": 0.422,
+      "step": 7360
+    },
+    {
+      "epoch": 0.20310619203482136,
+      "grad_norm": 0.0024756919592618942,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 7361
+    },
+    {
+      "epoch": 0.20313378423588574,
+      "grad_norm": 0.007347720675170422,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 7362
+    },
+    {
+      "epoch": 0.2031613764369501,
+      "grad_norm": 0.009618041105568409,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 7363
+    },
+    {
+      "epoch": 0.20318896863801447,
+      "grad_norm": 0.004037544596940279,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 7364
+    },
+    {
+      "epoch": 0.20321656083907882,
+      "grad_norm": 0.002381223253905773,
+      "learning_rate": 0.001,
+      "loss": 0.427,
+      "step": 7365
+    },
+    {
+      "epoch": 0.2032441530401432,
+      "grad_norm": 0.0031598431523889303,
+      "learning_rate": 0.001,
+      "loss": 0.4249,
+      "step": 7366
+    },
+    {
+      "epoch": 0.20327174524120759,
+      "grad_norm": 0.00809874664992094,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 7367
+    },
+    {
+      "epoch": 0.20329933744227194,
+      "grad_norm": 0.002093277871608734,
+      "learning_rate": 0.001,
+      "loss": 0.4407,
+      "step": 7368
+    },
+    {
+      "epoch": 0.20332692964333632,
+      "grad_norm": 0.002877402352169156,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 7369
+    },
+    {
+      "epoch": 0.20335452184440067,
+      "grad_norm": 0.004072779323905706,
+      "learning_rate": 0.001,
+      "loss": 0.3556,
+      "step": 7370
+    },
+    {
+      "epoch": 0.20338211404546505,
+      "grad_norm": 0.0036910090129822493,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 7371
+    },
+    {
+      "epoch": 0.20340970624652943,
+      "grad_norm": 0.002574736950919032,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 7372
+    },
+    {
+      "epoch": 0.20343729844759378,
+      "grad_norm": 0.0024004147853702307,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 7373
+    },
+    {
+      "epoch": 0.20346489064865816,
+      "grad_norm": 0.003890526946634054,
+      "learning_rate": 0.001,
+      "loss": 0.4319,
+      "step": 7374
+    },
+    {
+      "epoch": 0.20349248284972252,
+      "grad_norm": 0.0030879208352416754,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 7375
+    },
+    {
+      "epoch": 0.2035200750507869,
+      "grad_norm": 0.003786006011068821,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 7376
+    },
+    {
+      "epoch": 0.20354766725185128,
+      "grad_norm": 0.0021713408641517162,
+      "learning_rate": 0.001,
+      "loss": 0.3805,
+      "step": 7377
+    },
+    {
+      "epoch": 0.20357525945291563,
+      "grad_norm": 0.00433305511251092,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 7378
+    },
+    {
+      "epoch": 0.20360285165398,
+      "grad_norm": 0.0030764697585254908,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 7379
+    },
+    {
+      "epoch": 0.20363044385504436,
+      "grad_norm": 0.005284723825752735,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 7380
+    },
+    {
+      "epoch": 0.20365803605610874,
+      "grad_norm": 0.004528792109340429,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 7381
+    },
+    {
+      "epoch": 0.20368562825717312,
+      "grad_norm": 0.029749277979135513,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 7382
+    },
+    {
+      "epoch": 0.20371322045823748,
+      "grad_norm": 0.004927974659949541,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 7383
+    },
+    {
+      "epoch": 0.20374081265930186,
+      "grad_norm": 0.004339797887951136,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 7384
+    },
+    {
+      "epoch": 0.2037684048603662,
+      "grad_norm": 0.002485482720658183,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 7385
+    },
+    {
+      "epoch": 0.2037959970614306,
+      "grad_norm": 0.004456434864550829,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 7386
+    },
+    {
+      "epoch": 0.20382358926249497,
+      "grad_norm": 0.0022889727260917425,
+      "learning_rate": 0.001,
+      "loss": 0.4261,
+      "step": 7387
+    },
+    {
+      "epoch": 0.20385118146355932,
+      "grad_norm": 0.006221712101250887,
+      "learning_rate": 0.001,
+      "loss": 0.3579,
+      "step": 7388
+    },
+    {
+      "epoch": 0.2038787736646237,
+      "grad_norm": 0.003358406713232398,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 7389
+    },
+    {
+      "epoch": 0.20390636586568806,
+      "grad_norm": 0.0029171998612582684,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 7390
+    },
+    {
+      "epoch": 0.20393395806675244,
+      "grad_norm": 0.0030932044610381126,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 7391
+    },
+    {
+      "epoch": 0.2039615502678168,
+      "grad_norm": 0.009446857497096062,
+      "learning_rate": 0.001,
+      "loss": 0.411,
+      "step": 7392
+    },
+    {
+      "epoch": 0.20398914246888117,
+      "grad_norm": 0.003646930679678917,
+      "learning_rate": 0.001,
+      "loss": 0.3674,
+      "step": 7393
+    },
+    {
+      "epoch": 0.20401673466994555,
+      "grad_norm": 0.002314311685040593,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 7394
+    },
+    {
+      "epoch": 0.2040443268710099,
+      "grad_norm": 0.0032062383834272623,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 7395
+    },
+    {
+      "epoch": 0.20407191907207428,
+      "grad_norm": 0.005731469485908747,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 7396
+    },
+    {
+      "epoch": 0.20409951127313863,
+      "grad_norm": 0.002991395303979516,
+      "learning_rate": 0.001,
+      "loss": 0.4357,
+      "step": 7397
+    },
+    {
+      "epoch": 0.20412710347420301,
+      "grad_norm": 0.0036581414751708508,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 7398
+    },
+    {
+      "epoch": 0.2041546956752674,
+      "grad_norm": 0.00450851721689105,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 7399
+    },
+    {
+      "epoch": 0.20418228787633175,
+      "grad_norm": 0.003067772602662444,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 7400
+    },
+    {
+      "epoch": 0.20420988007739613,
+      "grad_norm": 0.0030468387994915247,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 7401
+    },
+    {
+      "epoch": 0.20423747227846048,
+      "grad_norm": 0.004612901713699102,
+      "learning_rate": 0.001,
+      "loss": 0.3639,
+      "step": 7402
+    },
+    {
+      "epoch": 0.20426506447952486,
+      "grad_norm": 0.0034134613815695047,
+      "learning_rate": 0.001,
+      "loss": 0.4229,
+      "step": 7403
+    },
+    {
+      "epoch": 0.20429265668058924,
+      "grad_norm": 0.009274942800402641,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 7404
+    },
+    {
+      "epoch": 0.2043202488816536,
+      "grad_norm": 0.0026235580444335938,
+      "learning_rate": 0.001,
+      "loss": 0.3719,
+      "step": 7405
+    },
+    {
+      "epoch": 0.20434784108271797,
+      "grad_norm": 0.006790754850953817,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 7406
+    },
+    {
+      "epoch": 0.20437543328378233,
+      "grad_norm": 0.0022785153705626726,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 7407
+    },
+    {
+      "epoch": 0.2044030254848467,
+      "grad_norm": 0.005909627769142389,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 7408
+    },
+    {
+      "epoch": 0.2044306176859111,
+      "grad_norm": 0.0032420544885098934,
+      "learning_rate": 0.001,
+      "loss": 0.3786,
+      "step": 7409
+    },
+    {
+      "epoch": 0.20445820988697544,
+      "grad_norm": 0.003111346159130335,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 7410
+    },
+    {
+      "epoch": 0.20448580208803982,
+      "grad_norm": 0.003899330273270607,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 7411
+    },
+    {
+      "epoch": 0.20451339428910417,
+      "grad_norm": 0.007644087076187134,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 7412
+    },
+    {
+      "epoch": 0.20454098649016855,
+      "grad_norm": 0.0028403971809893847,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 7413
+    },
+    {
+      "epoch": 0.20456857869123293,
+      "grad_norm": 0.00496717169880867,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 7414
+    },
+    {
+      "epoch": 0.20459617089229729,
+      "grad_norm": 0.003765889909118414,
+      "learning_rate": 0.001,
+      "loss": 0.4465,
+      "step": 7415
+    },
+    {
+      "epoch": 0.20462376309336167,
+      "grad_norm": 0.0031095000449568033,
+      "learning_rate": 0.001,
+      "loss": 0.3615,
+      "step": 7416
+    },
+    {
+      "epoch": 0.20465135529442602,
+      "grad_norm": 0.0027119989972561598,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 7417
+    },
+    {
+      "epoch": 0.2046789474954904,
+      "grad_norm": 0.0027941481675952673,
+      "learning_rate": 0.001,
+      "loss": 0.4315,
+      "step": 7418
+    },
+    {
+      "epoch": 0.20470653969655478,
+      "grad_norm": 0.002864205278456211,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 7419
+    },
+    {
+      "epoch": 0.20473413189761913,
+      "grad_norm": 0.0032913892064243555,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 7420
+    },
+    {
+      "epoch": 0.2047617240986835,
+      "grad_norm": 0.0031653158366680145,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 7421
+    },
+    {
+      "epoch": 0.20478931629974786,
+      "grad_norm": 0.002829028759151697,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 7422
+    },
+    {
+      "epoch": 0.20481690850081224,
+      "grad_norm": 0.0026797757018357515,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 7423
+    },
+    {
+      "epoch": 0.20484450070187663,
+      "grad_norm": 0.0033115644473582506,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 7424
+    },
+    {
+      "epoch": 0.20487209290294098,
+      "grad_norm": 0.0037570816930383444,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 7425
+    },
+    {
+      "epoch": 0.20489968510400536,
+      "grad_norm": 0.0022269757464528084,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 7426
+    },
+    {
+      "epoch": 0.2049272773050697,
+      "grad_norm": 0.003650048514828086,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 7427
+    },
+    {
+      "epoch": 0.2049548695061341,
+      "grad_norm": 0.003723046975210309,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 7428
+    },
+    {
+      "epoch": 0.20498246170719847,
+      "grad_norm": 0.0031359000131487846,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 7429
+    },
+    {
+      "epoch": 0.20501005390826282,
+      "grad_norm": 0.0026418371126055717,
+      "learning_rate": 0.001,
+      "loss": 0.4462,
+      "step": 7430
+    },
+    {
+      "epoch": 0.2050376461093272,
+      "grad_norm": 0.005113274324685335,
+      "learning_rate": 0.001,
+      "loss": 0.4356,
+      "step": 7431
+    },
+    {
+      "epoch": 0.20506523831039156,
+      "grad_norm": 0.0034433556720614433,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 7432
+    },
+    {
+      "epoch": 0.20509283051145594,
+      "grad_norm": 0.0039060013368725777,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 7433
+    },
+    {
+      "epoch": 0.20512042271252032,
+      "grad_norm": 0.0031034150160849094,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 7434
+    },
+    {
+      "epoch": 0.20514801491358467,
+      "grad_norm": 0.004134570714086294,
+      "learning_rate": 0.001,
+      "loss": 0.4333,
+      "step": 7435
+    },
+    {
+      "epoch": 0.20517560711464905,
+      "grad_norm": 0.002315821126103401,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 7436
+    },
+    {
+      "epoch": 0.2052031993157134,
+      "grad_norm": 0.004337753169238567,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 7437
+    },
+    {
+      "epoch": 0.20523079151677778,
+      "grad_norm": 0.014801721088588238,
+      "learning_rate": 0.001,
+      "loss": 0.3668,
+      "step": 7438
+    },
+    {
+      "epoch": 0.20525838371784216,
+      "grad_norm": 0.003172953613102436,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 7439
+    },
+    {
+      "epoch": 0.20528597591890652,
+      "grad_norm": 0.0031124092638492584,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 7440
+    },
+    {
+      "epoch": 0.2053135681199709,
+      "grad_norm": 0.004870596341788769,
+      "learning_rate": 0.001,
+      "loss": 0.3706,
+      "step": 7441
+    },
+    {
+      "epoch": 0.20534116032103525,
+      "grad_norm": 0.002806989708915353,
+      "learning_rate": 0.001,
+      "loss": 0.3714,
+      "step": 7442
+    },
+    {
+      "epoch": 0.20536875252209963,
+      "grad_norm": 0.002985073020681739,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 7443
+    },
+    {
+      "epoch": 0.205396344723164,
+      "grad_norm": 0.002774279797449708,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 7444
+    },
+    {
+      "epoch": 0.20542393692422836,
+      "grad_norm": 0.0028050506953150034,
+      "learning_rate": 0.001,
+      "loss": 0.3541,
+      "step": 7445
+    },
+    {
+      "epoch": 0.20545152912529274,
+      "grad_norm": 0.004381167236715555,
+      "learning_rate": 0.001,
+      "loss": 0.4285,
+      "step": 7446
+    },
+    {
+      "epoch": 0.2054791213263571,
+      "grad_norm": 0.003191079944372177,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 7447
+    },
+    {
+      "epoch": 0.20550671352742148,
+      "grad_norm": 0.0027379656676203012,
+      "learning_rate": 0.001,
+      "loss": 0.385,
+      "step": 7448
+    },
+    {
+      "epoch": 0.20553430572848586,
+      "grad_norm": 0.0034181999508291483,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 7449
+    },
+    {
+      "epoch": 0.2055618979295502,
+      "grad_norm": 0.003427153918892145,
+      "learning_rate": 0.001,
+      "loss": 0.3757,
+      "step": 7450
+    },
+    {
+      "epoch": 0.2055894901306146,
+      "grad_norm": 0.002515499945729971,
+      "learning_rate": 0.001,
+      "loss": 0.3551,
+      "step": 7451
+    },
+    {
+      "epoch": 0.20561708233167894,
+      "grad_norm": 0.002670654794201255,
+      "learning_rate": 0.001,
+      "loss": 0.4392,
+      "step": 7452
+    },
+    {
+      "epoch": 0.20564467453274332,
+      "grad_norm": 0.003271377645432949,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 7453
+    },
+    {
+      "epoch": 0.2056722667338077,
+      "grad_norm": 0.0034607460256665945,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 7454
+    },
+    {
+      "epoch": 0.20569985893487205,
+      "grad_norm": 0.002583264373242855,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 7455
+    },
+    {
+      "epoch": 0.20572745113593643,
+      "grad_norm": 0.003357719397172332,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 7456
+    },
+    {
+      "epoch": 0.2057550433370008,
+      "grad_norm": 0.005699231754988432,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 7457
+    },
+    {
+      "epoch": 0.20578263553806517,
+      "grad_norm": 0.00468082819133997,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 7458
+    },
+    {
+      "epoch": 0.20581022773912955,
+      "grad_norm": 0.002804639982059598,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 7459
+    },
+    {
+      "epoch": 0.2058378199401939,
+      "grad_norm": 0.0031567709520459175,
+      "learning_rate": 0.001,
+      "loss": 0.452,
+      "step": 7460
+    },
+    {
+      "epoch": 0.20586541214125828,
+      "grad_norm": 0.0023213198874145746,
+      "learning_rate": 0.001,
+      "loss": 0.4353,
+      "step": 7461
+    },
+    {
+      "epoch": 0.20589300434232263,
+      "grad_norm": 0.003251513699069619,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 7462
+    },
+    {
+      "epoch": 0.205920596543387,
+      "grad_norm": 0.0036114819813519716,
+      "learning_rate": 0.001,
+      "loss": 0.4622,
+      "step": 7463
+    },
+    {
+      "epoch": 0.2059481887444514,
+      "grad_norm": 0.0032585756853222847,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 7464
+    },
+    {
+      "epoch": 0.20597578094551575,
+      "grad_norm": 0.004090276546776295,
+      "learning_rate": 0.001,
+      "loss": 0.369,
+      "step": 7465
+    },
+    {
+      "epoch": 0.20600337314658013,
+      "grad_norm": 0.006538175046443939,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 7466
+    },
+    {
+      "epoch": 0.20603096534764448,
+      "grad_norm": 0.006117241457104683,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 7467
+    },
+    {
+      "epoch": 0.20605855754870886,
+      "grad_norm": 0.004128247033804655,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 7468
+    },
+    {
+      "epoch": 0.20608614974977324,
+      "grad_norm": 0.0023199093993753195,
+      "learning_rate": 0.001,
+      "loss": 0.3613,
+      "step": 7469
+    },
+    {
+      "epoch": 0.2061137419508376,
+      "grad_norm": 0.002994114300236106,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 7470
+    },
+    {
+      "epoch": 0.20614133415190197,
+      "grad_norm": 0.00449094595387578,
+      "learning_rate": 0.001,
+      "loss": 0.3779,
+      "step": 7471
+    },
+    {
+      "epoch": 0.20616892635296633,
+      "grad_norm": 0.002262924797832966,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 7472
+    },
+    {
+      "epoch": 0.2061965185540307,
+      "grad_norm": 0.0033367322757840157,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 7473
+    },
+    {
+      "epoch": 0.20622411075509509,
+      "grad_norm": 0.004621635656803846,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 7474
+    },
+    {
+      "epoch": 0.20625170295615944,
+      "grad_norm": 0.0024976639542728662,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 7475
+    },
+    {
+      "epoch": 0.20627929515722382,
+      "grad_norm": 0.0032991066109389067,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 7476
+    },
+    {
+      "epoch": 0.20630688735828817,
+      "grad_norm": 0.004945872817188501,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 7477
+    },
+    {
+      "epoch": 0.20633447955935255,
+      "grad_norm": 0.003072677878662944,
+      "learning_rate": 0.001,
+      "loss": 0.3846,
+      "step": 7478
+    },
+    {
+      "epoch": 0.20636207176041693,
+      "grad_norm": 0.004622003063559532,
+      "learning_rate": 0.001,
+      "loss": 0.4524,
+      "step": 7479
+    },
+    {
+      "epoch": 0.20638966396148128,
+      "grad_norm": 0.005966620985418558,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 7480
+    },
+    {
+      "epoch": 0.20641725616254566,
+      "grad_norm": 0.002969125984236598,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 7481
+    },
+    {
+      "epoch": 0.20644484836361002,
+      "grad_norm": 0.020619425922632217,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 7482
+    },
+    {
+      "epoch": 0.2064724405646744,
+      "grad_norm": 0.005815454758703709,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 7483
+    },
+    {
+      "epoch": 0.20650003276573875,
+      "grad_norm": 0.0029577158857136965,
+      "learning_rate": 0.001,
+      "loss": 0.4203,
+      "step": 7484
+    },
+    {
+      "epoch": 0.20652762496680313,
+      "grad_norm": 0.0039966655895113945,
+      "learning_rate": 0.001,
+      "loss": 0.3665,
+      "step": 7485
+    },
+    {
+      "epoch": 0.2065552171678675,
+      "grad_norm": 0.00554103497415781,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 7486
+    },
+    {
+      "epoch": 0.20658280936893186,
+      "grad_norm": 0.00304683530703187,
+      "learning_rate": 0.001,
+      "loss": 0.4444,
+      "step": 7487
+    },
+    {
+      "epoch": 0.20661040156999624,
+      "grad_norm": 0.007905455306172371,
+      "learning_rate": 0.001,
+      "loss": 0.357,
+      "step": 7488
+    },
+    {
+      "epoch": 0.2066379937710606,
+      "grad_norm": 0.0040861996822059155,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 7489
+    },
+    {
+      "epoch": 0.20666558597212498,
+      "grad_norm": 0.00287861586548388,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 7490
+    },
+    {
+      "epoch": 0.20669317817318936,
+      "grad_norm": 0.00307285669259727,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 7491
+    },
+    {
+      "epoch": 0.2067207703742537,
+      "grad_norm": 0.0033920174464583397,
+      "learning_rate": 0.001,
+      "loss": 0.3732,
+      "step": 7492
+    },
+    {
+      "epoch": 0.2067483625753181,
+      "grad_norm": 0.0028167800046503544,
+      "learning_rate": 0.001,
+      "loss": 0.4377,
+      "step": 7493
+    },
+    {
+      "epoch": 0.20677595477638244,
+      "grad_norm": 0.0022106487303972244,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 7494
+    },
+    {
+      "epoch": 0.20680354697744682,
+      "grad_norm": 0.002651102375239134,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 7495
+    },
+    {
+      "epoch": 0.2068311391785112,
+      "grad_norm": 0.003722158260643482,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 7496
+    },
+    {
+      "epoch": 0.20685873137957556,
+      "grad_norm": 0.004339053761214018,
+      "learning_rate": 0.001,
+      "loss": 0.4251,
+      "step": 7497
+    },
+    {
+      "epoch": 0.20688632358063994,
+      "grad_norm": 0.002610723953694105,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 7498
+    },
+    {
+      "epoch": 0.2069139157817043,
+      "grad_norm": 0.0024759217631071806,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 7499
+    },
+    {
+      "epoch": 0.20694150798276867,
+      "grad_norm": 0.005409894045442343,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 7500
+    },
+    {
+      "epoch": 0.20694150798276867,
+      "eval_runtime": 24.8623,
+      "eval_samples_per_second": 1.287,
+      "eval_steps_per_second": 0.161,
+      "step": 7500
+    },
+    {
+      "epoch": 0.20696910018383305,
+      "grad_norm": 0.0043741133995354176,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 7501
+    },
+    {
+      "epoch": 0.2069966923848974,
+      "grad_norm": 0.0034702117554843426,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 7502
+    },
+    {
+      "epoch": 0.20702428458596178,
+      "grad_norm": 0.002621249994263053,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 7503
+    },
+    {
+      "epoch": 0.20705187678702613,
+      "grad_norm": 0.00401890417560935,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 7504
+    },
+    {
+      "epoch": 0.20707946898809051,
+      "grad_norm": 0.005101599730551243,
+      "learning_rate": 0.001,
+      "loss": 0.4362,
+      "step": 7505
+    },
+    {
+      "epoch": 0.2071070611891549,
+      "grad_norm": 0.006792318541556597,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 7506
+    },
+    {
+      "epoch": 0.20713465339021925,
+      "grad_norm": 0.0032163949217647314,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 7507
+    },
+    {
+      "epoch": 0.20716224559128363,
+      "grad_norm": 0.006187311839312315,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 7508
+    },
+    {
+      "epoch": 0.20718983779234798,
+      "grad_norm": 0.003395832609385252,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 7509
+    },
+    {
+      "epoch": 0.20721742999341236,
+      "grad_norm": 0.007058777846395969,
+      "learning_rate": 0.001,
+      "loss": 0.3545,
+      "step": 7510
+    },
+    {
+      "epoch": 0.20724502219447674,
+      "grad_norm": 0.0022814422845840454,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 7511
+    },
+    {
+      "epoch": 0.2072726143955411,
+      "grad_norm": 0.003063853131607175,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 7512
+    },
+    {
+      "epoch": 0.20730020659660547,
+      "grad_norm": 0.003978427965193987,
+      "learning_rate": 0.001,
+      "loss": 0.3732,
+      "step": 7513
+    },
+    {
+      "epoch": 0.20732779879766983,
+      "grad_norm": 0.002310180803760886,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 7514
+    },
+    {
+      "epoch": 0.2073553909987342,
+      "grad_norm": 0.0026651090011000633,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 7515
+    },
+    {
+      "epoch": 0.2073829831997986,
+      "grad_norm": 0.0029477854259312153,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 7516
+    },
+    {
+      "epoch": 0.20741057540086294,
+      "grad_norm": 0.0022027394734323025,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 7517
+    },
+    {
+      "epoch": 0.20743816760192732,
+      "grad_norm": 0.0025718417018651962,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 7518
+    },
+    {
+      "epoch": 0.20746575980299167,
+      "grad_norm": 0.002921158680692315,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 7519
+    },
+    {
+      "epoch": 0.20749335200405605,
+      "grad_norm": 0.0037957970052957535,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 7520
+    },
+    {
+      "epoch": 0.20752094420512043,
+      "grad_norm": 0.003919035196304321,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 7521
+    },
+    {
+      "epoch": 0.20754853640618479,
+      "grad_norm": 0.0024592061527073383,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 7522
+    },
+    {
+      "epoch": 0.20757612860724917,
+      "grad_norm": 0.0034799345303326845,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 7523
+    },
+    {
+      "epoch": 0.20760372080831352,
+      "grad_norm": 0.00930013321340084,
+      "learning_rate": 0.001,
+      "loss": 0.3504,
+      "step": 7524
+    },
+    {
+      "epoch": 0.2076313130093779,
+      "grad_norm": 0.004809627775102854,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 7525
+    },
+    {
+      "epoch": 0.20765890521044228,
+      "grad_norm": 0.0031031654216349125,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 7526
+    },
+    {
+      "epoch": 0.20768649741150663,
+      "grad_norm": 0.0024205984082072973,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 7527
+    },
+    {
+      "epoch": 0.207714089612571,
+      "grad_norm": 0.003100008238106966,
+      "learning_rate": 0.001,
+      "loss": 0.4384,
+      "step": 7528
+    },
+    {
+      "epoch": 0.20774168181363536,
+      "grad_norm": 0.0023790469858795404,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 7529
+    },
+    {
+      "epoch": 0.20776927401469975,
+      "grad_norm": 0.0028267705347388983,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 7530
+    },
+    {
+      "epoch": 0.20779686621576413,
+      "grad_norm": 0.002445453545078635,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 7531
+    },
+    {
+      "epoch": 0.20782445841682848,
+      "grad_norm": 0.0038315069396048784,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 7532
+    },
+    {
+      "epoch": 0.20785205061789286,
+      "grad_norm": 0.002911431947723031,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 7533
+    },
+    {
+      "epoch": 0.2078796428189572,
+      "grad_norm": 0.024655209854245186,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 7534
+    },
+    {
+      "epoch": 0.2079072350200216,
+      "grad_norm": 0.0030850169714540243,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 7535
+    },
+    {
+      "epoch": 0.20793482722108597,
+      "grad_norm": 0.0029573985375463963,
+      "learning_rate": 0.001,
+      "loss": 0.4237,
+      "step": 7536
+    },
+    {
+      "epoch": 0.20796241942215032,
+      "grad_norm": 0.002838612301275134,
+      "learning_rate": 0.001,
+      "loss": 0.4398,
+      "step": 7537
+    },
+    {
+      "epoch": 0.2079900116232147,
+      "grad_norm": 0.0038796397857367992,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 7538
+    },
+    {
+      "epoch": 0.20801760382427906,
+      "grad_norm": 0.002909380476921797,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 7539
+    },
+    {
+      "epoch": 0.20804519602534344,
+      "grad_norm": 0.005611411761492491,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 7540
+    },
+    {
+      "epoch": 0.20807278822640782,
+      "grad_norm": 0.005777435377240181,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 7541
+    },
+    {
+      "epoch": 0.20810038042747217,
+      "grad_norm": 0.002417078008875251,
+      "learning_rate": 0.001,
+      "loss": 0.4334,
+      "step": 7542
+    },
+    {
+      "epoch": 0.20812797262853655,
+      "grad_norm": 0.0039400262758135796,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 7543
+    },
+    {
+      "epoch": 0.2081555648296009,
+      "grad_norm": 0.005697792861610651,
+      "learning_rate": 0.001,
+      "loss": 0.3795,
+      "step": 7544
+    },
+    {
+      "epoch": 0.20818315703066528,
+      "grad_norm": 0.0035569635219872,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 7545
+    },
+    {
+      "epoch": 0.20821074923172966,
+      "grad_norm": 0.0035249588545411825,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 7546
+    },
+    {
+      "epoch": 0.20823834143279402,
+      "grad_norm": 0.003973274026066065,
+      "learning_rate": 0.001,
+      "loss": 0.3732,
+      "step": 7547
+    },
+    {
+      "epoch": 0.2082659336338584,
+      "grad_norm": 0.010858539491891861,
+      "learning_rate": 0.001,
+      "loss": 0.3627,
+      "step": 7548
+    },
+    {
+      "epoch": 0.20829352583492275,
+      "grad_norm": 0.002799051348119974,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 7549
+    },
+    {
+      "epoch": 0.20832111803598713,
+      "grad_norm": 0.0032943717669695616,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 7550
+    },
+    {
+      "epoch": 0.2083487102370515,
+      "grad_norm": 0.003968504723161459,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 7551
+    },
+    {
+      "epoch": 0.20837630243811586,
+      "grad_norm": 0.0025952784344553947,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 7552
+    },
+    {
+      "epoch": 0.20840389463918024,
+      "grad_norm": 0.0023193256929516792,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 7553
+    },
+    {
+      "epoch": 0.2084314868402446,
+      "grad_norm": 0.020232345908880234,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 7554
+    },
+    {
+      "epoch": 0.20845907904130898,
+      "grad_norm": 0.005101238377392292,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 7555
+    },
+    {
+      "epoch": 0.20848667124237336,
+      "grad_norm": 0.0028565346729010344,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 7556
+    },
+    {
+      "epoch": 0.2085142634434377,
+      "grad_norm": 0.0023610927164554596,
+      "learning_rate": 0.001,
+      "loss": 0.4383,
+      "step": 7557
+    },
+    {
+      "epoch": 0.2085418556445021,
+      "grad_norm": 0.0025352907832711935,
+      "learning_rate": 0.001,
+      "loss": 0.4298,
+      "step": 7558
+    },
+    {
+      "epoch": 0.20856944784556644,
+      "grad_norm": 0.002582514425739646,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 7559
+    },
+    {
+      "epoch": 0.20859704004663082,
+      "grad_norm": 0.0051224916242063046,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 7560
+    },
+    {
+      "epoch": 0.2086246322476952,
+      "grad_norm": 0.004121377132833004,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 7561
+    },
+    {
+      "epoch": 0.20865222444875955,
+      "grad_norm": 0.0028835702687501907,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 7562
+    },
+    {
+      "epoch": 0.20867981664982393,
+      "grad_norm": 0.0035318653099238873,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 7563
+    },
+    {
+      "epoch": 0.2087074088508883,
+      "grad_norm": 0.003049545455724001,
+      "learning_rate": 0.001,
+      "loss": 0.3697,
+      "step": 7564
+    },
+    {
+      "epoch": 0.20873500105195267,
+      "grad_norm": 0.01318820845335722,
+      "learning_rate": 0.001,
+      "loss": 0.4478,
+      "step": 7565
+    },
+    {
+      "epoch": 0.20876259325301705,
+      "grad_norm": 0.0028140349313616753,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 7566
+    },
+    {
+      "epoch": 0.2087901854540814,
+      "grad_norm": 0.0023844155948609114,
+      "learning_rate": 0.001,
+      "loss": 0.4497,
+      "step": 7567
+    },
+    {
+      "epoch": 0.20881777765514578,
+      "grad_norm": 0.0021965601481497288,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 7568
+    },
+    {
+      "epoch": 0.20884536985621013,
+      "grad_norm": 0.003476344281807542,
+      "learning_rate": 0.001,
+      "loss": 0.4209,
+      "step": 7569
+    },
+    {
+      "epoch": 0.2088729620572745,
+      "grad_norm": 0.004417444113641977,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 7570
+    },
+    {
+      "epoch": 0.2089005542583389,
+      "grad_norm": 0.002342290710657835,
+      "learning_rate": 0.001,
+      "loss": 0.4322,
+      "step": 7571
+    },
+    {
+      "epoch": 0.20892814645940325,
+      "grad_norm": 0.002702725352719426,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 7572
+    },
+    {
+      "epoch": 0.20895573866046763,
+      "grad_norm": 0.003713875776156783,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 7573
+    },
+    {
+      "epoch": 0.20898333086153198,
+      "grad_norm": 0.002027127193287015,
+      "learning_rate": 0.001,
+      "loss": 0.4431,
+      "step": 7574
+    },
+    {
+      "epoch": 0.20901092306259636,
+      "grad_norm": 0.0038261881563812494,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 7575
+    },
+    {
+      "epoch": 0.2090385152636607,
+      "grad_norm": 0.002401645528152585,
+      "learning_rate": 0.001,
+      "loss": 0.4568,
+      "step": 7576
+    },
+    {
+      "epoch": 0.2090661074647251,
+      "grad_norm": 0.002996440976858139,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 7577
+    },
+    {
+      "epoch": 0.20909369966578947,
+      "grad_norm": 0.0030510788783431053,
+      "learning_rate": 0.001,
+      "loss": 0.3791,
+      "step": 7578
+    },
+    {
+      "epoch": 0.20912129186685383,
+      "grad_norm": 0.0031568347476422787,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 7579
+    },
+    {
+      "epoch": 0.2091488840679182,
+      "grad_norm": 0.0025101054925471544,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 7580
+    },
+    {
+      "epoch": 0.20917647626898256,
+      "grad_norm": 0.002685755491256714,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 7581
+    },
+    {
+      "epoch": 0.20920406847004694,
+      "grad_norm": 0.00574698718264699,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 7582
+    },
+    {
+      "epoch": 0.20923166067111132,
+      "grad_norm": 0.0023178094998002052,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 7583
+    },
+    {
+      "epoch": 0.20925925287217567,
+      "grad_norm": 0.0048984200693666935,
+      "learning_rate": 0.001,
+      "loss": 0.3566,
+      "step": 7584
+    },
+    {
+      "epoch": 0.20928684507324005,
+      "grad_norm": 0.0027060145512223244,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 7585
+    },
+    {
+      "epoch": 0.2093144372743044,
+      "grad_norm": 0.00470772897824645,
+      "learning_rate": 0.001,
+      "loss": 0.385,
+      "step": 7586
+    },
+    {
+      "epoch": 0.20934202947536878,
+      "grad_norm": 0.002918388694524765,
+      "learning_rate": 0.001,
+      "loss": 0.4374,
+      "step": 7587
+    },
+    {
+      "epoch": 0.20936962167643317,
+      "grad_norm": 0.002451063599437475,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 7588
+    },
+    {
+      "epoch": 0.20939721387749752,
+      "grad_norm": 0.0024748530704528093,
+      "learning_rate": 0.001,
+      "loss": 0.4419,
+      "step": 7589
+    },
+    {
+      "epoch": 0.2094248060785619,
+      "grad_norm": 0.0029519018717110157,
+      "learning_rate": 0.001,
+      "loss": 0.3743,
+      "step": 7590
+    },
+    {
+      "epoch": 0.20945239827962625,
+      "grad_norm": 0.004285396076738834,
+      "learning_rate": 0.001,
+      "loss": 0.3722,
+      "step": 7591
+    },
+    {
+      "epoch": 0.20947999048069063,
+      "grad_norm": 0.0032341107726097107,
+      "learning_rate": 0.001,
+      "loss": 0.3655,
+      "step": 7592
+    },
+    {
+      "epoch": 0.209507582681755,
+      "grad_norm": 0.0032684197649359703,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 7593
+    },
+    {
+      "epoch": 0.20953517488281936,
+      "grad_norm": 0.002590345684438944,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 7594
+    },
+    {
+      "epoch": 0.20956276708388374,
+      "grad_norm": 0.0075867660343647,
+      "learning_rate": 0.001,
+      "loss": 0.4297,
+      "step": 7595
+    },
+    {
+      "epoch": 0.2095903592849481,
+      "grad_norm": 0.011841630563139915,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 7596
+    },
+    {
+      "epoch": 0.20961795148601248,
+      "grad_norm": 0.002727978862822056,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 7597
+    },
+    {
+      "epoch": 0.20964554368707686,
+      "grad_norm": 0.0028050565160810947,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 7598
+    },
+    {
+      "epoch": 0.2096731358881412,
+      "grad_norm": 0.0022304770536720753,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 7599
+    },
+    {
+      "epoch": 0.2097007280892056,
+      "grad_norm": 0.0042723724618554115,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 7600
+    },
+    {
+      "epoch": 0.20972832029026994,
+      "grad_norm": 0.00373181514441967,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 7601
+    },
+    {
+      "epoch": 0.20975591249133432,
+      "grad_norm": 0.0032532603945583105,
+      "learning_rate": 0.001,
+      "loss": 0.4188,
+      "step": 7602
+    },
+    {
+      "epoch": 0.2097835046923987,
+      "grad_norm": 0.002218460664153099,
+      "learning_rate": 0.001,
+      "loss": 0.4221,
+      "step": 7603
+    },
+    {
+      "epoch": 0.20981109689346306,
+      "grad_norm": 0.00374056794680655,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 7604
+    },
+    {
+      "epoch": 0.20983868909452744,
+      "grad_norm": 0.0029005371034145355,
+      "learning_rate": 0.001,
+      "loss": 0.3753,
+      "step": 7605
+    },
+    {
+      "epoch": 0.2098662812955918,
+      "grad_norm": 0.0028765443712472916,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 7606
+    },
+    {
+      "epoch": 0.20989387349665617,
+      "grad_norm": 0.0032596492674201727,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 7607
+    },
+    {
+      "epoch": 0.20992146569772055,
+      "grad_norm": 0.0027232312131673098,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 7608
+    },
+    {
+      "epoch": 0.2099490578987849,
+      "grad_norm": 0.0047914572060108185,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 7609
+    },
+    {
+      "epoch": 0.20997665009984928,
+      "grad_norm": 0.002376759424805641,
+      "learning_rate": 0.001,
+      "loss": 0.4403,
+      "step": 7610
+    },
+    {
+      "epoch": 0.21000424230091363,
+      "grad_norm": 0.0035623747389763594,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 7611
+    },
+    {
+      "epoch": 0.21003183450197802,
+      "grad_norm": 0.003275655210018158,
+      "learning_rate": 0.001,
+      "loss": 0.4302,
+      "step": 7612
+    },
+    {
+      "epoch": 0.2100594267030424,
+      "grad_norm": 0.0037639643996953964,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 7613
+    },
+    {
+      "epoch": 0.21008701890410675,
+      "grad_norm": 0.010343240574002266,
+      "learning_rate": 0.001,
+      "loss": 0.3744,
+      "step": 7614
+    },
+    {
+      "epoch": 0.21011461110517113,
+      "grad_norm": 0.002769758924841881,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 7615
+    },
+    {
+      "epoch": 0.21014220330623548,
+      "grad_norm": 0.0020274778362363577,
+      "learning_rate": 0.001,
+      "loss": 0.4406,
+      "step": 7616
+    },
+    {
+      "epoch": 0.21016979550729986,
+      "grad_norm": 0.0034504812210798264,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 7617
+    },
+    {
+      "epoch": 0.21019738770836424,
+      "grad_norm": 0.0032671205699443817,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 7618
+    },
+    {
+      "epoch": 0.2102249799094286,
+      "grad_norm": 0.0033758427016437054,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 7619
+    },
+    {
+      "epoch": 0.21025257211049297,
+      "grad_norm": 0.005549068097025156,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 7620
+    },
+    {
+      "epoch": 0.21028016431155733,
+      "grad_norm": 0.0029898162465542555,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 7621
+    },
+    {
+      "epoch": 0.2103077565126217,
+      "grad_norm": 0.0020999349653720856,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 7622
+    },
+    {
+      "epoch": 0.2103353487136861,
+      "grad_norm": 0.0028715464286506176,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 7623
+    },
+    {
+      "epoch": 0.21036294091475044,
+      "grad_norm": 0.0032411713618785143,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 7624
+    },
+    {
+      "epoch": 0.21039053311581482,
+      "grad_norm": 0.006850194651633501,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 7625
+    },
+    {
+      "epoch": 0.21041812531687917,
+      "grad_norm": 0.0031761995051056147,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 7626
+    },
+    {
+      "epoch": 0.21044571751794355,
+      "grad_norm": 0.0024642094504088163,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 7627
+    },
+    {
+      "epoch": 0.21047330971900793,
+      "grad_norm": 0.0035544894635677338,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 7628
+    },
+    {
+      "epoch": 0.2105009019200723,
+      "grad_norm": 0.009451251477003098,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 7629
+    },
+    {
+      "epoch": 0.21052849412113667,
+      "grad_norm": 0.002780449576675892,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 7630
+    },
+    {
+      "epoch": 0.21055608632220102,
+      "grad_norm": 0.0032124193385243416,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 7631
+    },
+    {
+      "epoch": 0.2105836785232654,
+      "grad_norm": 0.004741044715046883,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 7632
+    },
+    {
+      "epoch": 0.21061127072432978,
+      "grad_norm": 0.005938271526247263,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 7633
+    },
+    {
+      "epoch": 0.21063886292539413,
+      "grad_norm": 0.003253635484725237,
+      "learning_rate": 0.001,
+      "loss": 0.362,
+      "step": 7634
+    },
+    {
+      "epoch": 0.2106664551264585,
+      "grad_norm": 0.0031698490492999554,
+      "learning_rate": 0.001,
+      "loss": 0.4288,
+      "step": 7635
+    },
+    {
+      "epoch": 0.21069404732752287,
+      "grad_norm": 0.0025998263154178858,
+      "learning_rate": 0.001,
+      "loss": 0.3606,
+      "step": 7636
+    },
+    {
+      "epoch": 0.21072163952858725,
+      "grad_norm": 0.0023143659345805645,
+      "learning_rate": 0.001,
+      "loss": 0.4475,
+      "step": 7637
+    },
+    {
+      "epoch": 0.21074923172965163,
+      "grad_norm": 0.0022685672156512737,
+      "learning_rate": 0.001,
+      "loss": 0.3564,
+      "step": 7638
+    },
+    {
+      "epoch": 0.21077682393071598,
+      "grad_norm": 0.003365386975929141,
+      "learning_rate": 0.001,
+      "loss": 0.4087,
+      "step": 7639
+    },
+    {
+      "epoch": 0.21080441613178036,
+      "grad_norm": 0.0031616310589015484,
+      "learning_rate": 0.001,
+      "loss": 0.3569,
+      "step": 7640
+    },
+    {
+      "epoch": 0.2108320083328447,
+      "grad_norm": 0.0061267949640750885,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 7641
+    },
+    {
+      "epoch": 0.2108596005339091,
+      "grad_norm": 0.0033039050176739693,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 7642
+    },
+    {
+      "epoch": 0.21088719273497347,
+      "grad_norm": 0.002562372013926506,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 7643
+    },
+    {
+      "epoch": 0.21091478493603782,
+      "grad_norm": 0.0026763584464788437,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 7644
+    },
+    {
+      "epoch": 0.2109423771371022,
+      "grad_norm": 0.0032096514478325844,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 7645
+    },
+    {
+      "epoch": 0.21096996933816656,
+      "grad_norm": 0.002220802940428257,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 7646
+    },
+    {
+      "epoch": 0.21099756153923094,
+      "grad_norm": 0.0039492035284638405,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 7647
+    },
+    {
+      "epoch": 0.21102515374029532,
+      "grad_norm": 0.0027708462439477444,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 7648
+    },
+    {
+      "epoch": 0.21105274594135967,
+      "grad_norm": 0.002891121432185173,
+      "learning_rate": 0.001,
+      "loss": 0.4248,
+      "step": 7649
+    },
+    {
+      "epoch": 0.21108033814242405,
+      "grad_norm": 0.008183618076145649,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 7650
+    },
+    {
+      "epoch": 0.2111079303434884,
+      "grad_norm": 0.0033024440053850412,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 7651
+    },
+    {
+      "epoch": 0.21113552254455278,
+      "grad_norm": 0.0020405303221195936,
+      "learning_rate": 0.001,
+      "loss": 0.421,
+      "step": 7652
+    },
+    {
+      "epoch": 0.21116311474561716,
+      "grad_norm": 0.004159392323344946,
+      "learning_rate": 0.001,
+      "loss": 0.3637,
+      "step": 7653
+    },
+    {
+      "epoch": 0.21119070694668152,
+      "grad_norm": 0.004359352868050337,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 7654
+    },
+    {
+      "epoch": 0.2112182991477459,
+      "grad_norm": 0.003729519434273243,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 7655
+    },
+    {
+      "epoch": 0.21124589134881025,
+      "grad_norm": 0.0033560399897396564,
+      "learning_rate": 0.001,
+      "loss": 0.3683,
+      "step": 7656
+    },
+    {
+      "epoch": 0.21127348354987463,
+      "grad_norm": 0.008382863365113735,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 7657
+    },
+    {
+      "epoch": 0.211301075750939,
+      "grad_norm": 0.002670932561159134,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 7658
+    },
+    {
+      "epoch": 0.21132866795200336,
+      "grad_norm": 0.0026596023235470057,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 7659
+    },
+    {
+      "epoch": 0.21135626015306774,
+      "grad_norm": 0.0030948955100029707,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 7660
+    },
+    {
+      "epoch": 0.2113838523541321,
+      "grad_norm": 0.005249246954917908,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 7661
+    },
+    {
+      "epoch": 0.21141144455519648,
+      "grad_norm": 0.0028843062464147806,
+      "learning_rate": 0.001,
+      "loss": 0.3791,
+      "step": 7662
+    },
+    {
+      "epoch": 0.21143903675626086,
+      "grad_norm": 0.0027450129855424166,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 7663
+    },
+    {
+      "epoch": 0.2114666289573252,
+      "grad_norm": 0.01026302482932806,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 7664
+    },
+    {
+      "epoch": 0.2114942211583896,
+      "grad_norm": 0.0032274313271045685,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 7665
+    },
+    {
+      "epoch": 0.21152181335945394,
+      "grad_norm": 0.005412998143583536,
+      "learning_rate": 0.001,
+      "loss": 0.4233,
+      "step": 7666
+    },
+    {
+      "epoch": 0.21154940556051832,
+      "grad_norm": 0.00447877449914813,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 7667
+    },
+    {
+      "epoch": 0.2115769977615827,
+      "grad_norm": 0.0035270792432129383,
+      "learning_rate": 0.001,
+      "loss": 0.3559,
+      "step": 7668
+    },
+    {
+      "epoch": 0.21160458996264706,
+      "grad_norm": 0.009330617263913155,
+      "learning_rate": 0.001,
+      "loss": 0.4233,
+      "step": 7669
+    },
+    {
+      "epoch": 0.21163218216371144,
+      "grad_norm": 0.003103942610323429,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 7670
+    },
+    {
+      "epoch": 0.2116597743647758,
+      "grad_norm": 0.0024018525145947933,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 7671
+    },
+    {
+      "epoch": 0.21168736656584017,
+      "grad_norm": 0.0029490841552615166,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 7672
+    },
+    {
+      "epoch": 0.21171495876690452,
+      "grad_norm": 0.003537841374054551,
+      "learning_rate": 0.001,
+      "loss": 0.4237,
+      "step": 7673
+    },
+    {
+      "epoch": 0.2117425509679689,
+      "grad_norm": 0.007800383027642965,
+      "learning_rate": 0.001,
+      "loss": 0.4145,
+      "step": 7674
+    },
+    {
+      "epoch": 0.21177014316903328,
+      "grad_norm": 0.0025011899415403605,
+      "learning_rate": 0.001,
+      "loss": 0.4257,
+      "step": 7675
+    },
+    {
+      "epoch": 0.21179773537009763,
+      "grad_norm": 0.004059432540088892,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 7676
+    },
+    {
+      "epoch": 0.21182532757116201,
+      "grad_norm": 0.0030776297207921743,
+      "learning_rate": 0.001,
+      "loss": 0.3552,
+      "step": 7677
+    },
+    {
+      "epoch": 0.21185291977222637,
+      "grad_norm": 0.004162721801549196,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 7678
+    },
+    {
+      "epoch": 0.21188051197329075,
+      "grad_norm": 0.003027817001566291,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 7679
+    },
+    {
+      "epoch": 0.21190810417435513,
+      "grad_norm": 0.004329508636146784,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 7680
+    },
+    {
+      "epoch": 0.21193569637541948,
+      "grad_norm": 0.003321508876979351,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 7681
+    },
+    {
+      "epoch": 0.21196328857648386,
+      "grad_norm": 0.003910355735570192,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 7682
+    },
+    {
+      "epoch": 0.2119908807775482,
+      "grad_norm": 0.002138720592483878,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 7683
+    },
+    {
+      "epoch": 0.2120184729786126,
+      "grad_norm": 0.003873582696542144,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 7684
+    },
+    {
+      "epoch": 0.21204606517967697,
+      "grad_norm": 0.008897165767848492,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 7685
+    },
+    {
+      "epoch": 0.21207365738074133,
+      "grad_norm": 0.009049713611602783,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 7686
+    },
+    {
+      "epoch": 0.2121012495818057,
+      "grad_norm": 0.00294255162589252,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 7687
+    },
+    {
+      "epoch": 0.21212884178287006,
+      "grad_norm": 0.003320804564282298,
+      "learning_rate": 0.001,
+      "loss": 0.3711,
+      "step": 7688
+    },
+    {
+      "epoch": 0.21215643398393444,
+      "grad_norm": 0.006188528146594763,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 7689
+    },
+    {
+      "epoch": 0.21218402618499882,
+      "grad_norm": 0.002375251380726695,
+      "learning_rate": 0.001,
+      "loss": 0.4214,
+      "step": 7690
+    },
+    {
+      "epoch": 0.21221161838606317,
+      "grad_norm": 0.00397599907591939,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 7691
+    },
+    {
+      "epoch": 0.21223921058712755,
+      "grad_norm": 0.002846440998837352,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 7692
+    },
+    {
+      "epoch": 0.2122668027881919,
+      "grad_norm": 0.005194092635065317,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 7693
+    },
+    {
+      "epoch": 0.21229439498925629,
+      "grad_norm": 0.004019029904156923,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 7694
+    },
+    {
+      "epoch": 0.21232198719032067,
+      "grad_norm": 0.0032543425913900137,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 7695
+    },
+    {
+      "epoch": 0.21234957939138502,
+      "grad_norm": 0.004645978100597858,
+      "learning_rate": 0.001,
+      "loss": 0.4443,
+      "step": 7696
+    },
+    {
+      "epoch": 0.2123771715924494,
+      "grad_norm": 0.003786055836826563,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 7697
+    },
+    {
+      "epoch": 0.21240476379351375,
+      "grad_norm": 0.005406087264418602,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 7698
+    },
+    {
+      "epoch": 0.21243235599457813,
+      "grad_norm": 0.005549001973122358,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 7699
+    },
+    {
+      "epoch": 0.2124599481956425,
+      "grad_norm": 0.002728297607973218,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 7700
+    },
+    {
+      "epoch": 0.21248754039670686,
+      "grad_norm": 0.005214143078774214,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 7701
+    },
+    {
+      "epoch": 0.21251513259777124,
+      "grad_norm": 0.0025736193638294935,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 7702
+    },
+    {
+      "epoch": 0.2125427247988356,
+      "grad_norm": 0.00296932109631598,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 7703
+    },
+    {
+      "epoch": 0.21257031699989998,
+      "grad_norm": 0.003445189446210861,
+      "learning_rate": 0.001,
+      "loss": 0.3584,
+      "step": 7704
+    },
+    {
+      "epoch": 0.21259790920096436,
+      "grad_norm": 0.0035377286840230227,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 7705
+    },
+    {
+      "epoch": 0.2126255014020287,
+      "grad_norm": 0.0029369608964771032,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 7706
+    },
+    {
+      "epoch": 0.2126530936030931,
+      "grad_norm": 0.0030892151407897472,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 7707
+    },
+    {
+      "epoch": 0.21268068580415744,
+      "grad_norm": 0.004821309354156256,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 7708
+    },
+    {
+      "epoch": 0.21270827800522182,
+      "grad_norm": 0.003210441442206502,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 7709
+    },
+    {
+      "epoch": 0.2127358702062862,
+      "grad_norm": 0.0030719011556357145,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 7710
+    },
+    {
+      "epoch": 0.21276346240735056,
+      "grad_norm": 0.0028634623158723116,
+      "learning_rate": 0.001,
+      "loss": 0.3503,
+      "step": 7711
+    },
+    {
+      "epoch": 0.21279105460841494,
+      "grad_norm": 0.004046322777867317,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 7712
+    },
+    {
+      "epoch": 0.2128186468094793,
+      "grad_norm": 0.00294592441059649,
+      "learning_rate": 0.001,
+      "loss": 0.3417,
+      "step": 7713
+    },
+    {
+      "epoch": 0.21284623901054367,
+      "grad_norm": 0.0021102267783135176,
+      "learning_rate": 0.001,
+      "loss": 0.4369,
+      "step": 7714
+    },
+    {
+      "epoch": 0.21287383121160805,
+      "grad_norm": 0.0028041282203048468,
+      "learning_rate": 0.001,
+      "loss": 0.3816,
+      "step": 7715
+    },
+    {
+      "epoch": 0.2129014234126724,
+      "grad_norm": 0.0034821259323507547,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 7716
+    },
+    {
+      "epoch": 0.21292901561373678,
+      "grad_norm": 0.003997510299086571,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 7717
+    },
+    {
+      "epoch": 0.21295660781480114,
+      "grad_norm": 0.00297334766946733,
+      "learning_rate": 0.001,
+      "loss": 0.4262,
+      "step": 7718
+    },
+    {
+      "epoch": 0.21298420001586552,
+      "grad_norm": 0.0029889491852372885,
+      "learning_rate": 0.001,
+      "loss": 0.3567,
+      "step": 7719
+    },
+    {
+      "epoch": 0.2130117922169299,
+      "grad_norm": 0.005800141021609306,
+      "learning_rate": 0.001,
+      "loss": 0.3693,
+      "step": 7720
+    },
+    {
+      "epoch": 0.21303938441799425,
+      "grad_norm": 0.004063557833433151,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 7721
+    },
+    {
+      "epoch": 0.21306697661905863,
+      "grad_norm": 0.0052527179941535,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 7722
+    },
+    {
+      "epoch": 0.21309456882012298,
+      "grad_norm": 0.004150967579334974,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 7723
+    },
+    {
+      "epoch": 0.21312216102118736,
+      "grad_norm": 0.003142510773614049,
+      "learning_rate": 0.001,
+      "loss": 0.4367,
+      "step": 7724
+    },
+    {
+      "epoch": 0.21314975322225174,
+      "grad_norm": 0.0024571139365434647,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 7725
+    },
+    {
+      "epoch": 0.2131773454233161,
+      "grad_norm": 0.007491269148886204,
+      "learning_rate": 0.001,
+      "loss": 0.4254,
+      "step": 7726
+    },
+    {
+      "epoch": 0.21320493762438048,
+      "grad_norm": 0.003059527836740017,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 7727
+    },
+    {
+      "epoch": 0.21323252982544483,
+      "grad_norm": 0.0030870058108121157,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 7728
+    },
+    {
+      "epoch": 0.2132601220265092,
+      "grad_norm": 0.002307609189301729,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 7729
+    },
+    {
+      "epoch": 0.2132877142275736,
+      "grad_norm": 0.004910447634756565,
+      "learning_rate": 0.001,
+      "loss": 0.3676,
+      "step": 7730
+    },
+    {
+      "epoch": 0.21331530642863794,
+      "grad_norm": 0.0032054877374321222,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 7731
+    },
+    {
+      "epoch": 0.21334289862970232,
+      "grad_norm": 0.004739253781735897,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 7732
+    },
+    {
+      "epoch": 0.21337049083076667,
+      "grad_norm": 0.0027026189491152763,
+      "learning_rate": 0.001,
+      "loss": 0.3723,
+      "step": 7733
+    },
+    {
+      "epoch": 0.21339808303183105,
+      "grad_norm": 0.0031935579609125853,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 7734
+    },
+    {
+      "epoch": 0.21342567523289543,
+      "grad_norm": 0.0026129090692847967,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 7735
+    },
+    {
+      "epoch": 0.2134532674339598,
+      "grad_norm": 0.0030374531634151936,
+      "learning_rate": 0.001,
+      "loss": 0.3681,
+      "step": 7736
+    },
+    {
+      "epoch": 0.21348085963502417,
+      "grad_norm": 0.0035555511713027954,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 7737
+    },
+    {
+      "epoch": 0.21350845183608852,
+      "grad_norm": 0.005884343292564154,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 7738
+    },
+    {
+      "epoch": 0.2135360440371529,
+      "grad_norm": 0.0030709283892065287,
+      "learning_rate": 0.001,
+      "loss": 0.376,
+      "step": 7739
+    },
+    {
+      "epoch": 0.21356363623821728,
+      "grad_norm": 0.0027664625085890293,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 7740
+    },
+    {
+      "epoch": 0.21359122843928163,
+      "grad_norm": 0.005600049160420895,
+      "learning_rate": 0.001,
+      "loss": 0.3438,
+      "step": 7741
+    },
+    {
+      "epoch": 0.213618820640346,
+      "grad_norm": 0.00319514493457973,
+      "learning_rate": 0.001,
+      "loss": 0.3608,
+      "step": 7742
+    },
+    {
+      "epoch": 0.21364641284141037,
+      "grad_norm": 0.0033522641751915216,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 7743
+    },
+    {
+      "epoch": 0.21367400504247475,
+      "grad_norm": 0.0030811361502856016,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 7744
+    },
+    {
+      "epoch": 0.21370159724353913,
+      "grad_norm": 0.0026056889910250902,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 7745
+    },
+    {
+      "epoch": 0.21372918944460348,
+      "grad_norm": 0.0046222517266869545,
+      "learning_rate": 0.001,
+      "loss": 0.3717,
+      "step": 7746
+    },
+    {
+      "epoch": 0.21375678164566786,
+      "grad_norm": 0.005610965192317963,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 7747
+    },
+    {
+      "epoch": 0.2137843738467322,
+      "grad_norm": 0.004591417033225298,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 7748
+    },
+    {
+      "epoch": 0.2138119660477966,
+      "grad_norm": 0.0033903804142028093,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 7749
+    },
+    {
+      "epoch": 0.21383955824886097,
+      "grad_norm": 0.0037878809962421656,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 7750
+    },
+    {
+      "epoch": 0.21386715044992533,
+      "grad_norm": 0.006682871840894222,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 7751
+    },
+    {
+      "epoch": 0.2138947426509897,
+      "grad_norm": 0.0054921298287808895,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 7752
+    },
+    {
+      "epoch": 0.21392233485205406,
+      "grad_norm": 0.002533350605517626,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 7753
+    },
+    {
+      "epoch": 0.21394992705311844,
+      "grad_norm": 0.004424331709742546,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 7754
+    },
+    {
+      "epoch": 0.21397751925418282,
+      "grad_norm": 0.003484759945422411,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 7755
+    },
+    {
+      "epoch": 0.21400511145524717,
+      "grad_norm": 0.0028570671565830708,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 7756
+    },
+    {
+      "epoch": 0.21403270365631155,
+      "grad_norm": 0.0032984951976686716,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 7757
+    },
+    {
+      "epoch": 0.2140602958573759,
+      "grad_norm": 0.002993236295878887,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 7758
+    },
+    {
+      "epoch": 0.21408788805844028,
+      "grad_norm": 0.0039431145414710045,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 7759
+    },
+    {
+      "epoch": 0.21411548025950466,
+      "grad_norm": 0.0023782916832715273,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 7760
+    },
+    {
+      "epoch": 0.21414307246056902,
+      "grad_norm": 0.005132326390594244,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 7761
+    },
+    {
+      "epoch": 0.2141706646616334,
+      "grad_norm": 0.004260215442627668,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 7762
+    },
+    {
+      "epoch": 0.21419825686269775,
+      "grad_norm": 0.00576009601354599,
+      "learning_rate": 0.001,
+      "loss": 0.431,
+      "step": 7763
+    },
+    {
+      "epoch": 0.21422584906376213,
+      "grad_norm": 0.002464310498908162,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 7764
+    },
+    {
+      "epoch": 0.21425344126482648,
+      "grad_norm": 0.0022607767023146152,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 7765
+    },
+    {
+      "epoch": 0.21428103346589086,
+      "grad_norm": 0.002766657853499055,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 7766
+    },
+    {
+      "epoch": 0.21430862566695524,
+      "grad_norm": 0.002656952477991581,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 7767
+    },
+    {
+      "epoch": 0.2143362178680196,
+      "grad_norm": 0.005709274671971798,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 7768
+    },
+    {
+      "epoch": 0.21436381006908398,
+      "grad_norm": 0.0032649333588778973,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 7769
+    },
+    {
+      "epoch": 0.21439140227014833,
+      "grad_norm": 0.0026809549890458584,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 7770
+    },
+    {
+      "epoch": 0.2144189944712127,
+      "grad_norm": 0.010057074949145317,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 7771
+    },
+    {
+      "epoch": 0.2144465866722771,
+      "grad_norm": 0.003958344459533691,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 7772
+    },
+    {
+      "epoch": 0.21447417887334144,
+      "grad_norm": 0.004084006417542696,
+      "learning_rate": 0.001,
+      "loss": 0.4316,
+      "step": 7773
+    },
+    {
+      "epoch": 0.21450177107440582,
+      "grad_norm": 0.0051500373519957066,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 7774
+    },
+    {
+      "epoch": 0.21452936327547018,
+      "grad_norm": 0.003794713644310832,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 7775
+    },
+    {
+      "epoch": 0.21455695547653456,
+      "grad_norm": 0.02025657705962658,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 7776
+    },
+    {
+      "epoch": 0.21458454767759894,
+      "grad_norm": 0.0035089454613626003,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 7777
+    },
+    {
+      "epoch": 0.2146121398786633,
+      "grad_norm": 0.0029882427770644426,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 7778
+    },
+    {
+      "epoch": 0.21463973207972767,
+      "grad_norm": 0.0025343652814626694,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 7779
+    },
+    {
+      "epoch": 0.21466732428079202,
+      "grad_norm": 0.0028338837437331676,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 7780
+    },
+    {
+      "epoch": 0.2146949164818564,
+      "grad_norm": 0.0033451595809310675,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 7781
+    },
+    {
+      "epoch": 0.21472250868292078,
+      "grad_norm": 0.0033898449037224054,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 7782
+    },
+    {
+      "epoch": 0.21475010088398513,
+      "grad_norm": 0.0038336054421961308,
+      "learning_rate": 0.001,
+      "loss": 0.4243,
+      "step": 7783
+    },
+    {
+      "epoch": 0.21477769308504951,
+      "grad_norm": 0.012188595719635487,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 7784
+    },
+    {
+      "epoch": 0.21480528528611387,
+      "grad_norm": 0.005000379402190447,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 7785
+    },
+    {
+      "epoch": 0.21483287748717825,
+      "grad_norm": 0.00532375555485487,
+      "learning_rate": 0.001,
+      "loss": 0.3766,
+      "step": 7786
+    },
+    {
+      "epoch": 0.21486046968824263,
+      "grad_norm": 0.004247152712196112,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 7787
+    },
+    {
+      "epoch": 0.21488806188930698,
+      "grad_norm": 0.0024253970477730036,
+      "learning_rate": 0.001,
+      "loss": 0.4233,
+      "step": 7788
+    },
+    {
+      "epoch": 0.21491565409037136,
+      "grad_norm": 0.0028955123852938414,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 7789
+    },
+    {
+      "epoch": 0.2149432462914357,
+      "grad_norm": 0.0019871145486831665,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 7790
+    },
+    {
+      "epoch": 0.2149708384925001,
+      "grad_norm": 0.0026270560920238495,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 7791
+    },
+    {
+      "epoch": 0.21499843069356447,
+      "grad_norm": 0.0022613939363509417,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 7792
+    },
+    {
+      "epoch": 0.21502602289462883,
+      "grad_norm": 0.0022467582020908594,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 7793
+    },
+    {
+      "epoch": 0.2150536150956932,
+      "grad_norm": 0.003144500544294715,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 7794
+    },
+    {
+      "epoch": 0.21508120729675756,
+      "grad_norm": 0.004727689083665609,
+      "learning_rate": 0.001,
+      "loss": 0.3643,
+      "step": 7795
+    },
+    {
+      "epoch": 0.21510879949782194,
+      "grad_norm": 0.0030138723086565733,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 7796
+    },
+    {
+      "epoch": 0.21513639169888632,
+      "grad_norm": 0.003997731953859329,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 7797
+    },
+    {
+      "epoch": 0.21516398389995067,
+      "grad_norm": 0.0034538882318884134,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 7798
+    },
+    {
+      "epoch": 0.21519157610101505,
+      "grad_norm": 0.0030911024659872055,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 7799
+    },
+    {
+      "epoch": 0.2152191683020794,
+      "grad_norm": 0.006776158697903156,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 7800
+    },
+    {
+      "epoch": 0.21524676050314379,
+      "grad_norm": 0.0032637538388371468,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 7801
+    },
+    {
+      "epoch": 0.21527435270420817,
+      "grad_norm": 0.0028168826829642057,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 7802
+    },
+    {
+      "epoch": 0.21530194490527252,
+      "grad_norm": 0.002768394071608782,
+      "learning_rate": 0.001,
+      "loss": 0.3795,
+      "step": 7803
+    },
+    {
+      "epoch": 0.2153295371063369,
+      "grad_norm": 0.004282908979803324,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 7804
+    },
+    {
+      "epoch": 0.21535712930740125,
+      "grad_norm": 0.002293472411110997,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 7805
+    },
+    {
+      "epoch": 0.21538472150846563,
+      "grad_norm": 0.003117901738733053,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 7806
+    },
+    {
+      "epoch": 0.21541231370953,
+      "grad_norm": 0.0030702080111950636,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 7807
+    },
+    {
+      "epoch": 0.21543990591059436,
+      "grad_norm": 0.0037037963047623634,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 7808
+    },
+    {
+      "epoch": 0.21546749811165875,
+      "grad_norm": 0.003569400403648615,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 7809
+    },
+    {
+      "epoch": 0.2154950903127231,
+      "grad_norm": 0.005185401998460293,
+      "learning_rate": 0.001,
+      "loss": 0.4272,
+      "step": 7810
+    },
+    {
+      "epoch": 0.21552268251378748,
+      "grad_norm": 0.0026427856646478176,
+      "learning_rate": 0.001,
+      "loss": 0.4301,
+      "step": 7811
+    },
+    {
+      "epoch": 0.21555027471485186,
+      "grad_norm": 0.0026996051892638206,
+      "learning_rate": 0.001,
+      "loss": 0.4272,
+      "step": 7812
+    },
+    {
+      "epoch": 0.2155778669159162,
+      "grad_norm": 0.0026872565504163504,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 7813
+    },
+    {
+      "epoch": 0.2156054591169806,
+      "grad_norm": 0.0033470892813056707,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 7814
+    },
+    {
+      "epoch": 0.21563305131804494,
+      "grad_norm": 0.002979893935844302,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 7815
+    },
+    {
+      "epoch": 0.21566064351910932,
+      "grad_norm": 0.002338412217795849,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 7816
+    },
+    {
+      "epoch": 0.2156882357201737,
+      "grad_norm": 0.0026039378717541695,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 7817
+    },
+    {
+      "epoch": 0.21571582792123806,
+      "grad_norm": 0.004928927402943373,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 7818
+    },
+    {
+      "epoch": 0.21574342012230244,
+      "grad_norm": 0.003969724290072918,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 7819
+    },
+    {
+      "epoch": 0.2157710123233668,
+      "grad_norm": 0.004132222384214401,
+      "learning_rate": 0.001,
+      "loss": 0.4446,
+      "step": 7820
+    },
+    {
+      "epoch": 0.21579860452443117,
+      "grad_norm": 0.003034410998225212,
+      "learning_rate": 0.001,
+      "loss": 0.4339,
+      "step": 7821
+    },
+    {
+      "epoch": 0.21582619672549555,
+      "grad_norm": 0.0034465952776372433,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 7822
+    },
+    {
+      "epoch": 0.2158537889265599,
+      "grad_norm": 0.0028153613675385714,
+      "learning_rate": 0.001,
+      "loss": 0.4302,
+      "step": 7823
+    },
+    {
+      "epoch": 0.21588138112762428,
+      "grad_norm": 0.007028549909591675,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 7824
+    },
+    {
+      "epoch": 0.21590897332868864,
+      "grad_norm": 0.0035245222970843315,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 7825
+    },
+    {
+      "epoch": 0.21593656552975302,
+      "grad_norm": 0.0027619630564004183,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 7826
+    },
+    {
+      "epoch": 0.2159641577308174,
+      "grad_norm": 0.0029921175446361303,
+      "learning_rate": 0.001,
+      "loss": 0.3609,
+      "step": 7827
+    },
+    {
+      "epoch": 0.21599174993188175,
+      "grad_norm": 0.004075322765856981,
+      "learning_rate": 0.001,
+      "loss": 0.3674,
+      "step": 7828
+    },
+    {
+      "epoch": 0.21601934213294613,
+      "grad_norm": 0.004721477162092924,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 7829
+    },
+    {
+      "epoch": 0.21604693433401048,
+      "grad_norm": 0.00825839675962925,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 7830
+    },
+    {
+      "epoch": 0.21607452653507486,
+      "grad_norm": 0.0029653101228177547,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 7831
+    },
+    {
+      "epoch": 0.21610211873613924,
+      "grad_norm": 0.004584647715091705,
+      "learning_rate": 0.001,
+      "loss": 0.3687,
+      "step": 7832
+    },
+    {
+      "epoch": 0.2161297109372036,
+      "grad_norm": 0.0019515285966917872,
+      "learning_rate": 0.001,
+      "loss": 0.4817,
+      "step": 7833
+    },
+    {
+      "epoch": 0.21615730313826798,
+      "grad_norm": 0.0029263091273605824,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 7834
+    },
+    {
+      "epoch": 0.21618489533933233,
+      "grad_norm": 0.002956991782411933,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 7835
+    },
+    {
+      "epoch": 0.2162124875403967,
+      "grad_norm": 0.0037960107438266277,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 7836
+    },
+    {
+      "epoch": 0.2162400797414611,
+      "grad_norm": 0.0029722515027970076,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 7837
+    },
+    {
+      "epoch": 0.21626767194252544,
+      "grad_norm": 0.0032150924671441317,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 7838
+    },
+    {
+      "epoch": 0.21629526414358982,
+      "grad_norm": 0.005019112955778837,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 7839
+    },
+    {
+      "epoch": 0.21632285634465417,
+      "grad_norm": 0.0028452936094254255,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 7840
+    },
+    {
+      "epoch": 0.21635044854571855,
+      "grad_norm": 0.0035461215302348137,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 7841
+    },
+    {
+      "epoch": 0.21637804074678293,
+      "grad_norm": 0.0051398370414972305,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 7842
+    },
+    {
+      "epoch": 0.2164056329478473,
+      "grad_norm": 0.0029704368207603693,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 7843
+    },
+    {
+      "epoch": 0.21643322514891167,
+      "grad_norm": 0.004192579071968794,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 7844
+    },
+    {
+      "epoch": 0.21646081734997602,
+      "grad_norm": 0.0036786114796996117,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 7845
+    },
+    {
+      "epoch": 0.2164884095510404,
+      "grad_norm": 0.0032894443720579147,
+      "learning_rate": 0.001,
+      "loss": 0.3671,
+      "step": 7846
+    },
+    {
+      "epoch": 0.21651600175210478,
+      "grad_norm": 0.00437202537432313,
+      "learning_rate": 0.001,
+      "loss": 0.4354,
+      "step": 7847
+    },
+    {
+      "epoch": 0.21654359395316913,
+      "grad_norm": 0.004263371229171753,
+      "learning_rate": 0.001,
+      "loss": 0.3765,
+      "step": 7848
+    },
+    {
+      "epoch": 0.2165711861542335,
+      "grad_norm": 0.0023139826953411102,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 7849
+    },
+    {
+      "epoch": 0.21659877835529787,
+      "grad_norm": 0.002766036195680499,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 7850
+    },
+    {
+      "epoch": 0.21662637055636225,
+      "grad_norm": 0.004952535964548588,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 7851
+    },
+    {
+      "epoch": 0.21665396275742663,
+      "grad_norm": 0.008287088945508003,
+      "learning_rate": 0.001,
+      "loss": 0.3569,
+      "step": 7852
+    },
+    {
+      "epoch": 0.21668155495849098,
+      "grad_norm": 0.019219983369112015,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 7853
+    },
+    {
+      "epoch": 0.21670914715955536,
+      "grad_norm": 0.0029021003283560276,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 7854
+    },
+    {
+      "epoch": 0.2167367393606197,
+      "grad_norm": 0.004268075339496136,
+      "learning_rate": 0.001,
+      "loss": 0.3729,
+      "step": 7855
+    },
+    {
+      "epoch": 0.2167643315616841,
+      "grad_norm": 0.0038819487672299147,
+      "learning_rate": 0.001,
+      "loss": 0.366,
+      "step": 7856
+    },
+    {
+      "epoch": 0.21679192376274847,
+      "grad_norm": 0.0033485370222479105,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 7857
+    },
+    {
+      "epoch": 0.21681951596381283,
+      "grad_norm": 0.004031899850815535,
+      "learning_rate": 0.001,
+      "loss": 0.3687,
+      "step": 7858
+    },
+    {
+      "epoch": 0.2168471081648772,
+      "grad_norm": 0.0028309531044214964,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 7859
+    },
+    {
+      "epoch": 0.21687470036594156,
+      "grad_norm": 0.005679224617779255,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 7860
+    },
+    {
+      "epoch": 0.21690229256700594,
+      "grad_norm": 0.0028747431933879852,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 7861
+    },
+    {
+      "epoch": 0.2169298847680703,
+      "grad_norm": 0.0022720531560480595,
+      "learning_rate": 0.001,
+      "loss": 0.4386,
+      "step": 7862
+    },
+    {
+      "epoch": 0.21695747696913467,
+      "grad_norm": 0.0037069146055728197,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 7863
+    },
+    {
+      "epoch": 0.21698506917019905,
+      "grad_norm": 0.004593148361891508,
+      "learning_rate": 0.001,
+      "loss": 0.3713,
+      "step": 7864
+    },
+    {
+      "epoch": 0.2170126613712634,
+      "grad_norm": 0.004522950388491154,
+      "learning_rate": 0.001,
+      "loss": 0.3679,
+      "step": 7865
+    },
+    {
+      "epoch": 0.21704025357232778,
+      "grad_norm": 0.0038752169348299503,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 7866
+    },
+    {
+      "epoch": 0.21706784577339214,
+      "grad_norm": 0.002294728998094797,
+      "learning_rate": 0.001,
+      "loss": 0.4457,
+      "step": 7867
+    },
+    {
+      "epoch": 0.21709543797445652,
+      "grad_norm": 0.002397543750703335,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 7868
+    },
+    {
+      "epoch": 0.2171230301755209,
+      "grad_norm": 0.009867599233984947,
+      "learning_rate": 0.001,
+      "loss": 0.3669,
+      "step": 7869
+    },
+    {
+      "epoch": 0.21715062237658525,
+      "grad_norm": 0.003990727476775646,
+      "learning_rate": 0.001,
+      "loss": 0.3495,
+      "step": 7870
+    },
+    {
+      "epoch": 0.21717821457764963,
+      "grad_norm": 0.003901657648384571,
+      "learning_rate": 0.001,
+      "loss": 0.3531,
+      "step": 7871
+    },
+    {
+      "epoch": 0.21720580677871398,
+      "grad_norm": 0.00325774191878736,
+      "learning_rate": 0.001,
+      "loss": 0.3673,
+      "step": 7872
+    },
+    {
+      "epoch": 0.21723339897977836,
+      "grad_norm": 0.003350186161696911,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 7873
+    },
+    {
+      "epoch": 0.21726099118084274,
+      "grad_norm": 0.002645864151418209,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 7874
+    },
+    {
+      "epoch": 0.2172885833819071,
+      "grad_norm": 0.006232825573533773,
+      "learning_rate": 0.001,
+      "loss": 0.3636,
+      "step": 7875
+    },
+    {
+      "epoch": 0.21731617558297148,
+      "grad_norm": 0.0021961445454508066,
+      "learning_rate": 0.001,
+      "loss": 0.3775,
+      "step": 7876
+    },
+    {
+      "epoch": 0.21734376778403583,
+      "grad_norm": 0.0030032650101929903,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 7877
+    },
+    {
+      "epoch": 0.2173713599851002,
+      "grad_norm": 0.006241418421268463,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 7878
+    },
+    {
+      "epoch": 0.2173989521861646,
+      "grad_norm": 0.003943136427551508,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 7879
+    },
+    {
+      "epoch": 0.21742654438722894,
+      "grad_norm": 0.0024433995131403208,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 7880
+    },
+    {
+      "epoch": 0.21745413658829332,
+      "grad_norm": 0.0043607852421700954,
+      "learning_rate": 0.001,
+      "loss": 0.4227,
+      "step": 7881
+    },
+    {
+      "epoch": 0.21748172878935768,
+      "grad_norm": 0.004530392121523619,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 7882
+    },
+    {
+      "epoch": 0.21750932099042206,
+      "grad_norm": 0.0024341184180229902,
+      "learning_rate": 0.001,
+      "loss": 0.4491,
+      "step": 7883
+    },
+    {
+      "epoch": 0.21753691319148644,
+      "grad_norm": 0.003600149182602763,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 7884
+    },
+    {
+      "epoch": 0.2175645053925508,
+      "grad_norm": 0.002762843621894717,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 7885
+    },
+    {
+      "epoch": 0.21759209759361517,
+      "grad_norm": 0.0029462978709489107,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 7886
+    },
+    {
+      "epoch": 0.21761968979467952,
+      "grad_norm": 0.0023480509407818317,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 7887
+    },
+    {
+      "epoch": 0.2176472819957439,
+      "grad_norm": 0.002522410824894905,
+      "learning_rate": 0.001,
+      "loss": 0.434,
+      "step": 7888
+    },
+    {
+      "epoch": 0.21767487419680828,
+      "grad_norm": 0.0022286747116595507,
+      "learning_rate": 0.001,
+      "loss": 0.4345,
+      "step": 7889
+    },
+    {
+      "epoch": 0.21770246639787263,
+      "grad_norm": 0.002925209002569318,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 7890
+    },
+    {
+      "epoch": 0.21773005859893702,
+      "grad_norm": 0.003066236851736903,
+      "learning_rate": 0.001,
+      "loss": 0.3512,
+      "step": 7891
+    },
+    {
+      "epoch": 0.21775765080000137,
+      "grad_norm": 0.003831464098766446,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 7892
+    },
+    {
+      "epoch": 0.21778524300106575,
+      "grad_norm": 0.0033246371895074844,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 7893
+    },
+    {
+      "epoch": 0.21781283520213013,
+      "grad_norm": 0.0038893541786819696,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 7894
+    },
+    {
+      "epoch": 0.21784042740319448,
+      "grad_norm": 0.003152028191834688,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 7895
+    },
+    {
+      "epoch": 0.21786801960425886,
+      "grad_norm": 0.004073096439242363,
+      "learning_rate": 0.001,
+      "loss": 0.4466,
+      "step": 7896
+    },
+    {
+      "epoch": 0.2178956118053232,
+      "grad_norm": 0.00838442798703909,
+      "learning_rate": 0.001,
+      "loss": 0.43,
+      "step": 7897
+    },
+    {
+      "epoch": 0.2179232040063876,
+      "grad_norm": 0.004715254995971918,
+      "learning_rate": 0.001,
+      "loss": 0.4276,
+      "step": 7898
+    },
+    {
+      "epoch": 0.21795079620745197,
+      "grad_norm": 0.005644774530082941,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 7899
+    },
+    {
+      "epoch": 0.21797838840851633,
+      "grad_norm": 0.0035621551796793938,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 7900
+    },
+    {
+      "epoch": 0.2180059806095807,
+      "grad_norm": 0.003715448547154665,
+      "learning_rate": 0.001,
+      "loss": 0.4418,
+      "step": 7901
+    },
+    {
+      "epoch": 0.21803357281064506,
+      "grad_norm": 0.0025230993051081896,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 7902
+    },
+    {
+      "epoch": 0.21806116501170944,
+      "grad_norm": 0.0039021014235913754,
+      "learning_rate": 0.001,
+      "loss": 0.3779,
+      "step": 7903
+    },
+    {
+      "epoch": 0.21808875721277382,
+      "grad_norm": 0.004864577203989029,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 7904
+    },
+    {
+      "epoch": 0.21811634941383817,
+      "grad_norm": 0.007110804785043001,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 7905
+    },
+    {
+      "epoch": 0.21814394161490255,
+      "grad_norm": 0.004145526327192783,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 7906
+    },
+    {
+      "epoch": 0.2181715338159669,
+      "grad_norm": 0.003214552765712142,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 7907
+    },
+    {
+      "epoch": 0.2181991260170313,
+      "grad_norm": 0.0025902027264237404,
+      "learning_rate": 0.001,
+      "loss": 0.4325,
+      "step": 7908
+    },
+    {
+      "epoch": 0.21822671821809567,
+      "grad_norm": 0.005056621506810188,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 7909
+    },
+    {
+      "epoch": 0.21825431041916002,
+      "grad_norm": 0.0027781168464571238,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 7910
+    },
+    {
+      "epoch": 0.2182819026202244,
+      "grad_norm": 0.002909269416704774,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 7911
+    },
+    {
+      "epoch": 0.21830949482128875,
+      "grad_norm": 0.003969018347561359,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 7912
+    },
+    {
+      "epoch": 0.21833708702235313,
+      "grad_norm": 0.002928847912698984,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 7913
+    },
+    {
+      "epoch": 0.2183646792234175,
+      "grad_norm": 0.003366072429344058,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 7914
+    },
+    {
+      "epoch": 0.21839227142448187,
+      "grad_norm": 0.003718828782439232,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 7915
+    },
+    {
+      "epoch": 0.21841986362554625,
+      "grad_norm": 0.0029193959198892117,
+      "learning_rate": 0.001,
+      "loss": 0.3749,
+      "step": 7916
+    },
+    {
+      "epoch": 0.2184474558266106,
+      "grad_norm": 0.002285629976540804,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 7917
+    },
+    {
+      "epoch": 0.21847504802767498,
+      "grad_norm": 0.002450938569381833,
+      "learning_rate": 0.001,
+      "loss": 0.3821,
+      "step": 7918
+    },
+    {
+      "epoch": 0.21850264022873936,
+      "grad_norm": 0.002341889077797532,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 7919
+    },
+    {
+      "epoch": 0.2185302324298037,
+      "grad_norm": 0.003041157266125083,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 7920
+    },
+    {
+      "epoch": 0.2185578246308681,
+      "grad_norm": 0.004077001940459013,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 7921
+    },
+    {
+      "epoch": 0.21858541683193244,
+      "grad_norm": 0.002720333868637681,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 7922
+    },
+    {
+      "epoch": 0.21861300903299682,
+      "grad_norm": 0.005403697956353426,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 7923
+    },
+    {
+      "epoch": 0.2186406012340612,
+      "grad_norm": 0.004800851922482252,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 7924
+    },
+    {
+      "epoch": 0.21866819343512556,
+      "grad_norm": 0.003321266733109951,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 7925
+    },
+    {
+      "epoch": 0.21869578563618994,
+      "grad_norm": 0.003606748068705201,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 7926
+    },
+    {
+      "epoch": 0.2187233778372543,
+      "grad_norm": 0.002816277788951993,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 7927
+    },
+    {
+      "epoch": 0.21875097003831867,
+      "grad_norm": 0.0028528219554573298,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 7928
+    },
+    {
+      "epoch": 0.21877856223938305,
+      "grad_norm": 0.00370188825763762,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 7929
+    },
+    {
+      "epoch": 0.2188061544404474,
+      "grad_norm": 0.00391001021489501,
+      "learning_rate": 0.001,
+      "loss": 0.357,
+      "step": 7930
+    },
+    {
+      "epoch": 0.21883374664151178,
+      "grad_norm": 0.002469925908371806,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 7931
+    },
+    {
+      "epoch": 0.21886133884257614,
+      "grad_norm": 0.004554017446935177,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 7932
+    },
+    {
+      "epoch": 0.21888893104364052,
+      "grad_norm": 0.005216664168983698,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 7933
+    },
+    {
+      "epoch": 0.2189165232447049,
+      "grad_norm": 0.002773087937384844,
+      "learning_rate": 0.001,
+      "loss": 0.3708,
+      "step": 7934
+    },
+    {
+      "epoch": 0.21894411544576925,
+      "grad_norm": 0.002156183123588562,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 7935
+    },
+    {
+      "epoch": 0.21897170764683363,
+      "grad_norm": 0.0025752519723027945,
+      "learning_rate": 0.001,
+      "loss": 0.4342,
+      "step": 7936
+    },
+    {
+      "epoch": 0.21899929984789798,
+      "grad_norm": 0.002592964330688119,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 7937
+    },
+    {
+      "epoch": 0.21902689204896236,
+      "grad_norm": 0.0023423507809638977,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 7938
+    },
+    {
+      "epoch": 0.21905448425002674,
+      "grad_norm": 0.0025791767984628677,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 7939
+    },
+    {
+      "epoch": 0.2190820764510911,
+      "grad_norm": 0.0030768734868615866,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 7940
+    },
+    {
+      "epoch": 0.21910966865215548,
+      "grad_norm": 0.0035807385575026274,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 7941
+    },
+    {
+      "epoch": 0.21913726085321983,
+      "grad_norm": 0.0031572608277201653,
+      "learning_rate": 0.001,
+      "loss": 0.3611,
+      "step": 7942
+    },
+    {
+      "epoch": 0.2191648530542842,
+      "grad_norm": 0.0027867392636835575,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 7943
+    },
+    {
+      "epoch": 0.2191924452553486,
+      "grad_norm": 0.0025052884593605995,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 7944
+    },
+    {
+      "epoch": 0.21922003745641294,
+      "grad_norm": 0.0027730814181268215,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 7945
+    },
+    {
+      "epoch": 0.21924762965747732,
+      "grad_norm": 0.0026065954007208347,
+      "learning_rate": 0.001,
+      "loss": 0.4361,
+      "step": 7946
+    },
+    {
+      "epoch": 0.21927522185854167,
+      "grad_norm": 0.00736627820879221,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 7947
+    },
+    {
+      "epoch": 0.21930281405960605,
+      "grad_norm": 0.003984878305345774,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 7948
+    },
+    {
+      "epoch": 0.21933040626067044,
+      "grad_norm": 0.005414186976850033,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 7949
+    },
+    {
+      "epoch": 0.2193579984617348,
+      "grad_norm": 0.0026020752266049385,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 7950
+    },
+    {
+      "epoch": 0.21938559066279917,
+      "grad_norm": 0.0026629299391061068,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 7951
+    },
+    {
+      "epoch": 0.21941318286386352,
+      "grad_norm": 0.0022089278791099787,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 7952
+    },
+    {
+      "epoch": 0.2194407750649279,
+      "grad_norm": 0.0028548568952828646,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 7953
+    },
+    {
+      "epoch": 0.21946836726599225,
+      "grad_norm": 0.0029203901067376137,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 7954
+    },
+    {
+      "epoch": 0.21949595946705663,
+      "grad_norm": 0.0032568967435508966,
+      "learning_rate": 0.001,
+      "loss": 0.4321,
+      "step": 7955
+    },
+    {
+      "epoch": 0.21952355166812101,
+      "grad_norm": 0.005700434558093548,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 7956
+    },
+    {
+      "epoch": 0.21955114386918537,
+      "grad_norm": 0.0026754315476864576,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 7957
+    },
+    {
+      "epoch": 0.21957873607024975,
+      "grad_norm": 0.004874562378972769,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 7958
+    },
+    {
+      "epoch": 0.2196063282713141,
+      "grad_norm": 0.0051938071846961975,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 7959
+    },
+    {
+      "epoch": 0.21963392047237848,
+      "grad_norm": 0.005463124252855778,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 7960
+    },
+    {
+      "epoch": 0.21966151267344286,
+      "grad_norm": 0.0026376366149634123,
+      "learning_rate": 0.001,
+      "loss": 0.3492,
+      "step": 7961
+    },
+    {
+      "epoch": 0.2196891048745072,
+      "grad_norm": 0.0029911526944488287,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 7962
+    },
+    {
+      "epoch": 0.2197166970755716,
+      "grad_norm": 0.002725818660110235,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 7963
+    },
+    {
+      "epoch": 0.21974428927663595,
+      "grad_norm": 0.0030495068058371544,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 7964
+    },
+    {
+      "epoch": 0.21977188147770033,
+      "grad_norm": 0.0032879766076803207,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 7965
+    },
+    {
+      "epoch": 0.2197994736787647,
+      "grad_norm": 0.005279941018670797,
+      "learning_rate": 0.001,
+      "loss": 0.3665,
+      "step": 7966
+    },
+    {
+      "epoch": 0.21982706587982906,
+      "grad_norm": 0.004807317163795233,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 7967
+    },
+    {
+      "epoch": 0.21985465808089344,
+      "grad_norm": 0.004063909407705069,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 7968
+    },
+    {
+      "epoch": 0.2198822502819578,
+      "grad_norm": 0.002503841184079647,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 7969
+    },
+    {
+      "epoch": 0.21990984248302217,
+      "grad_norm": 0.002414180664345622,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 7970
+    },
+    {
+      "epoch": 0.21993743468408655,
+      "grad_norm": 0.004260845948010683,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 7971
+    },
+    {
+      "epoch": 0.2199650268851509,
+      "grad_norm": 0.003588800085708499,
+      "learning_rate": 0.001,
+      "loss": 0.3539,
+      "step": 7972
+    },
+    {
+      "epoch": 0.21999261908621529,
+      "grad_norm": 0.01770518720149994,
+      "learning_rate": 0.001,
+      "loss": 0.3468,
+      "step": 7973
+    },
+    {
+      "epoch": 0.22002021128727964,
+      "grad_norm": 0.004121637437492609,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 7974
+    },
+    {
+      "epoch": 0.22004780348834402,
+      "grad_norm": 0.0025279296096414328,
+      "learning_rate": 0.001,
+      "loss": 0.4385,
+      "step": 7975
+    },
+    {
+      "epoch": 0.2200753956894084,
+      "grad_norm": 0.0031378697603940964,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 7976
+    },
+    {
+      "epoch": 0.22010298789047275,
+      "grad_norm": 0.006243572104722261,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 7977
+    },
+    {
+      "epoch": 0.22013058009153713,
+      "grad_norm": 0.005349922459572554,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 7978
+    },
+    {
+      "epoch": 0.22015817229260148,
+      "grad_norm": 0.004400680772960186,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 7979
+    },
+    {
+      "epoch": 0.22018576449366586,
+      "grad_norm": 0.005931714083999395,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 7980
+    },
+    {
+      "epoch": 0.22021335669473024,
+      "grad_norm": 0.006180520635098219,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 7981
+    },
+    {
+      "epoch": 0.2202409488957946,
+      "grad_norm": 0.002432264154776931,
+      "learning_rate": 0.001,
+      "loss": 0.3742,
+      "step": 7982
+    },
+    {
+      "epoch": 0.22026854109685898,
+      "grad_norm": 0.002539379522204399,
+      "learning_rate": 0.001,
+      "loss": 0.4761,
+      "step": 7983
+    },
+    {
+      "epoch": 0.22029613329792333,
+      "grad_norm": 0.0032303177285939455,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 7984
+    },
+    {
+      "epoch": 0.2203237254989877,
+      "grad_norm": 0.004244436044245958,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 7985
+    },
+    {
+      "epoch": 0.2203513177000521,
+      "grad_norm": 0.003201343584805727,
+      "learning_rate": 0.001,
+      "loss": 0.4399,
+      "step": 7986
+    },
+    {
+      "epoch": 0.22037890990111644,
+      "grad_norm": 0.004487950354814529,
+      "learning_rate": 0.001,
+      "loss": 0.4543,
+      "step": 7987
+    },
+    {
+      "epoch": 0.22040650210218082,
+      "grad_norm": 0.002907637506723404,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 7988
+    },
+    {
+      "epoch": 0.22043409430324518,
+      "grad_norm": 0.002862214343622327,
+      "learning_rate": 0.001,
+      "loss": 0.4378,
+      "step": 7989
+    },
+    {
+      "epoch": 0.22046168650430956,
+      "grad_norm": 0.011252621188759804,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 7990
+    },
+    {
+      "epoch": 0.22048927870537394,
+      "grad_norm": 0.002715484006330371,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 7991
+    },
+    {
+      "epoch": 0.2205168709064383,
+      "grad_norm": 0.0029305212665349245,
+      "learning_rate": 0.001,
+      "loss": 0.4262,
+      "step": 7992
+    },
+    {
+      "epoch": 0.22054446310750267,
+      "grad_norm": 0.0029706000350415707,
+      "learning_rate": 0.001,
+      "loss": 0.348,
+      "step": 7993
+    },
+    {
+      "epoch": 0.22057205530856702,
+      "grad_norm": 0.00246282946318388,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 7994
+    },
+    {
+      "epoch": 0.2205996475096314,
+      "grad_norm": 0.0036219728644937277,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 7995
+    },
+    {
+      "epoch": 0.22062723971069578,
+      "grad_norm": 0.0028112917207181454,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 7996
+    },
+    {
+      "epoch": 0.22065483191176014,
+      "grad_norm": 0.005280990153551102,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 7997
+    },
+    {
+      "epoch": 0.22068242411282452,
+      "grad_norm": 0.004479492083191872,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 7998
+    },
+    {
+      "epoch": 0.22071001631388887,
+      "grad_norm": 0.0030434499494731426,
+      "learning_rate": 0.001,
+      "loss": 0.3753,
+      "step": 7999
+    },
+    {
+      "epoch": 0.22073760851495325,
+      "grad_norm": 0.016081584617495537,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 8000
+    },
+    {
+      "epoch": 0.22073760851495325,
+      "eval_runtime": 24.424,
+      "eval_samples_per_second": 1.31,
+      "eval_steps_per_second": 0.164,
+      "step": 8000
+    },
+    {
+      "epoch": 0.22076520071601763,
+      "grad_norm": 0.004563974682241678,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 8001
+    },
+    {
+      "epoch": 0.22079279291708198,
+      "grad_norm": 0.002752038650214672,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 8002
+    },
+    {
+      "epoch": 0.22082038511814636,
+      "grad_norm": 0.0030077442061156034,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 8003
+    },
+    {
+      "epoch": 0.22084797731921071,
+      "grad_norm": 0.0024848515167832375,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 8004
+    },
+    {
+      "epoch": 0.2208755695202751,
+      "grad_norm": 0.0023180947173386812,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 8005
+    },
+    {
+      "epoch": 0.22090316172133947,
+      "grad_norm": 0.0037670242600142956,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 8006
+    },
+    {
+      "epoch": 0.22093075392240383,
+      "grad_norm": 0.003507862566038966,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 8007
+    },
+    {
+      "epoch": 0.2209583461234682,
+      "grad_norm": 0.004687591455876827,
+      "learning_rate": 0.001,
+      "loss": 0.3816,
+      "step": 8008
+    },
+    {
+      "epoch": 0.22098593832453256,
+      "grad_norm": 0.002483742544427514,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 8009
+    },
+    {
+      "epoch": 0.22101353052559694,
+      "grad_norm": 0.004836369771510363,
+      "learning_rate": 0.001,
+      "loss": 0.4485,
+      "step": 8010
+    },
+    {
+      "epoch": 0.22104112272666132,
+      "grad_norm": 0.003374388674274087,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 8011
+    },
+    {
+      "epoch": 0.22106871492772567,
+      "grad_norm": 0.004700181540101767,
+      "learning_rate": 0.001,
+      "loss": 0.3449,
+      "step": 8012
+    },
+    {
+      "epoch": 0.22109630712879005,
+      "grad_norm": 0.011050062254071236,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 8013
+    },
+    {
+      "epoch": 0.2211238993298544,
+      "grad_norm": 0.0021428498439490795,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 8014
+    },
+    {
+      "epoch": 0.2211514915309188,
+      "grad_norm": 0.00233366503380239,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 8015
+    },
+    {
+      "epoch": 0.22117908373198317,
+      "grad_norm": 0.0025947142858058214,
+      "learning_rate": 0.001,
+      "loss": 0.4434,
+      "step": 8016
+    },
+    {
+      "epoch": 0.22120667593304752,
+      "grad_norm": 0.00214517954736948,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 8017
+    },
+    {
+      "epoch": 0.2212342681341119,
+      "grad_norm": 0.003991144709289074,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 8018
+    },
+    {
+      "epoch": 0.22126186033517625,
+      "grad_norm": 0.003229195484891534,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 8019
+    },
+    {
+      "epoch": 0.22128945253624063,
+      "grad_norm": 0.002979196375235915,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 8020
+    },
+    {
+      "epoch": 0.221317044737305,
+      "grad_norm": 0.004278901033103466,
+      "learning_rate": 0.001,
+      "loss": 0.4478,
+      "step": 8021
+    },
+    {
+      "epoch": 0.22134463693836937,
+      "grad_norm": 0.007357322610914707,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 8022
+    },
+    {
+      "epoch": 0.22137222913943375,
+      "grad_norm": 0.0035520815290510654,
+      "learning_rate": 0.001,
+      "loss": 0.368,
+      "step": 8023
+    },
+    {
+      "epoch": 0.2213998213404981,
+      "grad_norm": 0.004246349446475506,
+      "learning_rate": 0.001,
+      "loss": 0.3497,
+      "step": 8024
+    },
+    {
+      "epoch": 0.22142741354156248,
+      "grad_norm": 0.0035226894542574883,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 8025
+    },
+    {
+      "epoch": 0.22145500574262686,
+      "grad_norm": 0.0033106855116784573,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 8026
+    },
+    {
+      "epoch": 0.2214825979436912,
+      "grad_norm": 0.004675147123634815,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 8027
+    },
+    {
+      "epoch": 0.2215101901447556,
+      "grad_norm": 0.004575135186314583,
+      "learning_rate": 0.001,
+      "loss": 0.3664,
+      "step": 8028
+    },
+    {
+      "epoch": 0.22153778234581994,
+      "grad_norm": 0.002557947300374508,
+      "learning_rate": 0.001,
+      "loss": 0.4291,
+      "step": 8029
+    },
+    {
+      "epoch": 0.22156537454688432,
+      "grad_norm": 0.0031390858348459005,
+      "learning_rate": 0.001,
+      "loss": 0.454,
+      "step": 8030
+    },
+    {
+      "epoch": 0.2215929667479487,
+      "grad_norm": 0.004276024177670479,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 8031
+    },
+    {
+      "epoch": 0.22162055894901306,
+      "grad_norm": 0.00613689050078392,
+      "learning_rate": 0.001,
+      "loss": 0.3498,
+      "step": 8032
+    },
+    {
+      "epoch": 0.22164815115007744,
+      "grad_norm": 0.0052116867154836655,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 8033
+    },
+    {
+      "epoch": 0.2216757433511418,
+      "grad_norm": 0.0032780475448817015,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 8034
+    },
+    {
+      "epoch": 0.22170333555220617,
+      "grad_norm": 0.003026155522093177,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 8035
+    },
+    {
+      "epoch": 0.22173092775327055,
+      "grad_norm": 0.0075735533609986305,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 8036
+    },
+    {
+      "epoch": 0.2217585199543349,
+      "grad_norm": 0.010422303341329098,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 8037
+    },
+    {
+      "epoch": 0.22178611215539928,
+      "grad_norm": 0.009534427896142006,
+      "learning_rate": 0.001,
+      "loss": 0.3807,
+      "step": 8038
+    },
+    {
+      "epoch": 0.22181370435646364,
+      "grad_norm": 0.004428492859005928,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 8039
+    },
+    {
+      "epoch": 0.22184129655752802,
+      "grad_norm": 0.006164777558296919,
+      "learning_rate": 0.001,
+      "loss": 0.3625,
+      "step": 8040
+    },
+    {
+      "epoch": 0.2218688887585924,
+      "grad_norm": 0.003359799971804023,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 8041
+    },
+    {
+      "epoch": 0.22189648095965675,
+      "grad_norm": 0.006137318443506956,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 8042
+    },
+    {
+      "epoch": 0.22192407316072113,
+      "grad_norm": 0.002756676636636257,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 8043
+    },
+    {
+      "epoch": 0.22195166536178548,
+      "grad_norm": 0.004392385482788086,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 8044
+    },
+    {
+      "epoch": 0.22197925756284986,
+      "grad_norm": 0.002814218634739518,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 8045
+    },
+    {
+      "epoch": 0.22200684976391422,
+      "grad_norm": 0.002874415833503008,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 8046
+    },
+    {
+      "epoch": 0.2220344419649786,
+      "grad_norm": 0.005475836340337992,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 8047
+    },
+    {
+      "epoch": 0.22206203416604298,
+      "grad_norm": 0.002302660373970866,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 8048
+    },
+    {
+      "epoch": 0.22208962636710733,
+      "grad_norm": 0.003013992914929986,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 8049
+    },
+    {
+      "epoch": 0.2221172185681717,
+      "grad_norm": 0.0038887159898877144,
+      "learning_rate": 0.001,
+      "loss": 0.3634,
+      "step": 8050
+    },
+    {
+      "epoch": 0.22214481076923606,
+      "grad_norm": 0.004871399141848087,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 8051
+    },
+    {
+      "epoch": 0.22217240297030044,
+      "grad_norm": 0.002852171426638961,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 8052
+    },
+    {
+      "epoch": 0.22219999517136482,
+      "grad_norm": 0.0025132293812930584,
+      "learning_rate": 0.001,
+      "loss": 0.4383,
+      "step": 8053
+    },
+    {
+      "epoch": 0.22222758737242918,
+      "grad_norm": 0.002570350421592593,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 8054
+    },
+    {
+      "epoch": 0.22225517957349356,
+      "grad_norm": 0.005364352371543646,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 8055
+    },
+    {
+      "epoch": 0.2222827717745579,
+      "grad_norm": 0.0030977013520896435,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 8056
+    },
+    {
+      "epoch": 0.2223103639756223,
+      "grad_norm": 0.0027198875322937965,
+      "learning_rate": 0.001,
+      "loss": 0.4575,
+      "step": 8057
+    },
+    {
+      "epoch": 0.22233795617668667,
+      "grad_norm": 0.011028733104467392,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 8058
+    },
+    {
+      "epoch": 0.22236554837775102,
+      "grad_norm": 0.0037885240744799376,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 8059
+    },
+    {
+      "epoch": 0.2223931405788154,
+      "grad_norm": 0.005188632290810347,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 8060
+    },
+    {
+      "epoch": 0.22242073277987975,
+      "grad_norm": 0.003573206951841712,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 8061
+    },
+    {
+      "epoch": 0.22244832498094413,
+      "grad_norm": 0.0036306334659457207,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 8062
+    },
+    {
+      "epoch": 0.22247591718200851,
+      "grad_norm": 0.0033542602322995663,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 8063
+    },
+    {
+      "epoch": 0.22250350938307287,
+      "grad_norm": 0.004111922346055508,
+      "learning_rate": 0.001,
+      "loss": 0.424,
+      "step": 8064
+    },
+    {
+      "epoch": 0.22253110158413725,
+      "grad_norm": 0.004905781242996454,
+      "learning_rate": 0.001,
+      "loss": 0.4319,
+      "step": 8065
+    },
+    {
+      "epoch": 0.2225586937852016,
+      "grad_norm": 0.009073971770703793,
+      "learning_rate": 0.001,
+      "loss": 0.3574,
+      "step": 8066
+    },
+    {
+      "epoch": 0.22258628598626598,
+      "grad_norm": 0.0031212870962917805,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 8067
+    },
+    {
+      "epoch": 0.22261387818733036,
+      "grad_norm": 0.0061768037267029285,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 8068
+    },
+    {
+      "epoch": 0.2226414703883947,
+      "grad_norm": 0.004057316109538078,
+      "learning_rate": 0.001,
+      "loss": 0.4284,
+      "step": 8069
+    },
+    {
+      "epoch": 0.2226690625894591,
+      "grad_norm": 0.002062835730612278,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 8070
+    },
+    {
+      "epoch": 0.22269665479052345,
+      "grad_norm": 0.010553326457738876,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 8071
+    },
+    {
+      "epoch": 0.22272424699158783,
+      "grad_norm": 0.004648920148611069,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 8072
+    },
+    {
+      "epoch": 0.2227518391926522,
+      "grad_norm": 0.002884708344936371,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 8073
+    },
+    {
+      "epoch": 0.22277943139371656,
+      "grad_norm": 0.002019941108301282,
+      "learning_rate": 0.001,
+      "loss": 0.4259,
+      "step": 8074
+    },
+    {
+      "epoch": 0.22280702359478094,
+      "grad_norm": 0.004767908249050379,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 8075
+    },
+    {
+      "epoch": 0.2228346157958453,
+      "grad_norm": 0.003971953876316547,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 8076
+    },
+    {
+      "epoch": 0.22286220799690967,
+      "grad_norm": 0.004503239411860704,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 8077
+    },
+    {
+      "epoch": 0.22288980019797405,
+      "grad_norm": 0.002689438173547387,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 8078
+    },
+    {
+      "epoch": 0.2229173923990384,
+      "grad_norm": 0.003111765021458268,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 8079
+    },
+    {
+      "epoch": 0.22294498460010279,
+      "grad_norm": 0.003488035872578621,
+      "learning_rate": 0.001,
+      "loss": 0.3488,
+      "step": 8080
+    },
+    {
+      "epoch": 0.22297257680116714,
+      "grad_norm": 0.004513781983405352,
+      "learning_rate": 0.001,
+      "loss": 0.3627,
+      "step": 8081
+    },
+    {
+      "epoch": 0.22300016900223152,
+      "grad_norm": 0.0029607918113470078,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 8082
+    },
+    {
+      "epoch": 0.2230277612032959,
+      "grad_norm": 0.0026686247438192368,
+      "learning_rate": 0.001,
+      "loss": 0.4311,
+      "step": 8083
+    },
+    {
+      "epoch": 0.22305535340436025,
+      "grad_norm": 0.003172766650095582,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 8084
+    },
+    {
+      "epoch": 0.22308294560542463,
+      "grad_norm": 0.002642567502334714,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 8085
+    },
+    {
+      "epoch": 0.22311053780648898,
+      "grad_norm": 0.002758550923317671,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 8086
+    },
+    {
+      "epoch": 0.22313813000755336,
+      "grad_norm": 0.004465687554329634,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 8087
+    },
+    {
+      "epoch": 0.22316572220861775,
+      "grad_norm": 0.003773334203287959,
+      "learning_rate": 0.001,
+      "loss": 0.3736,
+      "step": 8088
+    },
+    {
+      "epoch": 0.2231933144096821,
+      "grad_norm": 0.0027832011692225933,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 8089
+    },
+    {
+      "epoch": 0.22322090661074648,
+      "grad_norm": 0.00280310888774693,
+      "learning_rate": 0.001,
+      "loss": 0.3727,
+      "step": 8090
+    },
+    {
+      "epoch": 0.22324849881181083,
+      "grad_norm": 0.003244071500375867,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 8091
+    },
+    {
+      "epoch": 0.2232760910128752,
+      "grad_norm": 0.0024930760264396667,
+      "learning_rate": 0.001,
+      "loss": 0.4322,
+      "step": 8092
+    },
+    {
+      "epoch": 0.2233036832139396,
+      "grad_norm": 0.002686891006305814,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 8093
+    },
+    {
+      "epoch": 0.22333127541500394,
+      "grad_norm": 0.00237724045291543,
+      "learning_rate": 0.001,
+      "loss": 0.437,
+      "step": 8094
+    },
+    {
+      "epoch": 0.22335886761606832,
+      "grad_norm": 0.004243023227900267,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 8095
+    },
+    {
+      "epoch": 0.22338645981713268,
+      "grad_norm": 0.004333519376814365,
+      "learning_rate": 0.001,
+      "loss": 0.4245,
+      "step": 8096
+    },
+    {
+      "epoch": 0.22341405201819706,
+      "grad_norm": 0.0025725301820784807,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 8097
+    },
+    {
+      "epoch": 0.22344164421926144,
+      "grad_norm": 0.004863837733864784,
+      "learning_rate": 0.001,
+      "loss": 0.4261,
+      "step": 8098
+    },
+    {
+      "epoch": 0.2234692364203258,
+      "grad_norm": 0.0025464182253926992,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 8099
+    },
+    {
+      "epoch": 0.22349682862139017,
+      "grad_norm": 0.003454705933108926,
+      "learning_rate": 0.001,
+      "loss": 0.3631,
+      "step": 8100
+    },
+    {
+      "epoch": 0.22352442082245452,
+      "grad_norm": 0.004076403100043535,
+      "learning_rate": 0.001,
+      "loss": 0.4161,
+      "step": 8101
+    },
+    {
+      "epoch": 0.2235520130235189,
+      "grad_norm": 0.002538996748626232,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 8102
+    },
+    {
+      "epoch": 0.22357960522458328,
+      "grad_norm": 0.002853696933016181,
+      "learning_rate": 0.001,
+      "loss": 0.3856,
+      "step": 8103
+    },
+    {
+      "epoch": 0.22360719742564764,
+      "grad_norm": 0.005618890281766653,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 8104
+    },
+    {
+      "epoch": 0.22363478962671202,
+      "grad_norm": 0.0023889478761702776,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 8105
+    },
+    {
+      "epoch": 0.22366238182777637,
+      "grad_norm": 0.0049384357407689095,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 8106
+    },
+    {
+      "epoch": 0.22368997402884075,
+      "grad_norm": 0.0029198869597166777,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 8107
+    },
+    {
+      "epoch": 0.22371756622990513,
+      "grad_norm": 0.00721718929708004,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 8108
+    },
+    {
+      "epoch": 0.22374515843096948,
+      "grad_norm": 0.0038066962733864784,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 8109
+    },
+    {
+      "epoch": 0.22377275063203386,
+      "grad_norm": 0.0029343971982598305,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 8110
+    },
+    {
+      "epoch": 0.22380034283309821,
+      "grad_norm": 0.004783669952303171,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 8111
+    },
+    {
+      "epoch": 0.2238279350341626,
+      "grad_norm": 0.005818706471472979,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 8112
+    },
+    {
+      "epoch": 0.22385552723522698,
+      "grad_norm": 0.0027874866500496864,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 8113
+    },
+    {
+      "epoch": 0.22388311943629133,
+      "grad_norm": 0.006304432172328234,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 8114
+    },
+    {
+      "epoch": 0.2239107116373557,
+      "grad_norm": 0.006212018895894289,
+      "learning_rate": 0.001,
+      "loss": 0.3632,
+      "step": 8115
+    },
+    {
+      "epoch": 0.22393830383842006,
+      "grad_norm": 0.0021557544823735952,
+      "learning_rate": 0.001,
+      "loss": 0.4262,
+      "step": 8116
+    },
+    {
+      "epoch": 0.22396589603948444,
+      "grad_norm": 0.0035563178826123476,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 8117
+    },
+    {
+      "epoch": 0.22399348824054882,
+      "grad_norm": 0.002528556389734149,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 8118
+    },
+    {
+      "epoch": 0.22402108044161317,
+      "grad_norm": 0.0022855051793158054,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 8119
+    },
+    {
+      "epoch": 0.22404867264267755,
+      "grad_norm": 0.0027871252968907356,
+      "learning_rate": 0.001,
+      "loss": 0.3744,
+      "step": 8120
+    },
+    {
+      "epoch": 0.2240762648437419,
+      "grad_norm": 0.003169616684317589,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 8121
+    },
+    {
+      "epoch": 0.2241038570448063,
+      "grad_norm": 0.003899726551026106,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 8122
+    },
+    {
+      "epoch": 0.22413144924587067,
+      "grad_norm": 0.0040968372486531734,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 8123
+    },
+    {
+      "epoch": 0.22415904144693502,
+      "grad_norm": 0.002130861859768629,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 8124
+    },
+    {
+      "epoch": 0.2241866336479994,
+      "grad_norm": 0.0030744632240384817,
+      "learning_rate": 0.001,
+      "loss": 0.3703,
+      "step": 8125
+    },
+    {
+      "epoch": 0.22421422584906375,
+      "grad_norm": 0.0023388941772282124,
+      "learning_rate": 0.001,
+      "loss": 0.428,
+      "step": 8126
+    },
+    {
+      "epoch": 0.22424181805012813,
+      "grad_norm": 0.005775371100753546,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 8127
+    },
+    {
+      "epoch": 0.2242694102511925,
+      "grad_norm": 0.002374644624069333,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 8128
+    },
+    {
+      "epoch": 0.22429700245225687,
+      "grad_norm": 0.0058793784119188786,
+      "learning_rate": 0.001,
+      "loss": 0.3693,
+      "step": 8129
+    },
+    {
+      "epoch": 0.22432459465332125,
+      "grad_norm": 0.0027781794779002666,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 8130
+    },
+    {
+      "epoch": 0.2243521868543856,
+      "grad_norm": 0.004530200269073248,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 8131
+    },
+    {
+      "epoch": 0.22437977905544998,
+      "grad_norm": 0.002897880505770445,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 8132
+    },
+    {
+      "epoch": 0.22440737125651436,
+      "grad_norm": 0.0025317175313830376,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 8133
+    },
+    {
+      "epoch": 0.2244349634575787,
+      "grad_norm": 0.004969809204339981,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 8134
+    },
+    {
+      "epoch": 0.2244625556586431,
+      "grad_norm": 0.019586963579058647,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 8135
+    },
+    {
+      "epoch": 0.22449014785970745,
+      "grad_norm": 0.003650497179478407,
+      "learning_rate": 0.001,
+      "loss": 0.4429,
+      "step": 8136
+    },
+    {
+      "epoch": 0.22451774006077183,
+      "grad_norm": 0.004681742750108242,
+      "learning_rate": 0.001,
+      "loss": 0.3727,
+      "step": 8137
+    },
+    {
+      "epoch": 0.2245453322618362,
+      "grad_norm": 0.0019167111022397876,
+      "learning_rate": 0.001,
+      "loss": 0.4284,
+      "step": 8138
+    },
+    {
+      "epoch": 0.22457292446290056,
+      "grad_norm": 0.002770308405160904,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 8139
+    },
+    {
+      "epoch": 0.22460051666396494,
+      "grad_norm": 0.002731415443122387,
+      "learning_rate": 0.001,
+      "loss": 0.373,
+      "step": 8140
+    },
+    {
+      "epoch": 0.2246281088650293,
+      "grad_norm": 0.0024662816431373358,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 8141
+    },
+    {
+      "epoch": 0.22465570106609367,
+      "grad_norm": 0.00265320367179811,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 8142
+    },
+    {
+      "epoch": 0.22468329326715802,
+      "grad_norm": 0.0031822596210986376,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 8143
+    },
+    {
+      "epoch": 0.2247108854682224,
+      "grad_norm": 0.005078164394944906,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 8144
+    },
+    {
+      "epoch": 0.22473847766928678,
+      "grad_norm": 0.003918538335710764,
+      "learning_rate": 0.001,
+      "loss": 0.3767,
+      "step": 8145
+    },
+    {
+      "epoch": 0.22476606987035114,
+      "grad_norm": 0.0030788714066147804,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 8146
+    },
+    {
+      "epoch": 0.22479366207141552,
+      "grad_norm": 0.0029016491025686264,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 8147
+    },
+    {
+      "epoch": 0.22482125427247987,
+      "grad_norm": 0.0023613544180989265,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 8148
+    },
+    {
+      "epoch": 0.22484884647354425,
+      "grad_norm": 0.0021770396269857883,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 8149
+    },
+    {
+      "epoch": 0.22487643867460863,
+      "grad_norm": 0.004851729609072208,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 8150
+    },
+    {
+      "epoch": 0.22490403087567298,
+      "grad_norm": 0.0023205161560326815,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 8151
+    },
+    {
+      "epoch": 0.22493162307673736,
+      "grad_norm": 0.0027498926501721144,
+      "learning_rate": 0.001,
+      "loss": 0.3587,
+      "step": 8152
+    },
+    {
+      "epoch": 0.22495921527780172,
+      "grad_norm": 0.004693325143307447,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 8153
+    },
+    {
+      "epoch": 0.2249868074788661,
+      "grad_norm": 0.0030215794686228037,
+      "learning_rate": 0.001,
+      "loss": 0.4149,
+      "step": 8154
+    },
+    {
+      "epoch": 0.22501439967993048,
+      "grad_norm": 0.0024281737860292196,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 8155
+    },
+    {
+      "epoch": 0.22504199188099483,
+      "grad_norm": 0.006052395328879356,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 8156
+    },
+    {
+      "epoch": 0.2250695840820592,
+      "grad_norm": 0.005301719065755606,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 8157
+    },
+    {
+      "epoch": 0.22509717628312356,
+      "grad_norm": 0.0024900834541767836,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 8158
+    },
+    {
+      "epoch": 0.22512476848418794,
+      "grad_norm": 0.00321084912866354,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 8159
+    },
+    {
+      "epoch": 0.22515236068525232,
+      "grad_norm": 0.002374766394495964,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 8160
+    },
+    {
+      "epoch": 0.22517995288631668,
+      "grad_norm": 0.002608582377433777,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 8161
+    },
+    {
+      "epoch": 0.22520754508738106,
+      "grad_norm": 0.0029289585072547197,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 8162
+    },
+    {
+      "epoch": 0.2252351372884454,
+      "grad_norm": 0.0028822184540331364,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 8163
+    },
+    {
+      "epoch": 0.2252627294895098,
+      "grad_norm": 0.004350913688540459,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 8164
+    },
+    {
+      "epoch": 0.22529032169057417,
+      "grad_norm": 0.0049639069475233555,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 8165
+    },
+    {
+      "epoch": 0.22531791389163852,
+      "grad_norm": 0.005849256180226803,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 8166
+    },
+    {
+      "epoch": 0.2253455060927029,
+      "grad_norm": 0.007307564374059439,
+      "learning_rate": 0.001,
+      "loss": 0.4227,
+      "step": 8167
+    },
+    {
+      "epoch": 0.22537309829376725,
+      "grad_norm": 0.0058277989737689495,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 8168
+    },
+    {
+      "epoch": 0.22540069049483163,
+      "grad_norm": 0.0024729755241423845,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 8169
+    },
+    {
+      "epoch": 0.22542828269589602,
+      "grad_norm": 0.0030476911924779415,
+      "learning_rate": 0.001,
+      "loss": 0.4271,
+      "step": 8170
+    },
+    {
+      "epoch": 0.22545587489696037,
+      "grad_norm": 0.003712064353749156,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 8171
+    },
+    {
+      "epoch": 0.22548346709802475,
+      "grad_norm": 0.0031436847057193518,
+      "learning_rate": 0.001,
+      "loss": 0.3763,
+      "step": 8172
+    },
+    {
+      "epoch": 0.2255110592990891,
+      "grad_norm": 0.002069181762635708,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 8173
+    },
+    {
+      "epoch": 0.22553865150015348,
+      "grad_norm": 0.0023455459158867598,
+      "learning_rate": 0.001,
+      "loss": 0.4161,
+      "step": 8174
+    },
+    {
+      "epoch": 0.22556624370121786,
+      "grad_norm": 0.005130277015268803,
+      "learning_rate": 0.001,
+      "loss": 0.3735,
+      "step": 8175
+    },
+    {
+      "epoch": 0.2255938359022822,
+      "grad_norm": 0.002300563734024763,
+      "learning_rate": 0.001,
+      "loss": 0.4008,
+      "step": 8176
+    },
+    {
+      "epoch": 0.2256214281033466,
+      "grad_norm": 0.002703635022044182,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 8177
+    },
+    {
+      "epoch": 0.22564902030441095,
+      "grad_norm": 0.00716767180711031,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 8178
+    },
+    {
+      "epoch": 0.22567661250547533,
+      "grad_norm": 0.002465134486556053,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 8179
+    },
+    {
+      "epoch": 0.2257042047065397,
+      "grad_norm": 0.004032150376588106,
+      "learning_rate": 0.001,
+      "loss": 0.4256,
+      "step": 8180
+    },
+    {
+      "epoch": 0.22573179690760406,
+      "grad_norm": 0.0025379545986652374,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 8181
+    },
+    {
+      "epoch": 0.22575938910866844,
+      "grad_norm": 0.0032701275777071714,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 8182
+    },
+    {
+      "epoch": 0.2257869813097328,
+      "grad_norm": 0.002877070102840662,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 8183
+    },
+    {
+      "epoch": 0.22581457351079717,
+      "grad_norm": 0.003329310566186905,
+      "learning_rate": 0.001,
+      "loss": 0.4441,
+      "step": 8184
+    },
+    {
+      "epoch": 0.22584216571186155,
+      "grad_norm": 0.003345692064613104,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 8185
+    },
+    {
+      "epoch": 0.2258697579129259,
+      "grad_norm": 0.002473951783031225,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 8186
+    },
+    {
+      "epoch": 0.2258973501139903,
+      "grad_norm": 0.0027697773184627295,
+      "learning_rate": 0.001,
+      "loss": 0.4272,
+      "step": 8187
+    },
+    {
+      "epoch": 0.22592494231505464,
+      "grad_norm": 0.00608485285192728,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 8188
+    },
+    {
+      "epoch": 0.22595253451611902,
+      "grad_norm": 0.0026021813973784447,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 8189
+    },
+    {
+      "epoch": 0.2259801267171834,
+      "grad_norm": 0.003049626713618636,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 8190
+    },
+    {
+      "epoch": 0.22600771891824775,
+      "grad_norm": 0.0028316755779087543,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 8191
+    },
+    {
+      "epoch": 0.22603531111931213,
+      "grad_norm": 0.0024335982743650675,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 8192
+    },
+    {
+      "epoch": 0.22606290332037648,
+      "grad_norm": 0.0019406350329518318,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 8193
+    },
+    {
+      "epoch": 0.22609049552144087,
+      "grad_norm": 0.0031156607437878847,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 8194
+    },
+    {
+      "epoch": 0.22611808772250525,
+      "grad_norm": 0.0024029607884585857,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 8195
+    },
+    {
+      "epoch": 0.2261456799235696,
+      "grad_norm": 0.002428837353363633,
+      "learning_rate": 0.001,
+      "loss": 0.3735,
+      "step": 8196
+    },
+    {
+      "epoch": 0.22617327212463398,
+      "grad_norm": 0.003429250791668892,
+      "learning_rate": 0.001,
+      "loss": 0.4288,
+      "step": 8197
+    },
+    {
+      "epoch": 0.22620086432569833,
+      "grad_norm": 0.003104103496298194,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 8198
+    },
+    {
+      "epoch": 0.2262284565267627,
+      "grad_norm": 0.002276889281347394,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 8199
+    },
+    {
+      "epoch": 0.2262560487278271,
+      "grad_norm": 0.002815461019054055,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 8200
+    },
+    {
+      "epoch": 0.22628364092889144,
+      "grad_norm": 0.0021125515922904015,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 8201
+    },
+    {
+      "epoch": 0.22631123312995582,
+      "grad_norm": 0.0026191947981715202,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 8202
+    },
+    {
+      "epoch": 0.22633882533102018,
+      "grad_norm": 0.002689339919015765,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 8203
+    },
+    {
+      "epoch": 0.22636641753208456,
+      "grad_norm": 0.00354541908018291,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 8204
+    },
+    {
+      "epoch": 0.22639400973314894,
+      "grad_norm": 0.0037923678755760193,
+      "learning_rate": 0.001,
+      "loss": 0.4343,
+      "step": 8205
+    },
+    {
+      "epoch": 0.2264216019342133,
+      "grad_norm": 0.004423064645379782,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 8206
+    },
+    {
+      "epoch": 0.22644919413527767,
+      "grad_norm": 0.004195543006062508,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 8207
+    },
+    {
+      "epoch": 0.22647678633634202,
+      "grad_norm": 0.0025238455273211002,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 8208
+    },
+    {
+      "epoch": 0.2265043785374064,
+      "grad_norm": 0.0026775554288178682,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 8209
+    },
+    {
+      "epoch": 0.22653197073847078,
+      "grad_norm": 0.004028724506497383,
+      "learning_rate": 0.001,
+      "loss": 0.3778,
+      "step": 8210
+    },
+    {
+      "epoch": 0.22655956293953514,
+      "grad_norm": 0.002054669661447406,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 8211
+    },
+    {
+      "epoch": 0.22658715514059952,
+      "grad_norm": 0.003321393858641386,
+      "learning_rate": 0.001,
+      "loss": 0.3685,
+      "step": 8212
+    },
+    {
+      "epoch": 0.22661474734166387,
+      "grad_norm": 0.0035699696745723486,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 8213
+    },
+    {
+      "epoch": 0.22664233954272825,
+      "grad_norm": 0.004189506638795137,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 8214
+    },
+    {
+      "epoch": 0.22666993174379263,
+      "grad_norm": 0.003797245444729924,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 8215
+    },
+    {
+      "epoch": 0.22669752394485698,
+      "grad_norm": 0.0045156171545386314,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 8216
+    },
+    {
+      "epoch": 0.22672511614592136,
+      "grad_norm": 0.0026707893703132868,
+      "learning_rate": 0.001,
+      "loss": 0.3575,
+      "step": 8217
+    },
+    {
+      "epoch": 0.22675270834698572,
+      "grad_norm": 0.0025783004239201546,
+      "learning_rate": 0.001,
+      "loss": 0.4373,
+      "step": 8218
+    },
+    {
+      "epoch": 0.2267803005480501,
+      "grad_norm": 0.004405698273330927,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 8219
+    },
+    {
+      "epoch": 0.22680789274911448,
+      "grad_norm": 0.00231956597417593,
+      "learning_rate": 0.001,
+      "loss": 0.3762,
+      "step": 8220
+    },
+    {
+      "epoch": 0.22683548495017883,
+      "grad_norm": 0.0033627781085669994,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 8221
+    },
+    {
+      "epoch": 0.2268630771512432,
+      "grad_norm": 0.003316913964226842,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 8222
+    },
+    {
+      "epoch": 0.22689066935230756,
+      "grad_norm": 0.0026452546007931232,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 8223
+    },
+    {
+      "epoch": 0.22691826155337194,
+      "grad_norm": 0.004451108165085316,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 8224
+    },
+    {
+      "epoch": 0.22694585375443632,
+      "grad_norm": 0.0040152668952941895,
+      "learning_rate": 0.001,
+      "loss": 0.3664,
+      "step": 8225
+    },
+    {
+      "epoch": 0.22697344595550067,
+      "grad_norm": 0.003606460290029645,
+      "learning_rate": 0.001,
+      "loss": 0.441,
+      "step": 8226
+    },
+    {
+      "epoch": 0.22700103815656505,
+      "grad_norm": 0.003297601593658328,
+      "learning_rate": 0.001,
+      "loss": 0.4248,
+      "step": 8227
+    },
+    {
+      "epoch": 0.2270286303576294,
+      "grad_norm": 0.003477993654087186,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 8228
+    },
+    {
+      "epoch": 0.2270562225586938,
+      "grad_norm": 0.002349859569221735,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 8229
+    },
+    {
+      "epoch": 0.22708381475975817,
+      "grad_norm": 0.003242079634219408,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 8230
+    },
+    {
+      "epoch": 0.22711140696082252,
+      "grad_norm": 0.007759904023259878,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 8231
+    },
+    {
+      "epoch": 0.2271389991618869,
+      "grad_norm": 0.003948306664824486,
+      "learning_rate": 0.001,
+      "loss": 0.3749,
+      "step": 8232
+    },
+    {
+      "epoch": 0.22716659136295125,
+      "grad_norm": 0.002385435625910759,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 8233
+    },
+    {
+      "epoch": 0.22719418356401563,
+      "grad_norm": 0.0032237351406365633,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 8234
+    },
+    {
+      "epoch": 0.22722177576508,
+      "grad_norm": 0.0027539869770407677,
+      "learning_rate": 0.001,
+      "loss": 0.4185,
+      "step": 8235
+    },
+    {
+      "epoch": 0.22724936796614437,
+      "grad_norm": 0.002958692843094468,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 8236
+    },
+    {
+      "epoch": 0.22727696016720875,
+      "grad_norm": 0.005276213400065899,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 8237
+    },
+    {
+      "epoch": 0.2273045523682731,
+      "grad_norm": 0.0028638257645070553,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 8238
+    },
+    {
+      "epoch": 0.22733214456933748,
+      "grad_norm": 0.0034719184041023254,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 8239
+    },
+    {
+      "epoch": 0.22735973677040183,
+      "grad_norm": 0.002775615779682994,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 8240
+    },
+    {
+      "epoch": 0.2273873289714662,
+      "grad_norm": 0.0028845765627920628,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 8241
+    },
+    {
+      "epoch": 0.2274149211725306,
+      "grad_norm": 0.002906092908233404,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 8242
+    },
+    {
+      "epoch": 0.22744251337359495,
+      "grad_norm": 0.0031754986848682165,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 8243
+    },
+    {
+      "epoch": 0.22747010557465933,
+      "grad_norm": 0.00252131768502295,
+      "learning_rate": 0.001,
+      "loss": 0.4347,
+      "step": 8244
+    },
+    {
+      "epoch": 0.22749769777572368,
+      "grad_norm": 0.0025290907360613346,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 8245
+    },
+    {
+      "epoch": 0.22752528997678806,
+      "grad_norm": 0.0034006794448941946,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 8246
+    },
+    {
+      "epoch": 0.22755288217785244,
+      "grad_norm": 0.002173987217247486,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 8247
+    },
+    {
+      "epoch": 0.2275804743789168,
+      "grad_norm": 0.002650484209880233,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 8248
+    },
+    {
+      "epoch": 0.22760806657998117,
+      "grad_norm": 0.0069848657585680485,
+      "learning_rate": 0.001,
+      "loss": 0.3648,
+      "step": 8249
+    },
+    {
+      "epoch": 0.22763565878104552,
+      "grad_norm": 0.002296354155987501,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 8250
+    },
+    {
+      "epoch": 0.2276632509821099,
+      "grad_norm": 0.0023510761093348265,
+      "learning_rate": 0.001,
+      "loss": 0.348,
+      "step": 8251
+    },
+    {
+      "epoch": 0.22769084318317429,
+      "grad_norm": 0.002062094397842884,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 8252
+    },
+    {
+      "epoch": 0.22771843538423864,
+      "grad_norm": 0.004330387804657221,
+      "learning_rate": 0.001,
+      "loss": 0.3546,
+      "step": 8253
+    },
+    {
+      "epoch": 0.22774602758530302,
+      "grad_norm": 0.0027810055762529373,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 8254
+    },
+    {
+      "epoch": 0.22777361978636737,
+      "grad_norm": 0.0025015720166265965,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 8255
+    },
+    {
+      "epoch": 0.22780121198743175,
+      "grad_norm": 0.004701843950897455,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 8256
+    },
+    {
+      "epoch": 0.22782880418849613,
+      "grad_norm": 0.0027081798762083054,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 8257
+    },
+    {
+      "epoch": 0.22785639638956048,
+      "grad_norm": 0.00656101293861866,
+      "learning_rate": 0.001,
+      "loss": 0.3426,
+      "step": 8258
+    },
+    {
+      "epoch": 0.22788398859062486,
+      "grad_norm": 0.004077386576682329,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 8259
+    },
+    {
+      "epoch": 0.22791158079168922,
+      "grad_norm": 0.005094864405691624,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 8260
+    },
+    {
+      "epoch": 0.2279391729927536,
+      "grad_norm": 0.003425776259973645,
+      "learning_rate": 0.001,
+      "loss": 0.3538,
+      "step": 8261
+    },
+    {
+      "epoch": 0.22796676519381798,
+      "grad_norm": 0.0027753396425396204,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 8262
+    },
+    {
+      "epoch": 0.22799435739488233,
+      "grad_norm": 0.00263264961540699,
+      "learning_rate": 0.001,
+      "loss": 0.4395,
+      "step": 8263
+    },
+    {
+      "epoch": 0.2280219495959467,
+      "grad_norm": 0.002466138917952776,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 8264
+    },
+    {
+      "epoch": 0.22804954179701106,
+      "grad_norm": 0.0038233925588428974,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 8265
+    },
+    {
+      "epoch": 0.22807713399807544,
+      "grad_norm": 0.004745953716337681,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 8266
+    },
+    {
+      "epoch": 0.22810472619913982,
+      "grad_norm": 0.0024268226698040962,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 8267
+    },
+    {
+      "epoch": 0.22813231840020418,
+      "grad_norm": 0.002283599926158786,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 8268
+    },
+    {
+      "epoch": 0.22815991060126856,
+      "grad_norm": 0.003483585547655821,
+      "learning_rate": 0.001,
+      "loss": 0.3517,
+      "step": 8269
+    },
+    {
+      "epoch": 0.2281875028023329,
+      "grad_norm": 0.002310832031071186,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 8270
+    },
+    {
+      "epoch": 0.2282150950033973,
+      "grad_norm": 0.0025413238909095526,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 8271
+    },
+    {
+      "epoch": 0.22824268720446167,
+      "grad_norm": 0.002345605753362179,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 8272
+    },
+    {
+      "epoch": 0.22827027940552602,
+      "grad_norm": 0.005003250669687986,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 8273
+    },
+    {
+      "epoch": 0.2282978716065904,
+      "grad_norm": 0.002230096375569701,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 8274
+    },
+    {
+      "epoch": 0.22832546380765475,
+      "grad_norm": 0.0022804015316069126,
+      "learning_rate": 0.001,
+      "loss": 0.429,
+      "step": 8275
+    },
+    {
+      "epoch": 0.22835305600871914,
+      "grad_norm": 0.004967406392097473,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 8276
+    },
+    {
+      "epoch": 0.22838064820978352,
+      "grad_norm": 0.003264637663960457,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 8277
+    },
+    {
+      "epoch": 0.22840824041084787,
+      "grad_norm": 0.0022652260959148407,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 8278
+    },
+    {
+      "epoch": 0.22843583261191225,
+      "grad_norm": 0.002336147939786315,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 8279
+    },
+    {
+      "epoch": 0.2284634248129766,
+      "grad_norm": 0.00243670167401433,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 8280
+    },
+    {
+      "epoch": 0.22849101701404098,
+      "grad_norm": 0.0036559735890477896,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 8281
+    },
+    {
+      "epoch": 0.22851860921510536,
+      "grad_norm": 0.007312767673283815,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 8282
+    },
+    {
+      "epoch": 0.22854620141616971,
+      "grad_norm": 0.008099487982690334,
+      "learning_rate": 0.001,
+      "loss": 0.356,
+      "step": 8283
+    },
+    {
+      "epoch": 0.2285737936172341,
+      "grad_norm": 0.0025754349771887064,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 8284
+    },
+    {
+      "epoch": 0.22860138581829845,
+      "grad_norm": 0.002983193611726165,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 8285
+    },
+    {
+      "epoch": 0.22862897801936283,
+      "grad_norm": 0.0024407468736171722,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 8286
+    },
+    {
+      "epoch": 0.2286565702204272,
+      "grad_norm": 0.004473252687603235,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 8287
+    },
+    {
+      "epoch": 0.22868416242149156,
+      "grad_norm": 0.002675954718142748,
+      "learning_rate": 0.001,
+      "loss": 0.4297,
+      "step": 8288
+    },
+    {
+      "epoch": 0.22871175462255594,
+      "grad_norm": 0.0025287093594670296,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 8289
+    },
+    {
+      "epoch": 0.2287393468236203,
+      "grad_norm": 0.002895546378567815,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 8290
+    },
+    {
+      "epoch": 0.22876693902468467,
+      "grad_norm": 0.003705421229824424,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 8291
+    },
+    {
+      "epoch": 0.22879453122574905,
+      "grad_norm": 0.003449185285717249,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 8292
+    },
+    {
+      "epoch": 0.2288221234268134,
+      "grad_norm": 0.0022859005257487297,
+      "learning_rate": 0.001,
+      "loss": 0.3739,
+      "step": 8293
+    },
+    {
+      "epoch": 0.2288497156278778,
+      "grad_norm": 0.0023443542886525393,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 8294
+    },
+    {
+      "epoch": 0.22887730782894214,
+      "grad_norm": 0.0021697822958230972,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 8295
+    },
+    {
+      "epoch": 0.22890490003000652,
+      "grad_norm": 0.0023217375855892897,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 8296
+    },
+    {
+      "epoch": 0.2289324922310709,
+      "grad_norm": 0.005043079610913992,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 8297
+    },
+    {
+      "epoch": 0.22896008443213525,
+      "grad_norm": 0.004579662811011076,
+      "learning_rate": 0.001,
+      "loss": 0.3543,
+      "step": 8298
+    },
+    {
+      "epoch": 0.22898767663319963,
+      "grad_norm": 0.0032538543455302715,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 8299
+    },
+    {
+      "epoch": 0.22901526883426399,
+      "grad_norm": 0.00212521362118423,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 8300
+    },
+    {
+      "epoch": 0.22904286103532837,
+      "grad_norm": 0.003112571081146598,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 8301
+    },
+    {
+      "epoch": 0.22907045323639275,
+      "grad_norm": 0.0030887876637279987,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 8302
+    },
+    {
+      "epoch": 0.2290980454374571,
+      "grad_norm": 0.002820172579959035,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 8303
+    },
+    {
+      "epoch": 0.22912563763852148,
+      "grad_norm": 0.003904817858710885,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 8304
+    },
+    {
+      "epoch": 0.22915322983958583,
+      "grad_norm": 0.003779762424528599,
+      "learning_rate": 0.001,
+      "loss": 0.434,
+      "step": 8305
+    },
+    {
+      "epoch": 0.2291808220406502,
+      "grad_norm": 0.0025575195904821157,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 8306
+    },
+    {
+      "epoch": 0.2292084142417146,
+      "grad_norm": 0.0039753662422299385,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 8307
+    },
+    {
+      "epoch": 0.22923600644277894,
+      "grad_norm": 0.0023605753667652607,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 8308
+    },
+    {
+      "epoch": 0.22926359864384332,
+      "grad_norm": 0.0050887493416666985,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 8309
+    },
+    {
+      "epoch": 0.22929119084490768,
+      "grad_norm": 0.0026991376653313637,
+      "learning_rate": 0.001,
+      "loss": 0.3734,
+      "step": 8310
+    },
+    {
+      "epoch": 0.22931878304597206,
+      "grad_norm": 0.0030844821594655514,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 8311
+    },
+    {
+      "epoch": 0.22934637524703644,
+      "grad_norm": 0.0029710521921515465,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 8312
+    },
+    {
+      "epoch": 0.2293739674481008,
+      "grad_norm": 0.0033121525775641203,
+      "learning_rate": 0.001,
+      "loss": 0.3688,
+      "step": 8313
+    },
+    {
+      "epoch": 0.22940155964916517,
+      "grad_norm": 0.002841067034751177,
+      "learning_rate": 0.001,
+      "loss": 0.3627,
+      "step": 8314
+    },
+    {
+      "epoch": 0.22942915185022952,
+      "grad_norm": 0.003009919775649905,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 8315
+    },
+    {
+      "epoch": 0.2294567440512939,
+      "grad_norm": 0.007487253285944462,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 8316
+    },
+    {
+      "epoch": 0.22948433625235828,
+      "grad_norm": 0.003841083962470293,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 8317
+    },
+    {
+      "epoch": 0.22951192845342264,
+      "grad_norm": 0.003769118804484606,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 8318
+    },
+    {
+      "epoch": 0.22953952065448702,
+      "grad_norm": 0.0026735500432550907,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 8319
+    },
+    {
+      "epoch": 0.22956711285555137,
+      "grad_norm": 0.0035852077417075634,
+      "learning_rate": 0.001,
+      "loss": 0.376,
+      "step": 8320
+    },
+    {
+      "epoch": 0.22959470505661575,
+      "grad_norm": 0.004404496867209673,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 8321
+    },
+    {
+      "epoch": 0.22962229725768013,
+      "grad_norm": 0.004810912534594536,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 8322
+    },
+    {
+      "epoch": 0.22964988945874448,
+      "grad_norm": 0.002430463209748268,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 8323
+    },
+    {
+      "epoch": 0.22967748165980886,
+      "grad_norm": 0.0034299069084227085,
+      "learning_rate": 0.001,
+      "loss": 0.3697,
+      "step": 8324
+    },
+    {
+      "epoch": 0.22970507386087322,
+      "grad_norm": 0.002617501188069582,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 8325
+    },
+    {
+      "epoch": 0.2297326660619376,
+      "grad_norm": 0.002159700496122241,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 8326
+    },
+    {
+      "epoch": 0.22976025826300198,
+      "grad_norm": 0.003289289539679885,
+      "learning_rate": 0.001,
+      "loss": 0.4243,
+      "step": 8327
+    },
+    {
+      "epoch": 0.22978785046406633,
+      "grad_norm": 0.0034644966945052147,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 8328
+    },
+    {
+      "epoch": 0.2298154426651307,
+      "grad_norm": 0.005093908403068781,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 8329
+    },
+    {
+      "epoch": 0.22984303486619506,
+      "grad_norm": 0.0033900076523423195,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 8330
+    },
+    {
+      "epoch": 0.22987062706725944,
+      "grad_norm": 0.0024000401608645916,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 8331
+    },
+    {
+      "epoch": 0.2298982192683238,
+      "grad_norm": 0.0034197689965367317,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 8332
+    },
+    {
+      "epoch": 0.22992581146938817,
+      "grad_norm": 0.0020301672630012035,
+      "learning_rate": 0.001,
+      "loss": 0.4249,
+      "step": 8333
+    },
+    {
+      "epoch": 0.22995340367045256,
+      "grad_norm": 0.002956526121124625,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 8334
+    },
+    {
+      "epoch": 0.2299809958715169,
+      "grad_norm": 0.003122934838756919,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 8335
+    },
+    {
+      "epoch": 0.2300085880725813,
+      "grad_norm": 0.005318638868629932,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 8336
+    },
+    {
+      "epoch": 0.23003618027364564,
+      "grad_norm": 0.0033183018676936626,
+      "learning_rate": 0.001,
+      "loss": 0.3753,
+      "step": 8337
+    },
+    {
+      "epoch": 0.23006377247471002,
+      "grad_norm": 0.0026557277888059616,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 8338
+    },
+    {
+      "epoch": 0.2300913646757744,
+      "grad_norm": 0.002340905833989382,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 8339
+    },
+    {
+      "epoch": 0.23011895687683875,
+      "grad_norm": 0.002298276871442795,
+      "learning_rate": 0.001,
+      "loss": 0.4251,
+      "step": 8340
+    },
+    {
+      "epoch": 0.23014654907790313,
+      "grad_norm": 0.004820041824132204,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 8341
+    },
+    {
+      "epoch": 0.2301741412789675,
+      "grad_norm": 0.00302408030256629,
+      "learning_rate": 0.001,
+      "loss": 0.3706,
+      "step": 8342
+    },
+    {
+      "epoch": 0.23020173348003187,
+      "grad_norm": 0.002940029837191105,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 8343
+    },
+    {
+      "epoch": 0.23022932568109625,
+      "grad_norm": 0.007366549223661423,
+      "learning_rate": 0.001,
+      "loss": 0.3765,
+      "step": 8344
+    },
+    {
+      "epoch": 0.2302569178821606,
+      "grad_norm": 0.00624980591237545,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 8345
+    },
+    {
+      "epoch": 0.23028451008322498,
+      "grad_norm": 0.003244641702622175,
+      "learning_rate": 0.001,
+      "loss": 0.3637,
+      "step": 8346
+    },
+    {
+      "epoch": 0.23031210228428933,
+      "grad_norm": 0.003316201502457261,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 8347
+    },
+    {
+      "epoch": 0.2303396944853537,
+      "grad_norm": 0.004240705166012049,
+      "learning_rate": 0.001,
+      "loss": 0.3763,
+      "step": 8348
+    },
+    {
+      "epoch": 0.2303672866864181,
+      "grad_norm": 0.004042000509798527,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 8349
+    },
+    {
+      "epoch": 0.23039487888748245,
+      "grad_norm": 0.013157385401427746,
+      "learning_rate": 0.001,
+      "loss": 0.4483,
+      "step": 8350
+    },
+    {
+      "epoch": 0.23042247108854683,
+      "grad_norm": 0.014354042708873749,
+      "learning_rate": 0.001,
+      "loss": 0.3777,
+      "step": 8351
+    },
+    {
+      "epoch": 0.23045006328961118,
+      "grad_norm": 0.003511092159897089,
+      "learning_rate": 0.001,
+      "loss": 0.417,
+      "step": 8352
+    },
+    {
+      "epoch": 0.23047765549067556,
+      "grad_norm": 0.0031483755446970463,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 8353
+    },
+    {
+      "epoch": 0.23050524769173994,
+      "grad_norm": 0.0037154678720980883,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 8354
+    },
+    {
+      "epoch": 0.2305328398928043,
+      "grad_norm": 0.004443106707185507,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 8355
+    },
+    {
+      "epoch": 0.23056043209386867,
+      "grad_norm": 0.0030360580421984196,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 8356
+    },
+    {
+      "epoch": 0.23058802429493302,
+      "grad_norm": 0.0023466795682907104,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 8357
+    },
+    {
+      "epoch": 0.2306156164959974,
+      "grad_norm": 0.005221103318035603,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 8358
+    },
+    {
+      "epoch": 0.23064320869706179,
+      "grad_norm": 0.0028962953947484493,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 8359
+    },
+    {
+      "epoch": 0.23067080089812614,
+      "grad_norm": 0.0029752026312053204,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 8360
+    },
+    {
+      "epoch": 0.23069839309919052,
+      "grad_norm": 0.005493995267897844,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 8361
+    },
+    {
+      "epoch": 0.23072598530025487,
+      "grad_norm": 0.0037254132330417633,
+      "learning_rate": 0.001,
+      "loss": 0.3481,
+      "step": 8362
+    },
+    {
+      "epoch": 0.23075357750131925,
+      "grad_norm": 0.0025524902157485485,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 8363
+    },
+    {
+      "epoch": 0.23078116970238363,
+      "grad_norm": 0.0027644087094813585,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 8364
+    },
+    {
+      "epoch": 0.23080876190344798,
+      "grad_norm": 0.0027628885582089424,
+      "learning_rate": 0.001,
+      "loss": 0.4303,
+      "step": 8365
+    },
+    {
+      "epoch": 0.23083635410451236,
+      "grad_norm": 0.0039875865913927555,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 8366
+    },
+    {
+      "epoch": 0.23086394630557672,
+      "grad_norm": 0.003650275059044361,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 8367
+    },
+    {
+      "epoch": 0.2308915385066411,
+      "grad_norm": 0.0026859240606427193,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 8368
+    },
+    {
+      "epoch": 0.23091913070770548,
+      "grad_norm": 0.004638598766177893,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 8369
+    },
+    {
+      "epoch": 0.23094672290876983,
+      "grad_norm": 0.0029127905145287514,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 8370
+    },
+    {
+      "epoch": 0.2309743151098342,
+      "grad_norm": 0.01036902703344822,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 8371
+    },
+    {
+      "epoch": 0.23100190731089856,
+      "grad_norm": 0.0038089959416538477,
+      "learning_rate": 0.001,
+      "loss": 0.3554,
+      "step": 8372
+    },
+    {
+      "epoch": 0.23102949951196294,
+      "grad_norm": 0.0037768797483295202,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 8373
+    },
+    {
+      "epoch": 0.23105709171302732,
+      "grad_norm": 0.006050050258636475,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 8374
+    },
+    {
+      "epoch": 0.23108468391409168,
+      "grad_norm": 0.0037368466146290302,
+      "learning_rate": 0.001,
+      "loss": 0.4804,
+      "step": 8375
+    },
+    {
+      "epoch": 0.23111227611515606,
+      "grad_norm": 0.002388233318924904,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 8376
+    },
+    {
+      "epoch": 0.2311398683162204,
+      "grad_norm": 0.002968754153698683,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 8377
+    },
+    {
+      "epoch": 0.2311674605172848,
+      "grad_norm": 0.005483376793563366,
+      "learning_rate": 0.001,
+      "loss": 0.418,
+      "step": 8378
+    },
+    {
+      "epoch": 0.23119505271834917,
+      "grad_norm": 0.0025941620115190744,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 8379
+    },
+    {
+      "epoch": 0.23122264491941352,
+      "grad_norm": 0.0066278777085244656,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 8380
+    },
+    {
+      "epoch": 0.2312502371204779,
+      "grad_norm": 0.00500098429620266,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 8381
+    },
+    {
+      "epoch": 0.23127782932154226,
+      "grad_norm": 0.002849952783435583,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 8382
+    },
+    {
+      "epoch": 0.23130542152260664,
+      "grad_norm": 0.0047418465837836266,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 8383
+    },
+    {
+      "epoch": 0.23133301372367102,
+      "grad_norm": 0.005365386605262756,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 8384
+    },
+    {
+      "epoch": 0.23136060592473537,
+      "grad_norm": 0.002952918875962496,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 8385
+    },
+    {
+      "epoch": 0.23138819812579975,
+      "grad_norm": 0.003814739640802145,
+      "learning_rate": 0.001,
+      "loss": 0.4248,
+      "step": 8386
+    },
+    {
+      "epoch": 0.2314157903268641,
+      "grad_norm": 0.0031040378380566835,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 8387
+    },
+    {
+      "epoch": 0.23144338252792848,
+      "grad_norm": 0.0033247387036681175,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 8388
+    },
+    {
+      "epoch": 0.23147097472899286,
+      "grad_norm": 0.002116522518917918,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 8389
+    },
+    {
+      "epoch": 0.23149856693005721,
+      "grad_norm": 0.0033864439465105534,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 8390
+    },
+    {
+      "epoch": 0.2315261591311216,
+      "grad_norm": 0.0037893191911280155,
+      "learning_rate": 0.001,
+      "loss": 0.4271,
+      "step": 8391
+    },
+    {
+      "epoch": 0.23155375133218595,
+      "grad_norm": 0.0026662342716008425,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 8392
+    },
+    {
+      "epoch": 0.23158134353325033,
+      "grad_norm": 0.002584991976618767,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 8393
+    },
+    {
+      "epoch": 0.2316089357343147,
+      "grad_norm": 0.002258981578052044,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 8394
+    },
+    {
+      "epoch": 0.23163652793537906,
+      "grad_norm": 0.004243454430252314,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 8395
+    },
+    {
+      "epoch": 0.23166412013644344,
+      "grad_norm": 0.004912800621241331,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 8396
+    },
+    {
+      "epoch": 0.2316917123375078,
+      "grad_norm": 0.0038783997297286987,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 8397
+    },
+    {
+      "epoch": 0.23171930453857217,
+      "grad_norm": 0.0028120707720518112,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 8398
+    },
+    {
+      "epoch": 0.23174689673963655,
+      "grad_norm": 0.003508375259116292,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 8399
+    },
+    {
+      "epoch": 0.2317744889407009,
+      "grad_norm": 0.0044393884018063545,
+      "learning_rate": 0.001,
+      "loss": 0.3629,
+      "step": 8400
+    },
+    {
+      "epoch": 0.2318020811417653,
+      "grad_norm": 0.006484494544565678,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 8401
+    },
+    {
+      "epoch": 0.23182967334282964,
+      "grad_norm": 0.0041907550767064095,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 8402
+    },
+    {
+      "epoch": 0.23185726554389402,
+      "grad_norm": 0.002757348818704486,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 8403
+    },
+    {
+      "epoch": 0.2318848577449584,
+      "grad_norm": 0.004819660913199186,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 8404
+    },
+    {
+      "epoch": 0.23191244994602275,
+      "grad_norm": 0.0022516471799463034,
+      "learning_rate": 0.001,
+      "loss": 0.4485,
+      "step": 8405
+    },
+    {
+      "epoch": 0.23194004214708713,
+      "grad_norm": 0.002369027817621827,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 8406
+    },
+    {
+      "epoch": 0.23196763434815149,
+      "grad_norm": 0.0035298550501465797,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 8407
+    },
+    {
+      "epoch": 0.23199522654921587,
+      "grad_norm": 0.002744413213804364,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 8408
+    },
+    {
+      "epoch": 0.23202281875028025,
+      "grad_norm": 0.002334202639758587,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 8409
+    },
+    {
+      "epoch": 0.2320504109513446,
+      "grad_norm": 0.0026494518388062716,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 8410
+    },
+    {
+      "epoch": 0.23207800315240898,
+      "grad_norm": 0.002644699066877365,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 8411
+    },
+    {
+      "epoch": 0.23210559535347333,
+      "grad_norm": 0.0030237159226089716,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 8412
+    },
+    {
+      "epoch": 0.2321331875545377,
+      "grad_norm": 0.003086669836193323,
+      "learning_rate": 0.001,
+      "loss": 0.3747,
+      "step": 8413
+    },
+    {
+      "epoch": 0.2321607797556021,
+      "grad_norm": 0.002666539279744029,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 8414
+    },
+    {
+      "epoch": 0.23218837195666645,
+      "grad_norm": 0.0034365833271294832,
+      "learning_rate": 0.001,
+      "loss": 0.3816,
+      "step": 8415
+    },
+    {
+      "epoch": 0.23221596415773083,
+      "grad_norm": 0.0030753256287425756,
+      "learning_rate": 0.001,
+      "loss": 0.3846,
+      "step": 8416
+    },
+    {
+      "epoch": 0.23224355635879518,
+      "grad_norm": 0.003509074915200472,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 8417
+    },
+    {
+      "epoch": 0.23227114855985956,
+      "grad_norm": 0.005115900654345751,
+      "learning_rate": 0.001,
+      "loss": 0.4373,
+      "step": 8418
+    },
+    {
+      "epoch": 0.23229874076092394,
+      "grad_norm": 0.005185318179428577,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 8419
+    },
+    {
+      "epoch": 0.2323263329619883,
+      "grad_norm": 0.0024755573831498623,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 8420
+    },
+    {
+      "epoch": 0.23235392516305267,
+      "grad_norm": 0.005173765122890472,
+      "learning_rate": 0.001,
+      "loss": 0.3588,
+      "step": 8421
+    },
+    {
+      "epoch": 0.23238151736411702,
+      "grad_norm": 0.004211380612105131,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 8422
+    },
+    {
+      "epoch": 0.2324091095651814,
+      "grad_norm": 0.002343039261177182,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 8423
+    },
+    {
+      "epoch": 0.23243670176624576,
+      "grad_norm": 0.004708436783403158,
+      "learning_rate": 0.001,
+      "loss": 0.3721,
+      "step": 8424
+    },
+    {
+      "epoch": 0.23246429396731014,
+      "grad_norm": 0.004656490869820118,
+      "learning_rate": 0.001,
+      "loss": 0.3696,
+      "step": 8425
+    },
+    {
+      "epoch": 0.23249188616837452,
+      "grad_norm": 0.002325586276128888,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 8426
+    },
+    {
+      "epoch": 0.23251947836943887,
+      "grad_norm": 0.00335716363042593,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 8427
+    },
+    {
+      "epoch": 0.23254707057050325,
+      "grad_norm": 0.00340313115157187,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 8428
+    },
+    {
+      "epoch": 0.2325746627715676,
+      "grad_norm": 0.0026581473648548126,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 8429
+    },
+    {
+      "epoch": 0.23260225497263198,
+      "grad_norm": 0.019147882238030434,
+      "learning_rate": 0.001,
+      "loss": 0.4087,
+      "step": 8430
+    },
+    {
+      "epoch": 0.23262984717369636,
+      "grad_norm": 0.0027826984878629446,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 8431
+    },
+    {
+      "epoch": 0.23265743937476072,
+      "grad_norm": 0.002678795251995325,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 8432
+    },
+    {
+      "epoch": 0.2326850315758251,
+      "grad_norm": 0.002674340968951583,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 8433
+    },
+    {
+      "epoch": 0.23271262377688945,
+      "grad_norm": 0.01514797005802393,
+      "learning_rate": 0.001,
+      "loss": 0.3665,
+      "step": 8434
+    },
+    {
+      "epoch": 0.23274021597795383,
+      "grad_norm": 0.005059736780822277,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 8435
+    },
+    {
+      "epoch": 0.2327678081790182,
+      "grad_norm": 0.0021096577402204275,
+      "learning_rate": 0.001,
+      "loss": 0.4087,
+      "step": 8436
+    },
+    {
+      "epoch": 0.23279540038008256,
+      "grad_norm": 0.003909014165401459,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 8437
+    },
+    {
+      "epoch": 0.23282299258114694,
+      "grad_norm": 0.003196495585143566,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 8438
+    },
+    {
+      "epoch": 0.2328505847822113,
+      "grad_norm": 0.009719678200781345,
+      "learning_rate": 0.001,
+      "loss": 0.3553,
+      "step": 8439
+    },
+    {
+      "epoch": 0.23287817698327568,
+      "grad_norm": 0.0031904377974569798,
+      "learning_rate": 0.001,
+      "loss": 0.4286,
+      "step": 8440
+    },
+    {
+      "epoch": 0.23290576918434006,
+      "grad_norm": 0.0028641929384320974,
+      "learning_rate": 0.001,
+      "loss": 0.4295,
+      "step": 8441
+    },
+    {
+      "epoch": 0.2329333613854044,
+      "grad_norm": 0.0037782087456434965,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 8442
+    },
+    {
+      "epoch": 0.2329609535864688,
+      "grad_norm": 0.002453066175803542,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 8443
+    },
+    {
+      "epoch": 0.23298854578753314,
+      "grad_norm": 0.0037556791212409735,
+      "learning_rate": 0.001,
+      "loss": 0.3777,
+      "step": 8444
+    },
+    {
+      "epoch": 0.23301613798859752,
+      "grad_norm": 0.0029747902881354094,
+      "learning_rate": 0.001,
+      "loss": 0.3621,
+      "step": 8445
+    },
+    {
+      "epoch": 0.2330437301896619,
+      "grad_norm": 0.004175838083028793,
+      "learning_rate": 0.001,
+      "loss": 0.3412,
+      "step": 8446
+    },
+    {
+      "epoch": 0.23307132239072625,
+      "grad_norm": 0.003063888754695654,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 8447
+    },
+    {
+      "epoch": 0.23309891459179063,
+      "grad_norm": 0.0027233557775616646,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 8448
+    },
+    {
+      "epoch": 0.233126506792855,
+      "grad_norm": 0.002462203847244382,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 8449
+    },
+    {
+      "epoch": 0.23315409899391937,
+      "grad_norm": 0.003968900069594383,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 8450
+    },
+    {
+      "epoch": 0.23318169119498375,
+      "grad_norm": 0.004251559264957905,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 8451
+    },
+    {
+      "epoch": 0.2332092833960481,
+      "grad_norm": 0.0033609773963689804,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 8452
+    },
+    {
+      "epoch": 0.23323687559711248,
+      "grad_norm": 0.0026037050411105156,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 8453
+    },
+    {
+      "epoch": 0.23326446779817683,
+      "grad_norm": 0.0028592885937541723,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 8454
+    },
+    {
+      "epoch": 0.2332920599992412,
+      "grad_norm": 0.0032447976991534233,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 8455
+    },
+    {
+      "epoch": 0.2333196522003056,
+      "grad_norm": 0.0034741810522973537,
+      "learning_rate": 0.001,
+      "loss": 0.4033,
+      "step": 8456
+    },
+    {
+      "epoch": 0.23334724440136995,
+      "grad_norm": 0.0030226546805351973,
+      "learning_rate": 0.001,
+      "loss": 0.4209,
+      "step": 8457
+    },
+    {
+      "epoch": 0.23337483660243433,
+      "grad_norm": 0.002790014259517193,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 8458
+    },
+    {
+      "epoch": 0.23340242880349868,
+      "grad_norm": 0.0024738421197980642,
+      "learning_rate": 0.001,
+      "loss": 0.4226,
+      "step": 8459
+    },
+    {
+      "epoch": 0.23343002100456306,
+      "grad_norm": 0.002386496402323246,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 8460
+    },
+    {
+      "epoch": 0.23345761320562744,
+      "grad_norm": 0.005026910919696093,
+      "learning_rate": 0.001,
+      "loss": 0.4357,
+      "step": 8461
+    },
+    {
+      "epoch": 0.2334852054066918,
+      "grad_norm": 0.002900604158639908,
+      "learning_rate": 0.001,
+      "loss": 0.3655,
+      "step": 8462
+    },
+    {
+      "epoch": 0.23351279760775617,
+      "grad_norm": 0.002177110407501459,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 8463
+    },
+    {
+      "epoch": 0.23354038980882053,
+      "grad_norm": 0.003278181655332446,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 8464
+    },
+    {
+      "epoch": 0.2335679820098849,
+      "grad_norm": 0.002810918027535081,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 8465
+    },
+    {
+      "epoch": 0.23359557421094929,
+      "grad_norm": 0.002259747590869665,
+      "learning_rate": 0.001,
+      "loss": 0.4319,
+      "step": 8466
+    },
+    {
+      "epoch": 0.23362316641201364,
+      "grad_norm": 0.0023482360411435366,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 8467
+    },
+    {
+      "epoch": 0.23365075861307802,
+      "grad_norm": 0.004843599628657103,
+      "learning_rate": 0.001,
+      "loss": 0.3671,
+      "step": 8468
+    },
+    {
+      "epoch": 0.23367835081414237,
+      "grad_norm": 0.0035332750994712114,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 8469
+    },
+    {
+      "epoch": 0.23370594301520675,
+      "grad_norm": 0.009748833253979683,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 8470
+    },
+    {
+      "epoch": 0.23373353521627113,
+      "grad_norm": 0.002617282560095191,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 8471
+    },
+    {
+      "epoch": 0.23376112741733548,
+      "grad_norm": 0.0023585897870361805,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 8472
+    },
+    {
+      "epoch": 0.23378871961839987,
+      "grad_norm": 0.004000307526439428,
+      "learning_rate": 0.001,
+      "loss": 0.4238,
+      "step": 8473
+    },
+    {
+      "epoch": 0.23381631181946422,
+      "grad_norm": 0.003987070173025131,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 8474
+    },
+    {
+      "epoch": 0.2338439040205286,
+      "grad_norm": 0.004123962949961424,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 8475
+    },
+    {
+      "epoch": 0.23387149622159298,
+      "grad_norm": 0.014491601847112179,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 8476
+    },
+    {
+      "epoch": 0.23389908842265733,
+      "grad_norm": 0.0026626239996403456,
+      "learning_rate": 0.001,
+      "loss": 0.3685,
+      "step": 8477
+    },
+    {
+      "epoch": 0.2339266806237217,
+      "grad_norm": 0.006109694018959999,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 8478
+    },
+    {
+      "epoch": 0.23395427282478606,
+      "grad_norm": 0.00370441609993577,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 8479
+    },
+    {
+      "epoch": 0.23398186502585044,
+      "grad_norm": 0.004282459616661072,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 8480
+    },
+    {
+      "epoch": 0.23400945722691482,
+      "grad_norm": 0.003556980052962899,
+      "learning_rate": 0.001,
+      "loss": 0.3753,
+      "step": 8481
+    },
+    {
+      "epoch": 0.23403704942797918,
+      "grad_norm": 0.00320242578163743,
+      "learning_rate": 0.001,
+      "loss": 0.4397,
+      "step": 8482
+    },
+    {
+      "epoch": 0.23406464162904356,
+      "grad_norm": 0.0037783649750053883,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 8483
+    },
+    {
+      "epoch": 0.2340922338301079,
+      "grad_norm": 0.003380347043275833,
+      "learning_rate": 0.001,
+      "loss": 0.4028,
+      "step": 8484
+    },
+    {
+      "epoch": 0.2341198260311723,
+      "grad_norm": 0.0029710596427321434,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 8485
+    },
+    {
+      "epoch": 0.23414741823223667,
+      "grad_norm": 0.003365810727700591,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 8486
+    },
+    {
+      "epoch": 0.23417501043330102,
+      "grad_norm": 0.0027016180101782084,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 8487
+    },
+    {
+      "epoch": 0.2342026026343654,
+      "grad_norm": 0.002262349473312497,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 8488
+    },
+    {
+      "epoch": 0.23423019483542976,
+      "grad_norm": 0.002855840837582946,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 8489
+    },
+    {
+      "epoch": 0.23425778703649414,
+      "grad_norm": 0.004416047595441341,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 8490
+    },
+    {
+      "epoch": 0.23428537923755852,
+      "grad_norm": 0.0035155070945620537,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 8491
+    },
+    {
+      "epoch": 0.23431297143862287,
+      "grad_norm": 0.003230678616091609,
+      "learning_rate": 0.001,
+      "loss": 0.3429,
+      "step": 8492
+    },
+    {
+      "epoch": 0.23434056363968725,
+      "grad_norm": 0.0038198174443095922,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 8493
+    },
+    {
+      "epoch": 0.2343681558407516,
+      "grad_norm": 0.003024979494512081,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 8494
+    },
+    {
+      "epoch": 0.23439574804181598,
+      "grad_norm": 0.0022407593205571175,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 8495
+    },
+    {
+      "epoch": 0.23442334024288036,
+      "grad_norm": 0.0024072916712611914,
+      "learning_rate": 0.001,
+      "loss": 0.4466,
+      "step": 8496
+    },
+    {
+      "epoch": 0.23445093244394472,
+      "grad_norm": 0.004507563542574644,
+      "learning_rate": 0.001,
+      "loss": 0.3714,
+      "step": 8497
+    },
+    {
+      "epoch": 0.2344785246450091,
+      "grad_norm": 0.0035954902414232492,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 8498
+    },
+    {
+      "epoch": 0.23450611684607345,
+      "grad_norm": 0.003051930107176304,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 8499
+    },
+    {
+      "epoch": 0.23453370904713783,
+      "grad_norm": 0.0034672280307859182,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 8500
+    },
+    {
+      "epoch": 0.23453370904713783,
+      "eval_runtime": 23.4916,
+      "eval_samples_per_second": 1.362,
+      "eval_steps_per_second": 0.17,
+      "step": 8500
+    },
+    {
+      "epoch": 0.2345613012482022,
+      "grad_norm": 0.004859286360442638,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 8501
+    },
+    {
+      "epoch": 0.23458889344926656,
+      "grad_norm": 0.005819413810968399,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 8502
+    },
+    {
+      "epoch": 0.23461648565033094,
+      "grad_norm": 0.0030995369888842106,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 8503
+    },
+    {
+      "epoch": 0.2346440778513953,
+      "grad_norm": 0.0032427343539893627,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 8504
+    },
+    {
+      "epoch": 0.23467167005245967,
+      "grad_norm": 0.0034287397284060717,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 8505
+    },
+    {
+      "epoch": 0.23469926225352405,
+      "grad_norm": 0.003676005406305194,
+      "learning_rate": 0.001,
+      "loss": 0.422,
+      "step": 8506
+    },
+    {
+      "epoch": 0.2347268544545884,
+      "grad_norm": 0.0029224164318293333,
+      "learning_rate": 0.001,
+      "loss": 0.3695,
+      "step": 8507
+    },
+    {
+      "epoch": 0.2347544466556528,
+      "grad_norm": 0.00621433649212122,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 8508
+    },
+    {
+      "epoch": 0.23478203885671714,
+      "grad_norm": 0.0034033150877803564,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 8509
+    },
+    {
+      "epoch": 0.23480963105778152,
+      "grad_norm": 0.006222172640264034,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 8510
+    },
+    {
+      "epoch": 0.2348372232588459,
+      "grad_norm": 0.0026931515894830227,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 8511
+    },
+    {
+      "epoch": 0.23486481545991025,
+      "grad_norm": 0.004435943905264139,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 8512
+    },
+    {
+      "epoch": 0.23489240766097463,
+      "grad_norm": 0.003721091663464904,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 8513
+    },
+    {
+      "epoch": 0.234919999862039,
+      "grad_norm": 0.002308727940544486,
+      "learning_rate": 0.001,
+      "loss": 0.3725,
+      "step": 8514
+    },
+    {
+      "epoch": 0.23494759206310337,
+      "grad_norm": 0.0038359523750841618,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 8515
+    },
+    {
+      "epoch": 0.23497518426416775,
+      "grad_norm": 0.002997712232172489,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 8516
+    },
+    {
+      "epoch": 0.2350027764652321,
+      "grad_norm": 0.0034595001488924026,
+      "learning_rate": 0.001,
+      "loss": 0.3658,
+      "step": 8517
+    },
+    {
+      "epoch": 0.23503036866629648,
+      "grad_norm": 0.0026987416204065084,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 8518
+    },
+    {
+      "epoch": 0.23505796086736083,
+      "grad_norm": 0.00263298605568707,
+      "learning_rate": 0.001,
+      "loss": 0.4276,
+      "step": 8519
+    },
+    {
+      "epoch": 0.2350855530684252,
+      "grad_norm": 0.0024717003107070923,
+      "learning_rate": 0.001,
+      "loss": 0.438,
+      "step": 8520
+    },
+    {
+      "epoch": 0.23511314526948957,
+      "grad_norm": 0.003266090527176857,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 8521
+    },
+    {
+      "epoch": 0.23514073747055395,
+      "grad_norm": 0.007142478600144386,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 8522
+    },
+    {
+      "epoch": 0.23516832967161833,
+      "grad_norm": 0.003394266590476036,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 8523
+    },
+    {
+      "epoch": 0.23519592187268268,
+      "grad_norm": 0.0034649951849132776,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 8524
+    },
+    {
+      "epoch": 0.23522351407374706,
+      "grad_norm": 0.0022391905076801777,
+      "learning_rate": 0.001,
+      "loss": 0.4333,
+      "step": 8525
+    },
+    {
+      "epoch": 0.2352511062748114,
+      "grad_norm": 0.002735828747972846,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 8526
+    },
+    {
+      "epoch": 0.2352786984758758,
+      "grad_norm": 0.00296612735837698,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 8527
+    },
+    {
+      "epoch": 0.23530629067694017,
+      "grad_norm": 0.0029764720238745213,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 8528
+    },
+    {
+      "epoch": 0.23533388287800452,
+      "grad_norm": 0.002517016837373376,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 8529
+    },
+    {
+      "epoch": 0.2353614750790689,
+      "grad_norm": 0.004660237114876509,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 8530
+    },
+    {
+      "epoch": 0.23538906728013326,
+      "grad_norm": 0.01403157226741314,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 8531
+    },
+    {
+      "epoch": 0.23541665948119764,
+      "grad_norm": 0.002752164611592889,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 8532
+    },
+    {
+      "epoch": 0.23544425168226202,
+      "grad_norm": 0.004661790560930967,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 8533
+    },
+    {
+      "epoch": 0.23547184388332637,
+      "grad_norm": 0.0029863761737942696,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 8534
+    },
+    {
+      "epoch": 0.23549943608439075,
+      "grad_norm": 0.002214206848293543,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 8535
+    },
+    {
+      "epoch": 0.2355270282854551,
+      "grad_norm": 0.00356200966052711,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 8536
+    },
+    {
+      "epoch": 0.23555462048651948,
+      "grad_norm": 0.0029001201037317514,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 8537
+    },
+    {
+      "epoch": 0.23558221268758386,
+      "grad_norm": 0.0023703081533312798,
+      "learning_rate": 0.001,
+      "loss": 0.4531,
+      "step": 8538
+    },
+    {
+      "epoch": 0.23560980488864822,
+      "grad_norm": 0.0027728870045393705,
+      "learning_rate": 0.001,
+      "loss": 0.4478,
+      "step": 8539
+    },
+    {
+      "epoch": 0.2356373970897126,
+      "grad_norm": 0.002979228738695383,
+      "learning_rate": 0.001,
+      "loss": 0.4161,
+      "step": 8540
+    },
+    {
+      "epoch": 0.23566498929077695,
+      "grad_norm": 0.0030399069655686617,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 8541
+    },
+    {
+      "epoch": 0.23569258149184133,
+      "grad_norm": 0.0033329655416309834,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 8542
+    },
+    {
+      "epoch": 0.2357201736929057,
+      "grad_norm": 0.006761785596609116,
+      "learning_rate": 0.001,
+      "loss": 0.4624,
+      "step": 8543
+    },
+    {
+      "epoch": 0.23574776589397006,
+      "grad_norm": 0.0030948342755436897,
+      "learning_rate": 0.001,
+      "loss": 0.4237,
+      "step": 8544
+    },
+    {
+      "epoch": 0.23577535809503444,
+      "grad_norm": 0.003666854929178953,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 8545
+    },
+    {
+      "epoch": 0.2358029502960988,
+      "grad_norm": 0.0034998871851712465,
+      "learning_rate": 0.001,
+      "loss": 0.4306,
+      "step": 8546
+    },
+    {
+      "epoch": 0.23583054249716318,
+      "grad_norm": 0.002683976199477911,
+      "learning_rate": 0.001,
+      "loss": 0.4485,
+      "step": 8547
+    },
+    {
+      "epoch": 0.23585813469822756,
+      "grad_norm": 0.003037314862012863,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 8548
+    },
+    {
+      "epoch": 0.2358857268992919,
+      "grad_norm": 0.002266938565298915,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 8549
+    },
+    {
+      "epoch": 0.2359133191003563,
+      "grad_norm": 0.004051513969898224,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 8550
+    },
+    {
+      "epoch": 0.23594091130142064,
+      "grad_norm": 0.0025013620033860207,
+      "learning_rate": 0.001,
+      "loss": 0.4445,
+      "step": 8551
+    },
+    {
+      "epoch": 0.23596850350248502,
+      "grad_norm": 0.0030788013245910406,
+      "learning_rate": 0.001,
+      "loss": 0.3761,
+      "step": 8552
+    },
+    {
+      "epoch": 0.2359960957035494,
+      "grad_norm": 0.003969093784689903,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 8553
+    },
+    {
+      "epoch": 0.23602368790461375,
+      "grad_norm": 0.003966695163398981,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 8554
+    },
+    {
+      "epoch": 0.23605128010567814,
+      "grad_norm": 0.009804654866456985,
+      "learning_rate": 0.001,
+      "loss": 0.425,
+      "step": 8555
+    },
+    {
+      "epoch": 0.2360788723067425,
+      "grad_norm": 0.0031127145048230886,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 8556
+    },
+    {
+      "epoch": 0.23610646450780687,
+      "grad_norm": 0.0036678691394627094,
+      "learning_rate": 0.001,
+      "loss": 0.3702,
+      "step": 8557
+    },
+    {
+      "epoch": 0.23613405670887125,
+      "grad_norm": 0.003199309343472123,
+      "learning_rate": 0.001,
+      "loss": 0.4266,
+      "step": 8558
+    },
+    {
+      "epoch": 0.2361616489099356,
+      "grad_norm": 0.0028935642912983894,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 8559
+    },
+    {
+      "epoch": 0.23618924111099998,
+      "grad_norm": 0.0033331215381622314,
+      "learning_rate": 0.001,
+      "loss": 0.3761,
+      "step": 8560
+    },
+    {
+      "epoch": 0.23621683331206433,
+      "grad_norm": 0.004135193768888712,
+      "learning_rate": 0.001,
+      "loss": 0.4333,
+      "step": 8561
+    },
+    {
+      "epoch": 0.23624442551312871,
+      "grad_norm": 0.008535767905414104,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 8562
+    },
+    {
+      "epoch": 0.2362720177141931,
+      "grad_norm": 0.005464480258524418,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 8563
+    },
+    {
+      "epoch": 0.23629960991525745,
+      "grad_norm": 0.0026870830915868282,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 8564
+    },
+    {
+      "epoch": 0.23632720211632183,
+      "grad_norm": 0.002947178203612566,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 8565
+    },
+    {
+      "epoch": 0.23635479431738618,
+      "grad_norm": 0.004835926927626133,
+      "learning_rate": 0.001,
+      "loss": 0.44,
+      "step": 8566
+    },
+    {
+      "epoch": 0.23638238651845056,
+      "grad_norm": 0.004437744617462158,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 8567
+    },
+    {
+      "epoch": 0.23640997871951494,
+      "grad_norm": 0.0033282919321209192,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 8568
+    },
+    {
+      "epoch": 0.2364375709205793,
+      "grad_norm": 0.0035205124877393246,
+      "learning_rate": 0.001,
+      "loss": 0.3725,
+      "step": 8569
+    },
+    {
+      "epoch": 0.23646516312164367,
+      "grad_norm": 0.004044828005135059,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 8570
+    },
+    {
+      "epoch": 0.23649275532270803,
+      "grad_norm": 0.008543507196009159,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 8571
+    },
+    {
+      "epoch": 0.2365203475237724,
+      "grad_norm": 0.005565674975514412,
+      "learning_rate": 0.001,
+      "loss": 0.4339,
+      "step": 8572
+    },
+    {
+      "epoch": 0.2365479397248368,
+      "grad_norm": 0.0032614252995699644,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 8573
+    },
+    {
+      "epoch": 0.23657553192590114,
+      "grad_norm": 0.0036154796835035086,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 8574
+    },
+    {
+      "epoch": 0.23660312412696552,
+      "grad_norm": 0.003126727417111397,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 8575
+    },
+    {
+      "epoch": 0.23663071632802987,
+      "grad_norm": 0.002990932669490576,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 8576
+    },
+    {
+      "epoch": 0.23665830852909425,
+      "grad_norm": 0.0033934074454009533,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 8577
+    },
+    {
+      "epoch": 0.23668590073015863,
+      "grad_norm": 0.002845100359991193,
+      "learning_rate": 0.001,
+      "loss": 0.4331,
+      "step": 8578
+    },
+    {
+      "epoch": 0.23671349293122299,
+      "grad_norm": 0.005931117571890354,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 8579
+    },
+    {
+      "epoch": 0.23674108513228737,
+      "grad_norm": 0.0030062024015933275,
+      "learning_rate": 0.001,
+      "loss": 0.4249,
+      "step": 8580
+    },
+    {
+      "epoch": 0.23676867733335172,
+      "grad_norm": 0.004353534895926714,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 8581
+    },
+    {
+      "epoch": 0.2367962695344161,
+      "grad_norm": 0.007340231444686651,
+      "learning_rate": 0.001,
+      "loss": 0.368,
+      "step": 8582
+    },
+    {
+      "epoch": 0.23682386173548048,
+      "grad_norm": 0.0033272774890065193,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 8583
+    },
+    {
+      "epoch": 0.23685145393654483,
+      "grad_norm": 0.002648326801136136,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 8584
+    },
+    {
+      "epoch": 0.2368790461376092,
+      "grad_norm": 0.0026433998718857765,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 8585
+    },
+    {
+      "epoch": 0.23690663833867356,
+      "grad_norm": 0.0039000390097498894,
+      "learning_rate": 0.001,
+      "loss": 0.3706,
+      "step": 8586
+    },
+    {
+      "epoch": 0.23693423053973794,
+      "grad_norm": 0.00217796815559268,
+      "learning_rate": 0.001,
+      "loss": 0.4424,
+      "step": 8587
+    },
+    {
+      "epoch": 0.23696182274080232,
+      "grad_norm": 0.007604559417814016,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 8588
+    },
+    {
+      "epoch": 0.23698941494186668,
+      "grad_norm": 0.003747879760339856,
+      "learning_rate": 0.001,
+      "loss": 0.4283,
+      "step": 8589
+    },
+    {
+      "epoch": 0.23701700714293106,
+      "grad_norm": 0.0028189239092171192,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 8590
+    },
+    {
+      "epoch": 0.2370445993439954,
+      "grad_norm": 0.0033880542032420635,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 8591
+    },
+    {
+      "epoch": 0.2370721915450598,
+      "grad_norm": 0.004370769020169973,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 8592
+    },
+    {
+      "epoch": 0.23709978374612417,
+      "grad_norm": 0.0029521717224270105,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 8593
+    },
+    {
+      "epoch": 0.23712737594718852,
+      "grad_norm": 0.0032671017106622458,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 8594
+    },
+    {
+      "epoch": 0.2371549681482529,
+      "grad_norm": 0.002473097527399659,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 8595
+    },
+    {
+      "epoch": 0.23718256034931726,
+      "grad_norm": 0.0034314331132918596,
+      "learning_rate": 0.001,
+      "loss": 0.4229,
+      "step": 8596
+    },
+    {
+      "epoch": 0.23721015255038164,
+      "grad_norm": 0.003747452748939395,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 8597
+    },
+    {
+      "epoch": 0.23723774475144602,
+      "grad_norm": 0.0053464872762560844,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 8598
+    },
+    {
+      "epoch": 0.23726533695251037,
+      "grad_norm": 0.004069317597895861,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 8599
+    },
+    {
+      "epoch": 0.23729292915357475,
+      "grad_norm": 0.00245997728779912,
+      "learning_rate": 0.001,
+      "loss": 0.3766,
+      "step": 8600
+    },
+    {
+      "epoch": 0.2373205213546391,
+      "grad_norm": 0.0024410157930105925,
+      "learning_rate": 0.001,
+      "loss": 0.4366,
+      "step": 8601
+    },
+    {
+      "epoch": 0.23734811355570348,
+      "grad_norm": 0.002567254938185215,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 8602
+    },
+    {
+      "epoch": 0.23737570575676786,
+      "grad_norm": 0.003037130692973733,
+      "learning_rate": 0.001,
+      "loss": 0.366,
+      "step": 8603
+    },
+    {
+      "epoch": 0.23740329795783222,
+      "grad_norm": 0.0020308520179241896,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 8604
+    },
+    {
+      "epoch": 0.2374308901588966,
+      "grad_norm": 0.008115851320326328,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 8605
+    },
+    {
+      "epoch": 0.23745848235996095,
+      "grad_norm": 0.002992655150592327,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 8606
+    },
+    {
+      "epoch": 0.23748607456102533,
+      "grad_norm": 0.004864429123699665,
+      "learning_rate": 0.001,
+      "loss": 0.4524,
+      "step": 8607
+    },
+    {
+      "epoch": 0.2375136667620897,
+      "grad_norm": 0.0024918217677623034,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 8608
+    },
+    {
+      "epoch": 0.23754125896315406,
+      "grad_norm": 0.0024185108486562967,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 8609
+    },
+    {
+      "epoch": 0.23756885116421844,
+      "grad_norm": 0.0024980721063911915,
+      "learning_rate": 0.001,
+      "loss": 0.4185,
+      "step": 8610
+    },
+    {
+      "epoch": 0.2375964433652828,
+      "grad_norm": 0.002587017137557268,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 8611
+    },
+    {
+      "epoch": 0.23762403556634717,
+      "grad_norm": 0.002519281581044197,
+      "learning_rate": 0.001,
+      "loss": 0.3612,
+      "step": 8612
+    },
+    {
+      "epoch": 0.23765162776741153,
+      "grad_norm": 0.0031468705274164677,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 8613
+    },
+    {
+      "epoch": 0.2376792199684759,
+      "grad_norm": 0.0039115361869335175,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 8614
+    },
+    {
+      "epoch": 0.2377068121695403,
+      "grad_norm": 0.002601664513349533,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 8615
+    },
+    {
+      "epoch": 0.23773440437060464,
+      "grad_norm": 0.0024887637700885534,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 8616
+    },
+    {
+      "epoch": 0.23776199657166902,
+      "grad_norm": 0.0022282125428318977,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 8617
+    },
+    {
+      "epoch": 0.23778958877273337,
+      "grad_norm": 0.00493327621370554,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 8618
+    },
+    {
+      "epoch": 0.23781718097379775,
+      "grad_norm": 0.003642312716692686,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 8619
+    },
+    {
+      "epoch": 0.23784477317486213,
+      "grad_norm": 0.002251163125038147,
+      "learning_rate": 0.001,
+      "loss": 0.4593,
+      "step": 8620
+    },
+    {
+      "epoch": 0.2378723653759265,
+      "grad_norm": 0.0038927237037569284,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 8621
+    },
+    {
+      "epoch": 0.23789995757699087,
+      "grad_norm": 0.002186572877690196,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 8622
+    },
+    {
+      "epoch": 0.23792754977805522,
+      "grad_norm": 0.008558029308915138,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 8623
+    },
+    {
+      "epoch": 0.2379551419791196,
+      "grad_norm": 0.0047720009461045265,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 8624
+    },
+    {
+      "epoch": 0.23798273418018398,
+      "grad_norm": 0.007841572165489197,
+      "learning_rate": 0.001,
+      "loss": 0.3685,
+      "step": 8625
+    },
+    {
+      "epoch": 0.23801032638124833,
+      "grad_norm": 0.006973532494157553,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 8626
+    },
+    {
+      "epoch": 0.2380379185823127,
+      "grad_norm": 0.002645769389346242,
+      "learning_rate": 0.001,
+      "loss": 0.3651,
+      "step": 8627
+    },
+    {
+      "epoch": 0.23806551078337707,
+      "grad_norm": 0.013875133357942104,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 8628
+    },
+    {
+      "epoch": 0.23809310298444145,
+      "grad_norm": 0.005976676940917969,
+      "learning_rate": 0.001,
+      "loss": 0.4268,
+      "step": 8629
+    },
+    {
+      "epoch": 0.23812069518550583,
+      "grad_norm": 0.0029498045332729816,
+      "learning_rate": 0.001,
+      "loss": 0.368,
+      "step": 8630
+    },
+    {
+      "epoch": 0.23814828738657018,
+      "grad_norm": 0.00462282495573163,
+      "learning_rate": 0.001,
+      "loss": 0.445,
+      "step": 8631
+    },
+    {
+      "epoch": 0.23817587958763456,
+      "grad_norm": 0.003022623248398304,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 8632
+    },
+    {
+      "epoch": 0.2382034717886989,
+      "grad_norm": 0.003926178906112909,
+      "learning_rate": 0.001,
+      "loss": 0.3651,
+      "step": 8633
+    },
+    {
+      "epoch": 0.2382310639897633,
+      "grad_norm": 0.0035945766139775515,
+      "learning_rate": 0.001,
+      "loss": 0.3681,
+      "step": 8634
+    },
+    {
+      "epoch": 0.23825865619082767,
+      "grad_norm": 0.00597176980227232,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 8635
+    },
+    {
+      "epoch": 0.23828624839189202,
+      "grad_norm": 0.004440873861312866,
+      "learning_rate": 0.001,
+      "loss": 0.3614,
+      "step": 8636
+    },
+    {
+      "epoch": 0.2383138405929564,
+      "grad_norm": 0.002808907302096486,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 8637
+    },
+    {
+      "epoch": 0.23834143279402076,
+      "grad_norm": 0.002455639885738492,
+      "learning_rate": 0.001,
+      "loss": 0.3661,
+      "step": 8638
+    },
+    {
+      "epoch": 0.23836902499508514,
+      "grad_norm": 0.003995021339505911,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 8639
+    },
+    {
+      "epoch": 0.23839661719614952,
+      "grad_norm": 0.0029346742667257786,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 8640
+    },
+    {
+      "epoch": 0.23842420939721387,
+      "grad_norm": 0.0027218139730393887,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 8641
+    },
+    {
+      "epoch": 0.23845180159827825,
+      "grad_norm": 0.002403157763183117,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 8642
+    },
+    {
+      "epoch": 0.2384793937993426,
+      "grad_norm": 0.0038983020931482315,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 8643
+    },
+    {
+      "epoch": 0.23850698600040698,
+      "grad_norm": 0.002467036945745349,
+      "learning_rate": 0.001,
+      "loss": 0.4214,
+      "step": 8644
+    },
+    {
+      "epoch": 0.23853457820147136,
+      "grad_norm": 0.003569959197193384,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 8645
+    },
+    {
+      "epoch": 0.23856217040253572,
+      "grad_norm": 0.0020964080467820168,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 8646
+    },
+    {
+      "epoch": 0.2385897626036001,
+      "grad_norm": 0.0025515384040772915,
+      "learning_rate": 0.001,
+      "loss": 0.3767,
+      "step": 8647
+    },
+    {
+      "epoch": 0.23861735480466445,
+      "grad_norm": 0.0025259265676140785,
+      "learning_rate": 0.001,
+      "loss": 0.3789,
+      "step": 8648
+    },
+    {
+      "epoch": 0.23864494700572883,
+      "grad_norm": 0.002351469825953245,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 8649
+    },
+    {
+      "epoch": 0.2386725392067932,
+      "grad_norm": 0.002201459836214781,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 8650
+    },
+    {
+      "epoch": 0.23870013140785756,
+      "grad_norm": 0.0032044921535998583,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 8651
+    },
+    {
+      "epoch": 0.23872772360892194,
+      "grad_norm": 0.0036108682397753,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 8652
+    },
+    {
+      "epoch": 0.2387553158099863,
+      "grad_norm": 0.0031174325849860907,
+      "learning_rate": 0.001,
+      "loss": 0.3674,
+      "step": 8653
+    },
+    {
+      "epoch": 0.23878290801105068,
+      "grad_norm": 0.0035186016466468573,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 8654
+    },
+    {
+      "epoch": 0.23881050021211506,
+      "grad_norm": 0.002634822390973568,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 8655
+    },
+    {
+      "epoch": 0.2388380924131794,
+      "grad_norm": 0.0029784373473376036,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 8656
+    },
+    {
+      "epoch": 0.2388656846142438,
+      "grad_norm": 0.0027631595730781555,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 8657
+    },
+    {
+      "epoch": 0.23889327681530814,
+      "grad_norm": 0.007283089216798544,
+      "learning_rate": 0.001,
+      "loss": 0.3752,
+      "step": 8658
+    },
+    {
+      "epoch": 0.23892086901637252,
+      "grad_norm": 0.0024284112732857466,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 8659
+    },
+    {
+      "epoch": 0.2389484612174369,
+      "grad_norm": 0.0027097521815449,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 8660
+    },
+    {
+      "epoch": 0.23897605341850126,
+      "grad_norm": 0.0025627308059483767,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 8661
+    },
+    {
+      "epoch": 0.23900364561956564,
+      "grad_norm": 0.003014184534549713,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 8662
+    },
+    {
+      "epoch": 0.23903123782063,
+      "grad_norm": 0.0035087834112346172,
+      "learning_rate": 0.001,
+      "loss": 0.3609,
+      "step": 8663
+    },
+    {
+      "epoch": 0.23905883002169437,
+      "grad_norm": 0.004521367605775595,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 8664
+    },
+    {
+      "epoch": 0.23908642222275875,
+      "grad_norm": 0.003859465243294835,
+      "learning_rate": 0.001,
+      "loss": 0.4388,
+      "step": 8665
+    },
+    {
+      "epoch": 0.2391140144238231,
+      "grad_norm": 0.0035421785432845354,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 8666
+    },
+    {
+      "epoch": 0.23914160662488748,
+      "grad_norm": 0.0022301869466900826,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 8667
+    },
+    {
+      "epoch": 0.23916919882595183,
+      "grad_norm": 0.0030494078528136015,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 8668
+    },
+    {
+      "epoch": 0.23919679102701621,
+      "grad_norm": 0.0031116558238863945,
+      "learning_rate": 0.001,
+      "loss": 0.411,
+      "step": 8669
+    },
+    {
+      "epoch": 0.2392243832280806,
+      "grad_norm": 0.002859594067558646,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 8670
+    },
+    {
+      "epoch": 0.23925197542914495,
+      "grad_norm": 0.002495800144970417,
+      "learning_rate": 0.001,
+      "loss": 0.4545,
+      "step": 8671
+    },
+    {
+      "epoch": 0.23927956763020933,
+      "grad_norm": 0.0030453058425337076,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 8672
+    },
+    {
+      "epoch": 0.23930715983127368,
+      "grad_norm": 0.0048926519230008125,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 8673
+    },
+    {
+      "epoch": 0.23933475203233806,
+      "grad_norm": 0.0029445511754602194,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 8674
+    },
+    {
+      "epoch": 0.23936234423340244,
+      "grad_norm": 0.00260615604929626,
+      "learning_rate": 0.001,
+      "loss": 0.4355,
+      "step": 8675
+    },
+    {
+      "epoch": 0.2393899364344668,
+      "grad_norm": 0.003435730002820492,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 8676
+    },
+    {
+      "epoch": 0.23941752863553117,
+      "grad_norm": 0.0024756945203989744,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 8677
+    },
+    {
+      "epoch": 0.23944512083659553,
+      "grad_norm": 0.0030621823389083147,
+      "learning_rate": 0.001,
+      "loss": 0.3652,
+      "step": 8678
+    },
+    {
+      "epoch": 0.2394727130376599,
+      "grad_norm": 0.003644311334937811,
+      "learning_rate": 0.001,
+      "loss": 0.3834,
+      "step": 8679
+    },
+    {
+      "epoch": 0.2395003052387243,
+      "grad_norm": 0.0026737714651972055,
+      "learning_rate": 0.001,
+      "loss": 0.4288,
+      "step": 8680
+    },
+    {
+      "epoch": 0.23952789743978864,
+      "grad_norm": 0.0027650066185742617,
+      "learning_rate": 0.001,
+      "loss": 0.3592,
+      "step": 8681
+    },
+    {
+      "epoch": 0.23955548964085302,
+      "grad_norm": 0.010126309469342232,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 8682
+    },
+    {
+      "epoch": 0.23958308184191737,
+      "grad_norm": 0.008825338445603848,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 8683
+    },
+    {
+      "epoch": 0.23961067404298175,
+      "grad_norm": 0.0028963929507881403,
+      "learning_rate": 0.001,
+      "loss": 0.3777,
+      "step": 8684
+    },
+    {
+      "epoch": 0.23963826624404613,
+      "grad_norm": 0.006523852702230215,
+      "learning_rate": 0.001,
+      "loss": 0.3625,
+      "step": 8685
+    },
+    {
+      "epoch": 0.23966585844511049,
+      "grad_norm": 0.0027353796176612377,
+      "learning_rate": 0.001,
+      "loss": 0.3735,
+      "step": 8686
+    },
+    {
+      "epoch": 0.23969345064617487,
+      "grad_norm": 0.0034795296378433704,
+      "learning_rate": 0.001,
+      "loss": 0.3765,
+      "step": 8687
+    },
+    {
+      "epoch": 0.23972104284723922,
+      "grad_norm": 0.002904376946389675,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 8688
+    },
+    {
+      "epoch": 0.2397486350483036,
+      "grad_norm": 0.00341939739882946,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 8689
+    },
+    {
+      "epoch": 0.23977622724936798,
+      "grad_norm": 0.00255574774928391,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 8690
+    },
+    {
+      "epoch": 0.23980381945043233,
+      "grad_norm": 0.002928397851064801,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 8691
+    },
+    {
+      "epoch": 0.2398314116514967,
+      "grad_norm": 0.003814896335825324,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 8692
+    },
+    {
+      "epoch": 0.23985900385256106,
+      "grad_norm": 0.004918345715850592,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 8693
+    },
+    {
+      "epoch": 0.23988659605362544,
+      "grad_norm": 0.00214549177326262,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 8694
+    },
+    {
+      "epoch": 0.23991418825468983,
+      "grad_norm": 0.00271332124248147,
+      "learning_rate": 0.001,
+      "loss": 0.3752,
+      "step": 8695
+    },
+    {
+      "epoch": 0.23994178045575418,
+      "grad_norm": 0.005185315851122141,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 8696
+    },
+    {
+      "epoch": 0.23996937265681856,
+      "grad_norm": 0.003353232517838478,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 8697
+    },
+    {
+      "epoch": 0.2399969648578829,
+      "grad_norm": 0.0031625647097826004,
+      "learning_rate": 0.001,
+      "loss": 0.3659,
+      "step": 8698
+    },
+    {
+      "epoch": 0.2400245570589473,
+      "grad_norm": 0.0053766947239637375,
+      "learning_rate": 0.001,
+      "loss": 0.4522,
+      "step": 8699
+    },
+    {
+      "epoch": 0.24005214926001167,
+      "grad_norm": 0.005576374474912882,
+      "learning_rate": 0.001,
+      "loss": 0.431,
+      "step": 8700
+    },
+    {
+      "epoch": 0.24007974146107602,
+      "grad_norm": 0.003967816010117531,
+      "learning_rate": 0.001,
+      "loss": 0.3623,
+      "step": 8701
+    },
+    {
+      "epoch": 0.2401073336621404,
+      "grad_norm": 0.0026094361674040556,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 8702
+    },
+    {
+      "epoch": 0.24013492586320476,
+      "grad_norm": 0.049603283405303955,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 8703
+    },
+    {
+      "epoch": 0.24016251806426914,
+      "grad_norm": 0.0027699440252035856,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 8704
+    },
+    {
+      "epoch": 0.2401901102653335,
+      "grad_norm": 0.0032966039143502712,
+      "learning_rate": 0.001,
+      "loss": 0.3743,
+      "step": 8705
+    },
+    {
+      "epoch": 0.24021770246639787,
+      "grad_norm": 0.0029616530518978834,
+      "learning_rate": 0.001,
+      "loss": 0.4218,
+      "step": 8706
+    },
+    {
+      "epoch": 0.24024529466746225,
+      "grad_norm": 0.0031759492121636868,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 8707
+    },
+    {
+      "epoch": 0.2402728868685266,
+      "grad_norm": 0.002272370969876647,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 8708
+    },
+    {
+      "epoch": 0.24030047906959098,
+      "grad_norm": 0.004187437240034342,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 8709
+    },
+    {
+      "epoch": 0.24032807127065534,
+      "grad_norm": 0.0026218758430331945,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 8710
+    },
+    {
+      "epoch": 0.24035566347171972,
+      "grad_norm": 0.0024012320209294558,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 8711
+    },
+    {
+      "epoch": 0.2403832556727841,
+      "grad_norm": 0.004809997510164976,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 8712
+    },
+    {
+      "epoch": 0.24041084787384845,
+      "grad_norm": 0.0030046291649341583,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 8713
+    },
+    {
+      "epoch": 0.24043844007491283,
+      "grad_norm": 0.0032057911157608032,
+      "learning_rate": 0.001,
+      "loss": 0.4252,
+      "step": 8714
+    },
+    {
+      "epoch": 0.24046603227597718,
+      "grad_norm": 0.0034800011198967695,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 8715
+    },
+    {
+      "epoch": 0.24049362447704156,
+      "grad_norm": 0.0029743260238319635,
+      "learning_rate": 0.001,
+      "loss": 0.3756,
+      "step": 8716
+    },
+    {
+      "epoch": 0.24052121667810594,
+      "grad_norm": 0.0031264862045645714,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 8717
+    },
+    {
+      "epoch": 0.2405488088791703,
+      "grad_norm": 0.0027079239953309298,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 8718
+    },
+    {
+      "epoch": 0.24057640108023468,
+      "grad_norm": 0.0026093865744769573,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 8719
+    },
+    {
+      "epoch": 0.24060399328129903,
+      "grad_norm": 0.004578443244099617,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 8720
+    },
+    {
+      "epoch": 0.2406315854823634,
+      "grad_norm": 0.0024112521205097437,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 8721
+    },
+    {
+      "epoch": 0.2406591776834278,
+      "grad_norm": 0.0029260313604027033,
+      "learning_rate": 0.001,
+      "loss": 0.3582,
+      "step": 8722
+    },
+    {
+      "epoch": 0.24068676988449214,
+      "grad_norm": 0.003243402810767293,
+      "learning_rate": 0.001,
+      "loss": 0.3539,
+      "step": 8723
+    },
+    {
+      "epoch": 0.24071436208555652,
+      "grad_norm": 0.03149477019906044,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 8724
+    },
+    {
+      "epoch": 0.24074195428662087,
+      "grad_norm": 0.002621316583827138,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 8725
+    },
+    {
+      "epoch": 0.24076954648768525,
+      "grad_norm": 0.007181955501437187,
+      "learning_rate": 0.001,
+      "loss": 0.374,
+      "step": 8726
+    },
+    {
+      "epoch": 0.24079713868874963,
+      "grad_norm": 0.006243562325835228,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 8727
+    },
+    {
+      "epoch": 0.240824730889814,
+      "grad_norm": 0.005759544670581818,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 8728
+    },
+    {
+      "epoch": 0.24085232309087837,
+      "grad_norm": 0.002845682902261615,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 8729
+    },
+    {
+      "epoch": 0.24087991529194272,
+      "grad_norm": 0.00525195337831974,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 8730
+    },
+    {
+      "epoch": 0.2409075074930071,
+      "grad_norm": 0.004564746282994747,
+      "learning_rate": 0.001,
+      "loss": 0.3639,
+      "step": 8731
+    },
+    {
+      "epoch": 0.24093509969407148,
+      "grad_norm": 0.006655642297118902,
+      "learning_rate": 0.001,
+      "loss": 0.4284,
+      "step": 8732
+    },
+    {
+      "epoch": 0.24096269189513583,
+      "grad_norm": 0.003846411593258381,
+      "learning_rate": 0.001,
+      "loss": 0.4351,
+      "step": 8733
+    },
+    {
+      "epoch": 0.2409902840962002,
+      "grad_norm": 0.003326327772811055,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 8734
+    },
+    {
+      "epoch": 0.24101787629726457,
+      "grad_norm": 0.013138518668711185,
+      "learning_rate": 0.001,
+      "loss": 0.3856,
+      "step": 8735
+    },
+    {
+      "epoch": 0.24104546849832895,
+      "grad_norm": 0.007260841317474842,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 8736
+    },
+    {
+      "epoch": 0.24107306069939333,
+      "grad_norm": 0.0034342585131525993,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 8737
+    },
+    {
+      "epoch": 0.24110065290045768,
+      "grad_norm": 0.0038822691421955824,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 8738
+    },
+    {
+      "epoch": 0.24112824510152206,
+      "grad_norm": 0.0024222838692367077,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 8739
+    },
+    {
+      "epoch": 0.2411558373025864,
+      "grad_norm": 0.0026801645290106535,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 8740
+    },
+    {
+      "epoch": 0.2411834295036508,
+      "grad_norm": 0.007557088974863291,
+      "learning_rate": 0.001,
+      "loss": 0.4343,
+      "step": 8741
+    },
+    {
+      "epoch": 0.24121102170471517,
+      "grad_norm": 0.02677319385111332,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 8742
+    },
+    {
+      "epoch": 0.24123861390577953,
+      "grad_norm": 0.0026736543513834476,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 8743
+    },
+    {
+      "epoch": 0.2412662061068439,
+      "grad_norm": 0.0023438511416316032,
+      "learning_rate": 0.001,
+      "loss": 0.4387,
+      "step": 8744
+    },
+    {
+      "epoch": 0.24129379830790826,
+      "grad_norm": 0.004221671260893345,
+      "learning_rate": 0.001,
+      "loss": 0.3227,
+      "step": 8745
+    },
+    {
+      "epoch": 0.24132139050897264,
+      "grad_norm": 0.004964245017617941,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 8746
+    },
+    {
+      "epoch": 0.24134898271003702,
+      "grad_norm": 0.004307127092033625,
+      "learning_rate": 0.001,
+      "loss": 0.3757,
+      "step": 8747
+    },
+    {
+      "epoch": 0.24137657491110137,
+      "grad_norm": 0.004791958257555962,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 8748
+    },
+    {
+      "epoch": 0.24140416711216575,
+      "grad_norm": 0.004529708996415138,
+      "learning_rate": 0.001,
+      "loss": 0.3639,
+      "step": 8749
+    },
+    {
+      "epoch": 0.2414317593132301,
+      "grad_norm": 0.004783578682690859,
+      "learning_rate": 0.001,
+      "loss": 0.3648,
+      "step": 8750
+    },
+    {
+      "epoch": 0.24145935151429448,
+      "grad_norm": 0.003613299923017621,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 8751
+    },
+    {
+      "epoch": 0.24148694371535886,
+      "grad_norm": 0.003846370615065098,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 8752
+    },
+    {
+      "epoch": 0.24151453591642322,
+      "grad_norm": 0.002877620980143547,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 8753
+    },
+    {
+      "epoch": 0.2415421281174876,
+      "grad_norm": 0.004218699410557747,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 8754
+    },
+    {
+      "epoch": 0.24156972031855195,
+      "grad_norm": 0.003392399987205863,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 8755
+    },
+    {
+      "epoch": 0.24159731251961633,
+      "grad_norm": 0.0036800485104322433,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 8756
+    },
+    {
+      "epoch": 0.2416249047206807,
+      "grad_norm": 0.0023208935745060444,
+      "learning_rate": 0.001,
+      "loss": 0.4371,
+      "step": 8757
+    },
+    {
+      "epoch": 0.24165249692174506,
+      "grad_norm": 0.0029652283992618322,
+      "learning_rate": 0.001,
+      "loss": 0.3543,
+      "step": 8758
+    },
+    {
+      "epoch": 0.24168008912280944,
+      "grad_norm": 0.00395188620314002,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 8759
+    },
+    {
+      "epoch": 0.2417076813238738,
+      "grad_norm": 0.0028020909521728754,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 8760
+    },
+    {
+      "epoch": 0.24173527352493818,
+      "grad_norm": 0.002905269619077444,
+      "learning_rate": 0.001,
+      "loss": 0.3582,
+      "step": 8761
+    },
+    {
+      "epoch": 0.24176286572600256,
+      "grad_norm": 0.0026353024877607822,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 8762
+    },
+    {
+      "epoch": 0.2417904579270669,
+      "grad_norm": 0.009191271848976612,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 8763
+    },
+    {
+      "epoch": 0.2418180501281313,
+      "grad_norm": 0.002872015815228224,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 8764
+    },
+    {
+      "epoch": 0.24184564232919564,
+      "grad_norm": 0.0035799501929432154,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 8765
+    },
+    {
+      "epoch": 0.24187323453026002,
+      "grad_norm": 0.0039528412744402885,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 8766
+    },
+    {
+      "epoch": 0.2419008267313244,
+      "grad_norm": 0.004701048135757446,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 8767
+    },
+    {
+      "epoch": 0.24192841893238876,
+      "grad_norm": 0.0029811752028763294,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 8768
+    },
+    {
+      "epoch": 0.24195601113345314,
+      "grad_norm": 0.00273152650333941,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 8769
+    },
+    {
+      "epoch": 0.2419836033345175,
+      "grad_norm": 0.0036260110791772604,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 8770
+    },
+    {
+      "epoch": 0.24201119553558187,
+      "grad_norm": 0.0030035688541829586,
+      "learning_rate": 0.001,
+      "loss": 0.4333,
+      "step": 8771
+    },
+    {
+      "epoch": 0.24203878773664625,
+      "grad_norm": 0.003084450261667371,
+      "learning_rate": 0.001,
+      "loss": 0.4114,
+      "step": 8772
+    },
+    {
+      "epoch": 0.2420663799377106,
+      "grad_norm": 0.0035567975137382746,
+      "learning_rate": 0.001,
+      "loss": 0.3657,
+      "step": 8773
+    },
+    {
+      "epoch": 0.24209397213877498,
+      "grad_norm": 0.0024042977020144463,
+      "learning_rate": 0.001,
+      "loss": 0.4216,
+      "step": 8774
+    },
+    {
+      "epoch": 0.24212156433983933,
+      "grad_norm": 0.005611390806734562,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 8775
+    },
+    {
+      "epoch": 0.24214915654090371,
+      "grad_norm": 0.0022727535106241703,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 8776
+    },
+    {
+      "epoch": 0.2421767487419681,
+      "grad_norm": 0.004948400892317295,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 8777
+    },
+    {
+      "epoch": 0.24220434094303245,
+      "grad_norm": 0.005593698471784592,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 8778
+    },
+    {
+      "epoch": 0.24223193314409683,
+      "grad_norm": 0.002849163953214884,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 8779
+    },
+    {
+      "epoch": 0.24225952534516118,
+      "grad_norm": 0.0032104626297950745,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 8780
+    },
+    {
+      "epoch": 0.24228711754622556,
+      "grad_norm": 0.0051482305862009525,
+      "learning_rate": 0.001,
+      "loss": 0.3794,
+      "step": 8781
+    },
+    {
+      "epoch": 0.24231470974728994,
+      "grad_norm": 0.0035567518789321184,
+      "learning_rate": 0.001,
+      "loss": 0.3672,
+      "step": 8782
+    },
+    {
+      "epoch": 0.2423423019483543,
+      "grad_norm": 0.0023606910835951567,
+      "learning_rate": 0.001,
+      "loss": 0.4256,
+      "step": 8783
+    },
+    {
+      "epoch": 0.24236989414941867,
+      "grad_norm": 0.0028458137530833483,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 8784
+    },
+    {
+      "epoch": 0.24239748635048303,
+      "grad_norm": 0.0029930644668638706,
+      "learning_rate": 0.001,
+      "loss": 0.4275,
+      "step": 8785
+    },
+    {
+      "epoch": 0.2424250785515474,
+      "grad_norm": 0.0022928270045667887,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 8786
+    },
+    {
+      "epoch": 0.2424526707526118,
+      "grad_norm": 0.003093563485890627,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 8787
+    },
+    {
+      "epoch": 0.24248026295367614,
+      "grad_norm": 0.002772730076685548,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 8788
+    },
+    {
+      "epoch": 0.24250785515474052,
+      "grad_norm": 0.00274568609893322,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 8789
+    },
+    {
+      "epoch": 0.24253544735580487,
+      "grad_norm": 0.0022818988654762506,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 8790
+    },
+    {
+      "epoch": 0.24256303955686925,
+      "grad_norm": 0.004985783249139786,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 8791
+    },
+    {
+      "epoch": 0.24259063175793363,
+      "grad_norm": 0.00211884337477386,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 8792
+    },
+    {
+      "epoch": 0.24261822395899799,
+      "grad_norm": 0.00450787041336298,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 8793
+    },
+    {
+      "epoch": 0.24264581616006237,
+      "grad_norm": 0.003899297444149852,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 8794
+    },
+    {
+      "epoch": 0.24267340836112672,
+      "grad_norm": 0.002133857225999236,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 8795
+    },
+    {
+      "epoch": 0.2427010005621911,
+      "grad_norm": 0.002347117057070136,
+      "learning_rate": 0.001,
+      "loss": 0.4273,
+      "step": 8796
+    },
+    {
+      "epoch": 0.24272859276325548,
+      "grad_norm": 0.0024913426022976637,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 8797
+    },
+    {
+      "epoch": 0.24275618496431983,
+      "grad_norm": 0.0028214750345796347,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 8798
+    },
+    {
+      "epoch": 0.2427837771653842,
+      "grad_norm": 0.002642587758600712,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 8799
+    },
+    {
+      "epoch": 0.24281136936644857,
+      "grad_norm": 0.002706260420382023,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 8800
+    },
+    {
+      "epoch": 0.24283896156751295,
+      "grad_norm": 0.0023238682188093662,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 8801
+    },
+    {
+      "epoch": 0.2428665537685773,
+      "grad_norm": 0.003259580582380295,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 8802
+    },
+    {
+      "epoch": 0.24289414596964168,
+      "grad_norm": 0.007316852454096079,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 8803
+    },
+    {
+      "epoch": 0.24292173817070606,
+      "grad_norm": 0.003550121560692787,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 8804
+    },
+    {
+      "epoch": 0.2429493303717704,
+      "grad_norm": 0.002780807903036475,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 8805
+    },
+    {
+      "epoch": 0.2429769225728348,
+      "grad_norm": 0.0035296916030347347,
+      "learning_rate": 0.001,
+      "loss": 0.3655,
+      "step": 8806
+    },
+    {
+      "epoch": 0.24300451477389914,
+      "grad_norm": 0.005781420040875673,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 8807
+    },
+    {
+      "epoch": 0.24303210697496352,
+      "grad_norm": 0.0032384470105171204,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 8808
+    },
+    {
+      "epoch": 0.2430596991760279,
+      "grad_norm": 0.007727981545031071,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 8809
+    },
+    {
+      "epoch": 0.24308729137709226,
+      "grad_norm": 0.024563631042838097,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 8810
+    },
+    {
+      "epoch": 0.24311488357815664,
+      "grad_norm": 0.00322868674993515,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 8811
+    },
+    {
+      "epoch": 0.243142475779221,
+      "grad_norm": 0.004947735462337732,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 8812
+    },
+    {
+      "epoch": 0.24317006798028537,
+      "grad_norm": 0.004064179491251707,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 8813
+    },
+    {
+      "epoch": 0.24319766018134975,
+      "grad_norm": 0.005366888828575611,
+      "learning_rate": 0.001,
+      "loss": 0.4295,
+      "step": 8814
+    },
+    {
+      "epoch": 0.2432252523824141,
+      "grad_norm": 0.012629834935069084,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 8815
+    },
+    {
+      "epoch": 0.24325284458347848,
+      "grad_norm": 0.004325262736529112,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 8816
+    },
+    {
+      "epoch": 0.24328043678454284,
+      "grad_norm": 0.002770826453343034,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 8817
+    },
+    {
+      "epoch": 0.24330802898560722,
+      "grad_norm": 0.005562702193856239,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 8818
+    },
+    {
+      "epoch": 0.2433356211866716,
+      "grad_norm": 0.002646266482770443,
+      "learning_rate": 0.001,
+      "loss": 0.4243,
+      "step": 8819
+    },
+    {
+      "epoch": 0.24336321338773595,
+      "grad_norm": 0.003909732680767775,
+      "learning_rate": 0.001,
+      "loss": 0.3725,
+      "step": 8820
+    },
+    {
+      "epoch": 0.24339080558880033,
+      "grad_norm": 0.0035097142681479454,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 8821
+    },
+    {
+      "epoch": 0.24341839778986468,
+      "grad_norm": 0.002032148651778698,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 8822
+    },
+    {
+      "epoch": 0.24344598999092906,
+      "grad_norm": 0.00408203387632966,
+      "learning_rate": 0.001,
+      "loss": 0.4452,
+      "step": 8823
+    },
+    {
+      "epoch": 0.24347358219199344,
+      "grad_norm": 0.0028013025876134634,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 8824
+    },
+    {
+      "epoch": 0.2435011743930578,
+      "grad_norm": 0.0023070168681442738,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 8825
+    },
+    {
+      "epoch": 0.24352876659412218,
+      "grad_norm": 0.006049746181815863,
+      "learning_rate": 0.001,
+      "loss": 0.3658,
+      "step": 8826
+    },
+    {
+      "epoch": 0.24355635879518653,
+      "grad_norm": 0.004005006048828363,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 8827
+    },
+    {
+      "epoch": 0.2435839509962509,
+      "grad_norm": 0.003078661160543561,
+      "learning_rate": 0.001,
+      "loss": 0.3725,
+      "step": 8828
+    },
+    {
+      "epoch": 0.2436115431973153,
+      "grad_norm": 0.0025324684102088213,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 8829
+    },
+    {
+      "epoch": 0.24363913539837964,
+      "grad_norm": 0.0028970125131309032,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 8830
+    },
+    {
+      "epoch": 0.24366672759944402,
+      "grad_norm": 0.0022717502433806658,
+      "learning_rate": 0.001,
+      "loss": 0.3677,
+      "step": 8831
+    },
+    {
+      "epoch": 0.24369431980050837,
+      "grad_norm": 0.0029859100468456745,
+      "learning_rate": 0.001,
+      "loss": 0.3481,
+      "step": 8832
+    },
+    {
+      "epoch": 0.24372191200157275,
+      "grad_norm": 0.002873897785320878,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 8833
+    },
+    {
+      "epoch": 0.24374950420263714,
+      "grad_norm": 0.00388680980540812,
+      "learning_rate": 0.001,
+      "loss": 0.3748,
+      "step": 8834
+    },
+    {
+      "epoch": 0.2437770964037015,
+      "grad_norm": 0.0036059573758393526,
+      "learning_rate": 0.001,
+      "loss": 0.4325,
+      "step": 8835
+    },
+    {
+      "epoch": 0.24380468860476587,
+      "grad_norm": 0.004464290104806423,
+      "learning_rate": 0.001,
+      "loss": 0.3621,
+      "step": 8836
+    },
+    {
+      "epoch": 0.24383228080583022,
+      "grad_norm": 0.002686945255845785,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 8837
+    },
+    {
+      "epoch": 0.2438598730068946,
+      "grad_norm": 0.0031961945351213217,
+      "learning_rate": 0.001,
+      "loss": 0.3662,
+      "step": 8838
+    },
+    {
+      "epoch": 0.24388746520795898,
+      "grad_norm": 0.002320102881640196,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 8839
+    },
+    {
+      "epoch": 0.24391505740902333,
+      "grad_norm": 0.003384110052138567,
+      "learning_rate": 0.001,
+      "loss": 0.3677,
+      "step": 8840
+    },
+    {
+      "epoch": 0.24394264961008771,
+      "grad_norm": 0.002216142136603594,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 8841
+    },
+    {
+      "epoch": 0.24397024181115207,
+      "grad_norm": 0.0023850633297115564,
+      "learning_rate": 0.001,
+      "loss": 0.4719,
+      "step": 8842
+    },
+    {
+      "epoch": 0.24399783401221645,
+      "grad_norm": 0.0023910165764391422,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 8843
+    },
+    {
+      "epoch": 0.24402542621328083,
+      "grad_norm": 0.002412325469776988,
+      "learning_rate": 0.001,
+      "loss": 0.3571,
+      "step": 8844
+    },
+    {
+      "epoch": 0.24405301841434518,
+      "grad_norm": 0.003925090655684471,
+      "learning_rate": 0.001,
+      "loss": 0.358,
+      "step": 8845
+    },
+    {
+      "epoch": 0.24408061061540956,
+      "grad_norm": 0.002139107324182987,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 8846
+    },
+    {
+      "epoch": 0.2441082028164739,
+      "grad_norm": 0.003215159522369504,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 8847
+    },
+    {
+      "epoch": 0.2441357950175383,
+      "grad_norm": 0.0029652882367372513,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 8848
+    },
+    {
+      "epoch": 0.24416338721860267,
+      "grad_norm": 0.0026338063180446625,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 8849
+    },
+    {
+      "epoch": 0.24419097941966703,
+      "grad_norm": 0.00244903308339417,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 8850
+    },
+    {
+      "epoch": 0.2442185716207314,
+      "grad_norm": 0.002739574061706662,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 8851
+    },
+    {
+      "epoch": 0.24424616382179576,
+      "grad_norm": 0.003445478854700923,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 8852
+    },
+    {
+      "epoch": 0.24427375602286014,
+      "grad_norm": 0.0026570416521281004,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 8853
+    },
+    {
+      "epoch": 0.24430134822392452,
+      "grad_norm": 0.0024555225390940905,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 8854
+    },
+    {
+      "epoch": 0.24432894042498887,
+      "grad_norm": 0.004189019091427326,
+      "learning_rate": 0.001,
+      "loss": 0.3589,
+      "step": 8855
+    },
+    {
+      "epoch": 0.24435653262605325,
+      "grad_norm": 0.0035908452700823545,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 8856
+    },
+    {
+      "epoch": 0.2443841248271176,
+      "grad_norm": 0.002694958820939064,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 8857
+    },
+    {
+      "epoch": 0.24441171702818199,
+      "grad_norm": 0.002945966785773635,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 8858
+    },
+    {
+      "epoch": 0.24443930922924637,
+      "grad_norm": 0.002466941252350807,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 8859
+    },
+    {
+      "epoch": 0.24446690143031072,
+      "grad_norm": 0.003148929215967655,
+      "learning_rate": 0.001,
+      "loss": 0.4356,
+      "step": 8860
+    },
+    {
+      "epoch": 0.2444944936313751,
+      "grad_norm": 0.0029636970721185207,
+      "learning_rate": 0.001,
+      "loss": 0.3543,
+      "step": 8861
+    },
+    {
+      "epoch": 0.24452208583243945,
+      "grad_norm": 0.002561190165579319,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 8862
+    },
+    {
+      "epoch": 0.24454967803350383,
+      "grad_norm": 0.0031816980335861444,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 8863
+    },
+    {
+      "epoch": 0.2445772702345682,
+      "grad_norm": 0.0030445698648691177,
+      "learning_rate": 0.001,
+      "loss": 0.4252,
+      "step": 8864
+    },
+    {
+      "epoch": 0.24460486243563256,
+      "grad_norm": 0.003067183308303356,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 8865
+    },
+    {
+      "epoch": 0.24463245463669694,
+      "grad_norm": 0.002353479852899909,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 8866
+    },
+    {
+      "epoch": 0.2446600468377613,
+      "grad_norm": 0.0058910418301820755,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 8867
+    },
+    {
+      "epoch": 0.24468763903882568,
+      "grad_norm": 0.003310910891741514,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 8868
+    },
+    {
+      "epoch": 0.24471523123989006,
+      "grad_norm": 0.003525955369696021,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 8869
+    },
+    {
+      "epoch": 0.2447428234409544,
+      "grad_norm": 0.00252323760651052,
+      "learning_rate": 0.001,
+      "loss": 0.3562,
+      "step": 8870
+    },
+    {
+      "epoch": 0.2447704156420188,
+      "grad_norm": 0.0025553128216415644,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 8871
+    },
+    {
+      "epoch": 0.24479800784308314,
+      "grad_norm": 0.003628962906077504,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 8872
+    },
+    {
+      "epoch": 0.24482560004414752,
+      "grad_norm": 0.00299929385073483,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 8873
+    },
+    {
+      "epoch": 0.2448531922452119,
+      "grad_norm": 0.004137910902500153,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 8874
+    },
+    {
+      "epoch": 0.24488078444627626,
+      "grad_norm": 0.0027973924297839403,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 8875
+    },
+    {
+      "epoch": 0.24490837664734064,
+      "grad_norm": 0.0029198199044913054,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 8876
+    },
+    {
+      "epoch": 0.244935968848405,
+      "grad_norm": 0.0027383132837712765,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 8877
+    },
+    {
+      "epoch": 0.24496356104946937,
+      "grad_norm": 0.00903650838881731,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 8878
+    },
+    {
+      "epoch": 0.24499115325053375,
+      "grad_norm": 0.01571609638631344,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 8879
+    },
+    {
+      "epoch": 0.2450187454515981,
+      "grad_norm": 0.0059790583327412605,
+      "learning_rate": 0.001,
+      "loss": 0.372,
+      "step": 8880
+    },
+    {
+      "epoch": 0.24504633765266248,
+      "grad_norm": 0.0021693052258342505,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 8881
+    },
+    {
+      "epoch": 0.24507392985372684,
+      "grad_norm": 0.002195873064920306,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 8882
+    },
+    {
+      "epoch": 0.24510152205479122,
+      "grad_norm": 0.0032636993564665318,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 8883
+    },
+    {
+      "epoch": 0.2451291142558556,
+      "grad_norm": 0.005068526603281498,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 8884
+    },
+    {
+      "epoch": 0.24515670645691995,
+      "grad_norm": 0.0020894519984722137,
+      "learning_rate": 0.001,
+      "loss": 0.4464,
+      "step": 8885
+    },
+    {
+      "epoch": 0.24518429865798433,
+      "grad_norm": 0.0024781054817140102,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 8886
+    },
+    {
+      "epoch": 0.24521189085904868,
+      "grad_norm": 0.002742476761341095,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 8887
+    },
+    {
+      "epoch": 0.24523948306011306,
+      "grad_norm": 0.0031709850300103426,
+      "learning_rate": 0.001,
+      "loss": 0.3621,
+      "step": 8888
+    },
+    {
+      "epoch": 0.24526707526117744,
+      "grad_norm": 0.00277856457978487,
+      "learning_rate": 0.001,
+      "loss": 0.4364,
+      "step": 8889
+    },
+    {
+      "epoch": 0.2452946674622418,
+      "grad_norm": 0.004140378907322884,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 8890
+    },
+    {
+      "epoch": 0.24532225966330617,
+      "grad_norm": 0.0283675380051136,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 8891
+    },
+    {
+      "epoch": 0.24534985186437053,
+      "grad_norm": 0.010619306936860085,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 8892
+    },
+    {
+      "epoch": 0.2453774440654349,
+      "grad_norm": 0.0036299237981438637,
+      "learning_rate": 0.001,
+      "loss": 0.4419,
+      "step": 8893
+    },
+    {
+      "epoch": 0.24540503626649926,
+      "grad_norm": 0.00281274551525712,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 8894
+    },
+    {
+      "epoch": 0.24543262846756364,
+      "grad_norm": 0.0032774037681519985,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 8895
+    },
+    {
+      "epoch": 0.24546022066862802,
+      "grad_norm": 0.004956527147442102,
+      "learning_rate": 0.001,
+      "loss": 0.3636,
+      "step": 8896
+    },
+    {
+      "epoch": 0.24548781286969237,
+      "grad_norm": 0.003114642109721899,
+      "learning_rate": 0.001,
+      "loss": 0.3618,
+      "step": 8897
+    },
+    {
+      "epoch": 0.24551540507075675,
+      "grad_norm": 0.012209619395434856,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 8898
+    },
+    {
+      "epoch": 0.2455429972718211,
+      "grad_norm": 0.005186257418245077,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 8899
+    },
+    {
+      "epoch": 0.2455705894728855,
+      "grad_norm": 0.003014913760125637,
+      "learning_rate": 0.001,
+      "loss": 0.3578,
+      "step": 8900
+    },
+    {
+      "epoch": 0.24559818167394987,
+      "grad_norm": 0.0029307794757187366,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 8901
+    },
+    {
+      "epoch": 0.24562577387501422,
+      "grad_norm": 0.0025628958828747272,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 8902
+    },
+    {
+      "epoch": 0.2456533660760786,
+      "grad_norm": 0.0026851852890104055,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 8903
+    },
+    {
+      "epoch": 0.24568095827714295,
+      "grad_norm": 0.0029029424767941236,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 8904
+    },
+    {
+      "epoch": 0.24570855047820733,
+      "grad_norm": 0.0035970478784292936,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 8905
+    },
+    {
+      "epoch": 0.2457361426792717,
+      "grad_norm": 0.006312237121164799,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 8906
+    },
+    {
+      "epoch": 0.24576373488033607,
+      "grad_norm": 0.005121266935020685,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 8907
+    },
+    {
+      "epoch": 0.24579132708140045,
+      "grad_norm": 0.003577583469450474,
+      "learning_rate": 0.001,
+      "loss": 0.3669,
+      "step": 8908
+    },
+    {
+      "epoch": 0.2458189192824648,
+      "grad_norm": 0.0027252668514847755,
+      "learning_rate": 0.001,
+      "loss": 0.3777,
+      "step": 8909
+    },
+    {
+      "epoch": 0.24584651148352918,
+      "grad_norm": 0.0026001029182225466,
+      "learning_rate": 0.001,
+      "loss": 0.3571,
+      "step": 8910
+    },
+    {
+      "epoch": 0.24587410368459356,
+      "grad_norm": 0.0025208101142197847,
+      "learning_rate": 0.001,
+      "loss": 0.4442,
+      "step": 8911
+    },
+    {
+      "epoch": 0.2459016958856579,
+      "grad_norm": 0.004707024432718754,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 8912
+    },
+    {
+      "epoch": 0.2459292880867223,
+      "grad_norm": 0.0031555844470858574,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 8913
+    },
+    {
+      "epoch": 0.24595688028778664,
+      "grad_norm": 0.002120368182659149,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 8914
+    },
+    {
+      "epoch": 0.24598447248885102,
+      "grad_norm": 0.002101072110235691,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 8915
+    },
+    {
+      "epoch": 0.2460120646899154,
+      "grad_norm": 0.0030265292152762413,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 8916
+    },
+    {
+      "epoch": 0.24603965689097976,
+      "grad_norm": 0.0027274913154542446,
+      "learning_rate": 0.001,
+      "loss": 0.3491,
+      "step": 8917
+    },
+    {
+      "epoch": 0.24606724909204414,
+      "grad_norm": 0.0028532680589705706,
+      "learning_rate": 0.001,
+      "loss": 0.3599,
+      "step": 8918
+    },
+    {
+      "epoch": 0.2460948412931085,
+      "grad_norm": 0.003202751511707902,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 8919
+    },
+    {
+      "epoch": 0.24612243349417287,
+      "grad_norm": 0.00257576210424304,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 8920
+    },
+    {
+      "epoch": 0.24615002569523725,
+      "grad_norm": 0.002339770086109638,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 8921
+    },
+    {
+      "epoch": 0.2461776178963016,
+      "grad_norm": 0.0026211580261588097,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 8922
+    },
+    {
+      "epoch": 0.24620521009736598,
+      "grad_norm": 0.002310456009581685,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 8923
+    },
+    {
+      "epoch": 0.24623280229843034,
+      "grad_norm": 0.0026841405779123306,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 8924
+    },
+    {
+      "epoch": 0.24626039449949472,
+      "grad_norm": 0.0026302135083824396,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 8925
+    },
+    {
+      "epoch": 0.2462879867005591,
+      "grad_norm": 0.0022160590160638094,
+      "learning_rate": 0.001,
+      "loss": 0.4267,
+      "step": 8926
+    },
+    {
+      "epoch": 0.24631557890162345,
+      "grad_norm": 0.0031570009887218475,
+      "learning_rate": 0.001,
+      "loss": 0.3834,
+      "step": 8927
+    },
+    {
+      "epoch": 0.24634317110268783,
+      "grad_norm": 0.0042785764671862125,
+      "learning_rate": 0.001,
+      "loss": 0.3567,
+      "step": 8928
+    },
+    {
+      "epoch": 0.24637076330375218,
+      "grad_norm": 0.00619478989392519,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 8929
+    },
+    {
+      "epoch": 0.24639835550481656,
+      "grad_norm": 0.0054093278013169765,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 8930
+    },
+    {
+      "epoch": 0.24642594770588094,
+      "grad_norm": 0.004795154556632042,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 8931
+    },
+    {
+      "epoch": 0.2464535399069453,
+      "grad_norm": 0.003340116934850812,
+      "learning_rate": 0.001,
+      "loss": 0.4441,
+      "step": 8932
+    },
+    {
+      "epoch": 0.24648113210800968,
+      "grad_norm": 0.0024299235083162785,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 8933
+    },
+    {
+      "epoch": 0.24650872430907403,
+      "grad_norm": 0.0033976933918893337,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 8934
+    },
+    {
+      "epoch": 0.2465363165101384,
+      "grad_norm": 0.004061064217239618,
+      "learning_rate": 0.001,
+      "loss": 0.4087,
+      "step": 8935
+    },
+    {
+      "epoch": 0.2465639087112028,
+      "grad_norm": 0.002939441241323948,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 8936
+    },
+    {
+      "epoch": 0.24659150091226714,
+      "grad_norm": 0.003068729070946574,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 8937
+    },
+    {
+      "epoch": 0.24661909311333152,
+      "grad_norm": 0.003941199276596308,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 8938
+    },
+    {
+      "epoch": 0.24664668531439587,
+      "grad_norm": 0.0029029848519712687,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 8939
+    },
+    {
+      "epoch": 0.24667427751546026,
+      "grad_norm": 0.0041121323592960835,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 8940
+    },
+    {
+      "epoch": 0.24670186971652464,
+      "grad_norm": 0.002195686800405383,
+      "learning_rate": 0.001,
+      "loss": 0.4237,
+      "step": 8941
+    },
+    {
+      "epoch": 0.246729461917589,
+      "grad_norm": 0.00511584896594286,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 8942
+    },
+    {
+      "epoch": 0.24675705411865337,
+      "grad_norm": 0.0033936495892703533,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 8943
+    },
+    {
+      "epoch": 0.24678464631971772,
+      "grad_norm": 0.002642909763380885,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 8944
+    },
+    {
+      "epoch": 0.2468122385207821,
+      "grad_norm": 0.007020313758403063,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 8945
+    },
+    {
+      "epoch": 0.24683983072184648,
+      "grad_norm": 0.0026212832890450954,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 8946
+    },
+    {
+      "epoch": 0.24686742292291083,
+      "grad_norm": 0.00335301854647696,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 8947
+    },
+    {
+      "epoch": 0.24689501512397521,
+      "grad_norm": 0.004139469005167484,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 8948
+    },
+    {
+      "epoch": 0.24692260732503957,
+      "grad_norm": 0.004907173104584217,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 8949
+    },
+    {
+      "epoch": 0.24695019952610395,
+      "grad_norm": 0.0036863782443106174,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 8950
+    },
+    {
+      "epoch": 0.24697779172716833,
+      "grad_norm": 0.004586333874613047,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 8951
+    },
+    {
+      "epoch": 0.24700538392823268,
+      "grad_norm": 0.002670851768925786,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 8952
+    },
+    {
+      "epoch": 0.24703297612929706,
+      "grad_norm": 0.003045836230739951,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 8953
+    },
+    {
+      "epoch": 0.2470605683303614,
+      "grad_norm": 0.004562459886074066,
+      "learning_rate": 0.001,
+      "loss": 0.3694,
+      "step": 8954
+    },
+    {
+      "epoch": 0.2470881605314258,
+      "grad_norm": 0.003939764108508825,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 8955
+    },
+    {
+      "epoch": 0.24711575273249017,
+      "grad_norm": 0.00309895072132349,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 8956
+    },
+    {
+      "epoch": 0.24714334493355453,
+      "grad_norm": 0.00386620219796896,
+      "learning_rate": 0.001,
+      "loss": 0.3703,
+      "step": 8957
+    },
+    {
+      "epoch": 0.2471709371346189,
+      "grad_norm": 0.01034584641456604,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 8958
+    },
+    {
+      "epoch": 0.24719852933568326,
+      "grad_norm": 0.002349577145650983,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 8959
+    },
+    {
+      "epoch": 0.24722612153674764,
+      "grad_norm": 0.003580132033675909,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 8960
+    },
+    {
+      "epoch": 0.24725371373781202,
+      "grad_norm": 0.0027131785172969103,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 8961
+    },
+    {
+      "epoch": 0.24728130593887637,
+      "grad_norm": 0.0030202209018170834,
+      "learning_rate": 0.001,
+      "loss": 0.3688,
+      "step": 8962
+    },
+    {
+      "epoch": 0.24730889813994075,
+      "grad_norm": 0.002842100802809,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 8963
+    },
+    {
+      "epoch": 0.2473364903410051,
+      "grad_norm": 0.002618222264572978,
+      "learning_rate": 0.001,
+      "loss": 0.4299,
+      "step": 8964
+    },
+    {
+      "epoch": 0.24736408254206949,
+      "grad_norm": 0.0024017065297812223,
+      "learning_rate": 0.001,
+      "loss": 0.37,
+      "step": 8965
+    },
+    {
+      "epoch": 0.24739167474313387,
+      "grad_norm": 0.0025306292809545994,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 8966
+    },
+    {
+      "epoch": 0.24741926694419822,
+      "grad_norm": 0.004213306587189436,
+      "learning_rate": 0.001,
+      "loss": 0.4261,
+      "step": 8967
+    },
+    {
+      "epoch": 0.2474468591452626,
+      "grad_norm": 0.0023831671569496393,
+      "learning_rate": 0.001,
+      "loss": 0.4304,
+      "step": 8968
+    },
+    {
+      "epoch": 0.24747445134632695,
+      "grad_norm": 0.002815242623910308,
+      "learning_rate": 0.001,
+      "loss": 0.4499,
+      "step": 8969
+    },
+    {
+      "epoch": 0.24750204354739133,
+      "grad_norm": 0.0026342514902353287,
+      "learning_rate": 0.001,
+      "loss": 0.3721,
+      "step": 8970
+    },
+    {
+      "epoch": 0.2475296357484557,
+      "grad_norm": 0.004273373633623123,
+      "learning_rate": 0.001,
+      "loss": 0.3637,
+      "step": 8971
+    },
+    {
+      "epoch": 0.24755722794952006,
+      "grad_norm": 0.003125197486951947,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 8972
+    },
+    {
+      "epoch": 0.24758482015058444,
+      "grad_norm": 0.004526606295257807,
+      "learning_rate": 0.001,
+      "loss": 0.3571,
+      "step": 8973
+    },
+    {
+      "epoch": 0.2476124123516488,
+      "grad_norm": 0.0031581746879965067,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 8974
+    },
+    {
+      "epoch": 0.24764000455271318,
+      "grad_norm": 0.0028788463678210974,
+      "learning_rate": 0.001,
+      "loss": 0.364,
+      "step": 8975
+    },
+    {
+      "epoch": 0.24766759675377756,
+      "grad_norm": 0.004946670029312372,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 8976
+    },
+    {
+      "epoch": 0.2476951889548419,
+      "grad_norm": 0.004649583250284195,
+      "learning_rate": 0.001,
+      "loss": 0.419,
+      "step": 8977
+    },
+    {
+      "epoch": 0.2477227811559063,
+      "grad_norm": 0.0023594009689986706,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 8978
+    },
+    {
+      "epoch": 0.24775037335697064,
+      "grad_norm": 0.0028385408222675323,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 8979
+    },
+    {
+      "epoch": 0.24777796555803502,
+      "grad_norm": 0.0029238059651106596,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 8980
+    },
+    {
+      "epoch": 0.2478055577590994,
+      "grad_norm": 0.002410429297015071,
+      "learning_rate": 0.001,
+      "loss": 0.4243,
+      "step": 8981
+    },
+    {
+      "epoch": 0.24783314996016376,
+      "grad_norm": 0.0036725676618516445,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 8982
+    },
+    {
+      "epoch": 0.24786074216122814,
+      "grad_norm": 0.0026173696387559175,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 8983
+    },
+    {
+      "epoch": 0.2478883343622925,
+      "grad_norm": 0.0019191295141354203,
+      "learning_rate": 0.001,
+      "loss": 0.411,
+      "step": 8984
+    },
+    {
+      "epoch": 0.24791592656335687,
+      "grad_norm": 0.003160592168569565,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 8985
+    },
+    {
+      "epoch": 0.24794351876442125,
+      "grad_norm": 0.002665070816874504,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 8986
+    },
+    {
+      "epoch": 0.2479711109654856,
+      "grad_norm": 0.0035125231370329857,
+      "learning_rate": 0.001,
+      "loss": 0.3749,
+      "step": 8987
+    },
+    {
+      "epoch": 0.24799870316654998,
+      "grad_norm": 0.0034389530774205923,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 8988
+    },
+    {
+      "epoch": 0.24802629536761434,
+      "grad_norm": 0.00577655341476202,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 8989
+    },
+    {
+      "epoch": 0.24805388756867872,
+      "grad_norm": 0.002529504243284464,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 8990
+    },
+    {
+      "epoch": 0.24808147976974307,
+      "grad_norm": 0.003672944149002433,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 8991
+    },
+    {
+      "epoch": 0.24810907197080745,
+      "grad_norm": 0.004433492664247751,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 8992
+    },
+    {
+      "epoch": 0.24813666417187183,
+      "grad_norm": 0.0036997883580625057,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 8993
+    },
+    {
+      "epoch": 0.24816425637293618,
+      "grad_norm": 0.0071926964446902275,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 8994
+    },
+    {
+      "epoch": 0.24819184857400056,
+      "grad_norm": 0.002511198166757822,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 8995
+    },
+    {
+      "epoch": 0.24821944077506491,
+      "grad_norm": 0.0027406965382397175,
+      "learning_rate": 0.001,
+      "loss": 0.4401,
+      "step": 8996
+    },
+    {
+      "epoch": 0.2482470329761293,
+      "grad_norm": 0.0022513847798109055,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 8997
+    },
+    {
+      "epoch": 0.24827462517719368,
+      "grad_norm": 0.011038105934858322,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 8998
+    },
+    {
+      "epoch": 0.24830221737825803,
+      "grad_norm": 0.002501958515495062,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 8999
+    },
+    {
+      "epoch": 0.2483298095793224,
+      "grad_norm": 0.0029480638913810253,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 9000
+    },
+    {
+      "epoch": 0.2483298095793224,
+      "eval_runtime": 24.7639,
+      "eval_samples_per_second": 1.292,
+      "eval_steps_per_second": 0.162,
+      "step": 9000
+    },
+    {
+      "epoch": 0.24835740178038676,
+      "grad_norm": 0.0028752442449331284,
+      "learning_rate": 0.001,
+      "loss": 0.4275,
+      "step": 9001
+    },
+    {
+      "epoch": 0.24838499398145114,
+      "grad_norm": 0.002456225221976638,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 9002
+    },
+    {
+      "epoch": 0.24841258618251552,
+      "grad_norm": 0.0035426050890237093,
+      "learning_rate": 0.001,
+      "loss": 0.385,
+      "step": 9003
+    },
+    {
+      "epoch": 0.24844017838357987,
+      "grad_norm": 0.0030952002853155136,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 9004
+    },
+    {
+      "epoch": 0.24846777058464425,
+      "grad_norm": 0.004819031804800034,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 9005
+    },
+    {
+      "epoch": 0.2484953627857086,
+      "grad_norm": 0.007780123967677355,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 9006
+    },
+    {
+      "epoch": 0.248522954986773,
+      "grad_norm": 0.005544982384890318,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 9007
+    },
+    {
+      "epoch": 0.24855054718783737,
+      "grad_norm": 0.00238407077267766,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 9008
+    },
+    {
+      "epoch": 0.24857813938890172,
+      "grad_norm": 0.0024830265901982784,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 9009
+    },
+    {
+      "epoch": 0.2486057315899661,
+      "grad_norm": 0.001999202184379101,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 9010
+    },
+    {
+      "epoch": 0.24863332379103045,
+      "grad_norm": 0.0038518370129168034,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 9011
+    },
+    {
+      "epoch": 0.24866091599209483,
+      "grad_norm": 0.0025100288912653923,
+      "learning_rate": 0.001,
+      "loss": 0.4523,
+      "step": 9012
+    },
+    {
+      "epoch": 0.2486885081931592,
+      "grad_norm": 0.0034927844535559416,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 9013
+    },
+    {
+      "epoch": 0.24871610039422357,
+      "grad_norm": 0.0020984727889299393,
+      "learning_rate": 0.001,
+      "loss": 0.4255,
+      "step": 9014
+    },
+    {
+      "epoch": 0.24874369259528795,
+      "grad_norm": 0.003706206800416112,
+      "learning_rate": 0.001,
+      "loss": 0.3502,
+      "step": 9015
+    },
+    {
+      "epoch": 0.2487712847963523,
+      "grad_norm": 0.002861752174794674,
+      "learning_rate": 0.001,
+      "loss": 0.3768,
+      "step": 9016
+    },
+    {
+      "epoch": 0.24879887699741668,
+      "grad_norm": 0.0027587637305259705,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 9017
+    },
+    {
+      "epoch": 0.24882646919848106,
+      "grad_norm": 0.002627007896080613,
+      "learning_rate": 0.001,
+      "loss": 0.4274,
+      "step": 9018
+    },
+    {
+      "epoch": 0.2488540613995454,
+      "grad_norm": 0.0047552213072776794,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 9019
+    },
+    {
+      "epoch": 0.2488816536006098,
+      "grad_norm": 0.002344539389014244,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 9020
+    },
+    {
+      "epoch": 0.24890924580167414,
+      "grad_norm": 0.0031450914684683084,
+      "learning_rate": 0.001,
+      "loss": 0.3758,
+      "step": 9021
+    },
+    {
+      "epoch": 0.24893683800273853,
+      "grad_norm": 0.0026123332791030407,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 9022
+    },
+    {
+      "epoch": 0.2489644302038029,
+      "grad_norm": 0.005729448515921831,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 9023
+    },
+    {
+      "epoch": 0.24899202240486726,
+      "grad_norm": 0.003826259169727564,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 9024
+    },
+    {
+      "epoch": 0.24901961460593164,
+      "grad_norm": 0.005589526146650314,
+      "learning_rate": 0.001,
+      "loss": 0.3767,
+      "step": 9025
+    },
+    {
+      "epoch": 0.249047206806996,
+      "grad_norm": 0.003237026045098901,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 9026
+    },
+    {
+      "epoch": 0.24907479900806037,
+      "grad_norm": 0.0024905288591980934,
+      "learning_rate": 0.001,
+      "loss": 0.418,
+      "step": 9027
+    },
+    {
+      "epoch": 0.24910239120912475,
+      "grad_norm": 0.0029746349900960922,
+      "learning_rate": 0.001,
+      "loss": 0.437,
+      "step": 9028
+    },
+    {
+      "epoch": 0.2491299834101891,
+      "grad_norm": 0.003939430229365826,
+      "learning_rate": 0.001,
+      "loss": 0.3604,
+      "step": 9029
+    },
+    {
+      "epoch": 0.24915757561125348,
+      "grad_norm": 0.0025192126631736755,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 9030
+    },
+    {
+      "epoch": 0.24918516781231784,
+      "grad_norm": 0.0019102268852293491,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 9031
+    },
+    {
+      "epoch": 0.24921276001338222,
+      "grad_norm": 0.008570538833737373,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 9032
+    },
+    {
+      "epoch": 0.2492403522144466,
+      "grad_norm": 0.0023212565574795008,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 9033
+    },
+    {
+      "epoch": 0.24926794441551095,
+      "grad_norm": 0.0023950086906552315,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 9034
+    },
+    {
+      "epoch": 0.24929553661657533,
+      "grad_norm": 0.002595582278445363,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 9035
+    },
+    {
+      "epoch": 0.24932312881763968,
+      "grad_norm": 0.012406972236931324,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 9036
+    },
+    {
+      "epoch": 0.24935072101870406,
+      "grad_norm": 0.0029333438724279404,
+      "learning_rate": 0.001,
+      "loss": 0.3703,
+      "step": 9037
+    },
+    {
+      "epoch": 0.24937831321976844,
+      "grad_norm": 0.004842236638069153,
+      "learning_rate": 0.001,
+      "loss": 0.3641,
+      "step": 9038
+    },
+    {
+      "epoch": 0.2494059054208328,
+      "grad_norm": 0.0062524196691811085,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 9039
+    },
+    {
+      "epoch": 0.24943349762189718,
+      "grad_norm": 0.004324330948293209,
+      "learning_rate": 0.001,
+      "loss": 0.4325,
+      "step": 9040
+    },
+    {
+      "epoch": 0.24946108982296153,
+      "grad_norm": 0.0026493435725569725,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 9041
+    },
+    {
+      "epoch": 0.2494886820240259,
+      "grad_norm": 0.003301947610452771,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 9042
+    },
+    {
+      "epoch": 0.2495162742250903,
+      "grad_norm": 0.0035235814284533262,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 9043
+    },
+    {
+      "epoch": 0.24954386642615464,
+      "grad_norm": 0.002071363152936101,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 9044
+    },
+    {
+      "epoch": 0.24957145862721902,
+      "grad_norm": 0.0021638255566358566,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 9045
+    },
+    {
+      "epoch": 0.24959905082828338,
+      "grad_norm": 0.0027805992867797613,
+      "learning_rate": 0.001,
+      "loss": 0.3732,
+      "step": 9046
+    },
+    {
+      "epoch": 0.24962664302934776,
+      "grad_norm": 0.004270479548722506,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 9047
+    },
+    {
+      "epoch": 0.24965423523041214,
+      "grad_norm": 0.002844201633706689,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 9048
+    },
+    {
+      "epoch": 0.2496818274314765,
+      "grad_norm": 0.0030719267670065165,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 9049
+    },
+    {
+      "epoch": 0.24970941963254087,
+      "grad_norm": 0.008080038242042065,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 9050
+    },
+    {
+      "epoch": 0.24973701183360522,
+      "grad_norm": 0.0022514360025525093,
+      "learning_rate": 0.001,
+      "loss": 0.4391,
+      "step": 9051
+    },
+    {
+      "epoch": 0.2497646040346696,
+      "grad_norm": 0.0026551985647529364,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 9052
+    },
+    {
+      "epoch": 0.24979219623573398,
+      "grad_norm": 0.002240521600469947,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 9053
+    },
+    {
+      "epoch": 0.24981978843679833,
+      "grad_norm": 0.0024243989028036594,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 9054
+    },
+    {
+      "epoch": 0.24984738063786271,
+      "grad_norm": 0.004180778283625841,
+      "learning_rate": 0.001,
+      "loss": 0.4338,
+      "step": 9055
+    },
+    {
+      "epoch": 0.24987497283892707,
+      "grad_norm": 0.003084244905039668,
+      "learning_rate": 0.001,
+      "loss": 0.4469,
+      "step": 9056
+    },
+    {
+      "epoch": 0.24990256503999145,
+      "grad_norm": 0.0029490680899471045,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 9057
+    },
+    {
+      "epoch": 0.24993015724105583,
+      "grad_norm": 0.002250351244583726,
+      "learning_rate": 0.001,
+      "loss": 0.4503,
+      "step": 9058
+    },
+    {
+      "epoch": 0.24995774944212018,
+      "grad_norm": 0.0033541766460984945,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 9059
+    },
+    {
+      "epoch": 0.24998534164318456,
+      "grad_norm": 0.005710989236831665,
+      "learning_rate": 0.001,
+      "loss": 0.3707,
+      "step": 9060
+    },
+    {
+      "epoch": 0.25001293384424894,
+      "grad_norm": 0.002218688605353236,
+      "learning_rate": 0.001,
+      "loss": 0.4303,
+      "step": 9061
+    },
+    {
+      "epoch": 0.2500405260453133,
+      "grad_norm": 0.014134782366454601,
+      "learning_rate": 0.001,
+      "loss": 0.3534,
+      "step": 9062
+    },
+    {
+      "epoch": 0.25006811824637765,
+      "grad_norm": 0.003705043811351061,
+      "learning_rate": 0.001,
+      "loss": 0.4308,
+      "step": 9063
+    },
+    {
+      "epoch": 0.25009571044744205,
+      "grad_norm": 0.0030994920525699854,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 9064
+    },
+    {
+      "epoch": 0.2501233026485064,
+      "grad_norm": 0.002610408468171954,
+      "learning_rate": 0.001,
+      "loss": 0.3696,
+      "step": 9065
+    },
+    {
+      "epoch": 0.25015089484957076,
+      "grad_norm": 0.0018913348903879523,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 9066
+    },
+    {
+      "epoch": 0.2501784870506351,
+      "grad_norm": 0.002292625606060028,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 9067
+    },
+    {
+      "epoch": 0.2502060792516995,
+      "grad_norm": 0.0036623626947402954,
+      "learning_rate": 0.001,
+      "loss": 0.3768,
+      "step": 9068
+    },
+    {
+      "epoch": 0.2502336714527639,
+      "grad_norm": 0.002552927238866687,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 9069
+    },
+    {
+      "epoch": 0.2502612636538282,
+      "grad_norm": 0.005650006700307131,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 9070
+    },
+    {
+      "epoch": 0.25028885585489263,
+      "grad_norm": 0.0022689776960760355,
+      "learning_rate": 0.001,
+      "loss": 0.4568,
+      "step": 9071
+    },
+    {
+      "epoch": 0.250316448055957,
+      "grad_norm": 0.002632437041029334,
+      "learning_rate": 0.001,
+      "loss": 0.4326,
+      "step": 9072
+    },
+    {
+      "epoch": 0.25034404025702134,
+      "grad_norm": 0.004504368640482426,
+      "learning_rate": 0.001,
+      "loss": 0.3648,
+      "step": 9073
+    },
+    {
+      "epoch": 0.25037163245808575,
+      "grad_norm": 0.0025573919992893934,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 9074
+    },
+    {
+      "epoch": 0.2503992246591501,
+      "grad_norm": 0.00638620974496007,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 9075
+    },
+    {
+      "epoch": 0.25042681686021445,
+      "grad_norm": 0.002866227412596345,
+      "learning_rate": 0.001,
+      "loss": 0.4103,
+      "step": 9076
+    },
+    {
+      "epoch": 0.2504544090612788,
+      "grad_norm": 0.002649690955877304,
+      "learning_rate": 0.001,
+      "loss": 0.3756,
+      "step": 9077
+    },
+    {
+      "epoch": 0.2504820012623432,
+      "grad_norm": 0.002525368705391884,
+      "learning_rate": 0.001,
+      "loss": 0.444,
+      "step": 9078
+    },
+    {
+      "epoch": 0.25050959346340756,
+      "grad_norm": 0.008922641165554523,
+      "learning_rate": 0.001,
+      "loss": 0.4392,
+      "step": 9079
+    },
+    {
+      "epoch": 0.2505371856644719,
+      "grad_norm": 0.0028822512831538916,
+      "learning_rate": 0.001,
+      "loss": 0.4405,
+      "step": 9080
+    },
+    {
+      "epoch": 0.2505647778655363,
+      "grad_norm": 0.0031333775259554386,
+      "learning_rate": 0.001,
+      "loss": 0.3556,
+      "step": 9081
+    },
+    {
+      "epoch": 0.2505923700666007,
+      "grad_norm": 0.0029146878514438868,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 9082
+    },
+    {
+      "epoch": 0.25061996226766503,
+      "grad_norm": 0.005818530917167664,
+      "learning_rate": 0.001,
+      "loss": 0.4303,
+      "step": 9083
+    },
+    {
+      "epoch": 0.25064755446872944,
+      "grad_norm": 0.0026978689711540937,
+      "learning_rate": 0.001,
+      "loss": 0.4441,
+      "step": 9084
+    },
+    {
+      "epoch": 0.2506751466697938,
+      "grad_norm": 0.005949284881353378,
+      "learning_rate": 0.001,
+      "loss": 0.3697,
+      "step": 9085
+    },
+    {
+      "epoch": 0.25070273887085814,
+      "grad_norm": 0.003694251412525773,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 9086
+    },
+    {
+      "epoch": 0.2507303310719225,
+      "grad_norm": 0.005075919907540083,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 9087
+    },
+    {
+      "epoch": 0.2507579232729869,
+      "grad_norm": 0.002228903118520975,
+      "learning_rate": 0.001,
+      "loss": 0.4303,
+      "step": 9088
+    },
+    {
+      "epoch": 0.25078551547405126,
+      "grad_norm": 0.002552257152274251,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 9089
+    },
+    {
+      "epoch": 0.2508131076751156,
+      "grad_norm": 0.004208039958029985,
+      "learning_rate": 0.001,
+      "loss": 0.4442,
+      "step": 9090
+    },
+    {
+      "epoch": 0.25084069987618,
+      "grad_norm": 0.004520753864198923,
+      "learning_rate": 0.001,
+      "loss": 0.3442,
+      "step": 9091
+    },
+    {
+      "epoch": 0.25086829207724437,
+      "grad_norm": 0.002299990737810731,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 9092
+    },
+    {
+      "epoch": 0.2508958842783087,
+      "grad_norm": 0.003526929300278425,
+      "learning_rate": 0.001,
+      "loss": 0.3555,
+      "step": 9093
+    },
+    {
+      "epoch": 0.25092347647937313,
+      "grad_norm": 0.0026393721345812082,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 9094
+    },
+    {
+      "epoch": 0.2509510686804375,
+      "grad_norm": 0.0042528389021754265,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 9095
+    },
+    {
+      "epoch": 0.25097866088150184,
+      "grad_norm": 0.004804633557796478,
+      "learning_rate": 0.001,
+      "loss": 0.3721,
+      "step": 9096
+    },
+    {
+      "epoch": 0.2510062530825662,
+      "grad_norm": 0.002310951007530093,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 9097
+    },
+    {
+      "epoch": 0.2510338452836306,
+      "grad_norm": 0.0024983736220747232,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 9098
+    },
+    {
+      "epoch": 0.25106143748469495,
+      "grad_norm": 0.0018971741665154696,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 9099
+    },
+    {
+      "epoch": 0.2510890296857593,
+      "grad_norm": 0.002223706105723977,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 9100
+    },
+    {
+      "epoch": 0.2511166218868237,
+      "grad_norm": 0.0028201346285641193,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 9101
+    },
+    {
+      "epoch": 0.25114421408788806,
+      "grad_norm": 0.0036441651172935963,
+      "learning_rate": 0.001,
+      "loss": 0.3565,
+      "step": 9102
+    },
+    {
+      "epoch": 0.2511718062889524,
+      "grad_norm": 0.002532258862629533,
+      "learning_rate": 0.001,
+      "loss": 0.4385,
+      "step": 9103
+    },
+    {
+      "epoch": 0.2511993984900168,
+      "grad_norm": 0.0029783290810883045,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 9104
+    },
+    {
+      "epoch": 0.2512269906910812,
+      "grad_norm": 0.003864760510623455,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 9105
+    },
+    {
+      "epoch": 0.25125458289214553,
+      "grad_norm": 0.004010128788650036,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 9106
+    },
+    {
+      "epoch": 0.2512821750932099,
+      "grad_norm": 0.004032640252262354,
+      "learning_rate": 0.001,
+      "loss": 0.373,
+      "step": 9107
+    },
+    {
+      "epoch": 0.2513097672942743,
+      "grad_norm": 0.003456498496234417,
+      "learning_rate": 0.001,
+      "loss": 0.3834,
+      "step": 9108
+    },
+    {
+      "epoch": 0.25133735949533864,
+      "grad_norm": 0.0041527096182107925,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 9109
+    },
+    {
+      "epoch": 0.251364951696403,
+      "grad_norm": 0.0033256958704441786,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 9110
+    },
+    {
+      "epoch": 0.2513925438974674,
+      "grad_norm": 0.009706409648060799,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 9111
+    },
+    {
+      "epoch": 0.25142013609853175,
+      "grad_norm": 0.005892944522202015,
+      "learning_rate": 0.001,
+      "loss": 0.4361,
+      "step": 9112
+    },
+    {
+      "epoch": 0.2514477282995961,
+      "grad_norm": 0.0027680047787725925,
+      "learning_rate": 0.001,
+      "loss": 0.4273,
+      "step": 9113
+    },
+    {
+      "epoch": 0.2514753205006605,
+      "grad_norm": 0.0046938760206103325,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 9114
+    },
+    {
+      "epoch": 0.25150291270172487,
+      "grad_norm": 0.00436344463378191,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 9115
+    },
+    {
+      "epoch": 0.2515305049027892,
+      "grad_norm": 0.002917677629739046,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 9116
+    },
+    {
+      "epoch": 0.2515580971038536,
+      "grad_norm": 0.0041982559487223625,
+      "learning_rate": 0.001,
+      "loss": 0.3528,
+      "step": 9117
+    },
+    {
+      "epoch": 0.251585689304918,
+      "grad_norm": 0.003178415121510625,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 9118
+    },
+    {
+      "epoch": 0.25161328150598233,
+      "grad_norm": 0.00351191614754498,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 9119
+    },
+    {
+      "epoch": 0.2516408737070467,
+      "grad_norm": 0.002716810442507267,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 9120
+    },
+    {
+      "epoch": 0.2516684659081111,
+      "grad_norm": 0.00262506608851254,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 9121
+    },
+    {
+      "epoch": 0.25169605810917545,
+      "grad_norm": 0.005152091383934021,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 9122
+    },
+    {
+      "epoch": 0.2517236503102398,
+      "grad_norm": 0.003531603142619133,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 9123
+    },
+    {
+      "epoch": 0.2517512425113042,
+      "grad_norm": 0.0037356463726609945,
+      "learning_rate": 0.001,
+      "loss": 0.4145,
+      "step": 9124
+    },
+    {
+      "epoch": 0.25177883471236856,
+      "grad_norm": 0.02328849956393242,
+      "learning_rate": 0.001,
+      "loss": 0.4216,
+      "step": 9125
+    },
+    {
+      "epoch": 0.2518064269134329,
+      "grad_norm": 0.0030895329546183348,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 9126
+    },
+    {
+      "epoch": 0.25183401911449727,
+      "grad_norm": 0.002636708552017808,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 9127
+    },
+    {
+      "epoch": 0.2518616113155617,
+      "grad_norm": 0.0028558552730828524,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 9128
+    },
+    {
+      "epoch": 0.251889203516626,
+      "grad_norm": 0.003687650430947542,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 9129
+    },
+    {
+      "epoch": 0.2519167957176904,
+      "grad_norm": 0.002363695530220866,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 9130
+    },
+    {
+      "epoch": 0.2519443879187548,
+      "grad_norm": 0.0024190808180719614,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 9131
+    },
+    {
+      "epoch": 0.25197198011981914,
+      "grad_norm": 0.002729938132688403,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 9132
+    },
+    {
+      "epoch": 0.2519995723208835,
+      "grad_norm": 0.003316995920613408,
+      "learning_rate": 0.001,
+      "loss": 0.3628,
+      "step": 9133
+    },
+    {
+      "epoch": 0.25202716452194784,
+      "grad_norm": 0.002829955890774727,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 9134
+    },
+    {
+      "epoch": 0.25205475672301225,
+      "grad_norm": 0.0023533031344413757,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 9135
+    },
+    {
+      "epoch": 0.2520823489240766,
+      "grad_norm": 0.002993488684296608,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 9136
+    },
+    {
+      "epoch": 0.25210994112514096,
+      "grad_norm": 0.002113811671733856,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 9137
+    },
+    {
+      "epoch": 0.25213753332620537,
+      "grad_norm": 0.00470153009518981,
+      "learning_rate": 0.001,
+      "loss": 0.3624,
+      "step": 9138
+    },
+    {
+      "epoch": 0.2521651255272697,
+      "grad_norm": 0.0033217284362763166,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 9139
+    },
+    {
+      "epoch": 0.25219271772833407,
+      "grad_norm": 0.0025939196348190308,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 9140
+    },
+    {
+      "epoch": 0.2522203099293985,
+      "grad_norm": 0.0032115280628204346,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 9141
+    },
+    {
+      "epoch": 0.25224790213046283,
+      "grad_norm": 0.0023531445767730474,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 9142
+    },
+    {
+      "epoch": 0.2522754943315272,
+      "grad_norm": 0.0035788915120065212,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 9143
+    },
+    {
+      "epoch": 0.25230308653259154,
+      "grad_norm": 0.002639509504660964,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 9144
+    },
+    {
+      "epoch": 0.25233067873365594,
+      "grad_norm": 0.0037151076830923557,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 9145
+    },
+    {
+      "epoch": 0.2523582709347203,
+      "grad_norm": 0.002693182323127985,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 9146
+    },
+    {
+      "epoch": 0.25238586313578465,
+      "grad_norm": 0.00298108346760273,
+      "learning_rate": 0.001,
+      "loss": 0.4452,
+      "step": 9147
+    },
+    {
+      "epoch": 0.25241345533684906,
+      "grad_norm": 0.005775284022092819,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 9148
+    },
+    {
+      "epoch": 0.2524410475379134,
+      "grad_norm": 0.004025799687951803,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 9149
+    },
+    {
+      "epoch": 0.25246863973897776,
+      "grad_norm": 0.0029659196734428406,
+      "learning_rate": 0.001,
+      "loss": 0.4274,
+      "step": 9150
+    },
+    {
+      "epoch": 0.25249623194004217,
+      "grad_norm": 0.006109724286943674,
+      "learning_rate": 0.001,
+      "loss": 0.4322,
+      "step": 9151
+    },
+    {
+      "epoch": 0.2525238241411065,
+      "grad_norm": 0.004189697094261646,
+      "learning_rate": 0.001,
+      "loss": 0.3635,
+      "step": 9152
+    },
+    {
+      "epoch": 0.2525514163421709,
+      "grad_norm": 0.004084098618477583,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 9153
+    },
+    {
+      "epoch": 0.25257900854323523,
+      "grad_norm": 0.0035397496540099382,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 9154
+    },
+    {
+      "epoch": 0.25260660074429964,
+      "grad_norm": 0.002793220803141594,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 9155
+    },
+    {
+      "epoch": 0.252634192945364,
+      "grad_norm": 0.0030049553606659174,
+      "learning_rate": 0.001,
+      "loss": 0.4367,
+      "step": 9156
+    },
+    {
+      "epoch": 0.25266178514642834,
+      "grad_norm": 0.003896713722497225,
+      "learning_rate": 0.001,
+      "loss": 0.3657,
+      "step": 9157
+    },
+    {
+      "epoch": 0.25268937734749275,
+      "grad_norm": 0.0028666965663433075,
+      "learning_rate": 0.001,
+      "loss": 0.4524,
+      "step": 9158
+    },
+    {
+      "epoch": 0.2527169695485571,
+      "grad_norm": 0.0026138313114643097,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 9159
+    },
+    {
+      "epoch": 0.25274456174962145,
+      "grad_norm": 0.0023572929203510284,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 9160
+    },
+    {
+      "epoch": 0.25277215395068586,
+      "grad_norm": 0.003126518102362752,
+      "learning_rate": 0.001,
+      "loss": 0.3762,
+      "step": 9161
+    },
+    {
+      "epoch": 0.2527997461517502,
+      "grad_norm": 0.0024071959778666496,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 9162
+    },
+    {
+      "epoch": 0.25282733835281457,
+      "grad_norm": 0.0023764509242028,
+      "learning_rate": 0.001,
+      "loss": 0.4642,
+      "step": 9163
+    },
+    {
+      "epoch": 0.2528549305538789,
+      "grad_norm": 0.002558658830821514,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 9164
+    },
+    {
+      "epoch": 0.25288252275494333,
+      "grad_norm": 0.004064169712364674,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 9165
+    },
+    {
+      "epoch": 0.2529101149560077,
+      "grad_norm": 0.0038082520477473736,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 9166
+    },
+    {
+      "epoch": 0.25293770715707203,
+      "grad_norm": 0.002633225405588746,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 9167
+    },
+    {
+      "epoch": 0.25296529935813644,
+      "grad_norm": 0.002508282894268632,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 9168
+    },
+    {
+      "epoch": 0.2529928915592008,
+      "grad_norm": 0.0038721607998013496,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 9169
+    },
+    {
+      "epoch": 0.25302048376026515,
+      "grad_norm": 0.0033644256182014942,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 9170
+    },
+    {
+      "epoch": 0.25304807596132955,
+      "grad_norm": 0.002658225130289793,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 9171
+    },
+    {
+      "epoch": 0.2530756681623939,
+      "grad_norm": 0.0030783566180616617,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 9172
+    },
+    {
+      "epoch": 0.25310326036345826,
+      "grad_norm": 0.0024686295073479414,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 9173
+    },
+    {
+      "epoch": 0.2531308525645226,
+      "grad_norm": 0.0030403723940253258,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 9174
+    },
+    {
+      "epoch": 0.253158444765587,
+      "grad_norm": 0.0033355674240738153,
+      "learning_rate": 0.001,
+      "loss": 0.4257,
+      "step": 9175
+    },
+    {
+      "epoch": 0.2531860369666514,
+      "grad_norm": 0.0023434923496097326,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 9176
+    },
+    {
+      "epoch": 0.2532136291677157,
+      "grad_norm": 0.0037734145298600197,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 9177
+    },
+    {
+      "epoch": 0.25324122136878013,
+      "grad_norm": 0.0024306117556989193,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 9178
+    },
+    {
+      "epoch": 0.2532688135698445,
+      "grad_norm": 0.002472655149176717,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 9179
+    },
+    {
+      "epoch": 0.25329640577090884,
+      "grad_norm": 0.0033675595186650753,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 9180
+    },
+    {
+      "epoch": 0.25332399797197325,
+      "grad_norm": 0.00425712438300252,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 9181
+    },
+    {
+      "epoch": 0.2533515901730376,
+      "grad_norm": 0.002620971528813243,
+      "learning_rate": 0.001,
+      "loss": 0.4149,
+      "step": 9182
+    },
+    {
+      "epoch": 0.25337918237410195,
+      "grad_norm": 0.002496513072401285,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 9183
+    },
+    {
+      "epoch": 0.2534067745751663,
+      "grad_norm": 0.0037882968317717314,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 9184
+    },
+    {
+      "epoch": 0.2534343667762307,
+      "grad_norm": 0.0028284622821956873,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 9185
+    },
+    {
+      "epoch": 0.25346195897729507,
+      "grad_norm": 0.0031743019353598356,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 9186
+    },
+    {
+      "epoch": 0.2534895511783594,
+      "grad_norm": 0.003066749544814229,
+      "learning_rate": 0.001,
+      "loss": 0.371,
+      "step": 9187
+    },
+    {
+      "epoch": 0.2535171433794238,
+      "grad_norm": 0.002977808238938451,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 9188
+    },
+    {
+      "epoch": 0.2535447355804882,
+      "grad_norm": 0.0039002352859824896,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 9189
+    },
+    {
+      "epoch": 0.25357232778155253,
+      "grad_norm": 0.003922617062926292,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 9190
+    },
+    {
+      "epoch": 0.25359991998261694,
+      "grad_norm": 0.004719828721135855,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 9191
+    },
+    {
+      "epoch": 0.2536275121836813,
+      "grad_norm": 0.0065970043651759624,
+      "learning_rate": 0.001,
+      "loss": 0.4595,
+      "step": 9192
+    },
+    {
+      "epoch": 0.25365510438474564,
+      "grad_norm": 0.002771706786006689,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 9193
+    },
+    {
+      "epoch": 0.25368269658581,
+      "grad_norm": 0.002968127839267254,
+      "learning_rate": 0.001,
+      "loss": 0.4369,
+      "step": 9194
+    },
+    {
+      "epoch": 0.2537102887868744,
+      "grad_norm": 0.002558299573138356,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 9195
+    },
+    {
+      "epoch": 0.25373788098793876,
+      "grad_norm": 0.003306181402876973,
+      "learning_rate": 0.001,
+      "loss": 0.3655,
+      "step": 9196
+    },
+    {
+      "epoch": 0.2537654731890031,
+      "grad_norm": 0.005295965354889631,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 9197
+    },
+    {
+      "epoch": 0.2537930653900675,
+      "grad_norm": 0.00258263130672276,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 9198
+    },
+    {
+      "epoch": 0.25382065759113187,
+      "grad_norm": 0.003539256053045392,
+      "learning_rate": 0.001,
+      "loss": 0.427,
+      "step": 9199
+    },
+    {
+      "epoch": 0.2538482497921962,
+      "grad_norm": 0.004078635014593601,
+      "learning_rate": 0.001,
+      "loss": 0.417,
+      "step": 9200
+    },
+    {
+      "epoch": 0.25387584199326063,
+      "grad_norm": 0.003579481039196253,
+      "learning_rate": 0.001,
+      "loss": 0.3628,
+      "step": 9201
+    },
+    {
+      "epoch": 0.253903434194325,
+      "grad_norm": 0.011261108331382275,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 9202
+    },
+    {
+      "epoch": 0.25393102639538934,
+      "grad_norm": 0.005823037121444941,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 9203
+    },
+    {
+      "epoch": 0.2539586185964537,
+      "grad_norm": 0.0032197630498558283,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 9204
+    },
+    {
+      "epoch": 0.2539862107975181,
+      "grad_norm": 0.007648573722690344,
+      "learning_rate": 0.001,
+      "loss": 0.4349,
+      "step": 9205
+    },
+    {
+      "epoch": 0.25401380299858245,
+      "grad_norm": 0.0039948225021362305,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 9206
+    },
+    {
+      "epoch": 0.2540413951996468,
+      "grad_norm": 0.008907606825232506,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 9207
+    },
+    {
+      "epoch": 0.2540689874007112,
+      "grad_norm": 0.0032174009829759598,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 9208
+    },
+    {
+      "epoch": 0.25409657960177556,
+      "grad_norm": 0.0036075576208531857,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 9209
+    },
+    {
+      "epoch": 0.2541241718028399,
+      "grad_norm": 0.002537550637498498,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 9210
+    },
+    {
+      "epoch": 0.2541517640039043,
+      "grad_norm": 0.004352409392595291,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 9211
+    },
+    {
+      "epoch": 0.2541793562049687,
+      "grad_norm": 0.004635980818420649,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 9212
+    },
+    {
+      "epoch": 0.25420694840603303,
+      "grad_norm": 0.0028103957884013653,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 9213
+    },
+    {
+      "epoch": 0.2542345406070974,
+      "grad_norm": 0.0023787037935107946,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 9214
+    },
+    {
+      "epoch": 0.2542621328081618,
+      "grad_norm": 0.002404349623247981,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 9215
+    },
+    {
+      "epoch": 0.25428972500922614,
+      "grad_norm": 0.0025460943579673767,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 9216
+    },
+    {
+      "epoch": 0.2543173172102905,
+      "grad_norm": 0.003394089639186859,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 9217
+    },
+    {
+      "epoch": 0.2543449094113549,
+      "grad_norm": 0.003400506917387247,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 9218
+    },
+    {
+      "epoch": 0.25437250161241926,
+      "grad_norm": 0.007282086182385683,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 9219
+    },
+    {
+      "epoch": 0.2544000938134836,
+      "grad_norm": 0.0025983904488384724,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 9220
+    },
+    {
+      "epoch": 0.254427686014548,
+      "grad_norm": 0.0024682951625436544,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 9221
+    },
+    {
+      "epoch": 0.25445527821561237,
+      "grad_norm": 0.002498073736205697,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 9222
+    },
+    {
+      "epoch": 0.2544828704166767,
+      "grad_norm": 0.004814106039702892,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 9223
+    },
+    {
+      "epoch": 0.2545104626177411,
+      "grad_norm": 0.004485031124204397,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 9224
+    },
+    {
+      "epoch": 0.2545380548188055,
+      "grad_norm": 0.002533281221985817,
+      "learning_rate": 0.001,
+      "loss": 0.4129,
+      "step": 9225
+    },
+    {
+      "epoch": 0.25456564701986983,
+      "grad_norm": 0.0037844269536435604,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 9226
+    },
+    {
+      "epoch": 0.2545932392209342,
+      "grad_norm": 0.0033858432434499264,
+      "learning_rate": 0.001,
+      "loss": 0.3821,
+      "step": 9227
+    },
+    {
+      "epoch": 0.2546208314219986,
+      "grad_norm": 0.006454948335886002,
+      "learning_rate": 0.001,
+      "loss": 0.3779,
+      "step": 9228
+    },
+    {
+      "epoch": 0.25464842362306295,
+      "grad_norm": 0.0032489451114088297,
+      "learning_rate": 0.001,
+      "loss": 0.4399,
+      "step": 9229
+    },
+    {
+      "epoch": 0.2546760158241273,
+      "grad_norm": 0.0027783990371972322,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 9230
+    },
+    {
+      "epoch": 0.25470360802519165,
+      "grad_norm": 0.0032559032551944256,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 9231
+    },
+    {
+      "epoch": 0.25473120022625606,
+      "grad_norm": 0.006878309417515993,
+      "learning_rate": 0.001,
+      "loss": 0.4187,
+      "step": 9232
+    },
+    {
+      "epoch": 0.2547587924273204,
+      "grad_norm": 0.003954887855798006,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 9233
+    },
+    {
+      "epoch": 0.25478638462838477,
+      "grad_norm": 0.0032106926664710045,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 9234
+    },
+    {
+      "epoch": 0.2548139768294492,
+      "grad_norm": 0.002547104610130191,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 9235
+    },
+    {
+      "epoch": 0.2548415690305135,
+      "grad_norm": 0.00274961581453681,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 9236
+    },
+    {
+      "epoch": 0.2548691612315779,
+      "grad_norm": 0.002504730597138405,
+      "learning_rate": 0.001,
+      "loss": 0.3737,
+      "step": 9237
+    },
+    {
+      "epoch": 0.2548967534326423,
+      "grad_norm": 0.00348614901304245,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 9238
+    },
+    {
+      "epoch": 0.25492434563370664,
+      "grad_norm": 0.0030160502064973116,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 9239
+    },
+    {
+      "epoch": 0.254951937834771,
+      "grad_norm": 0.0032628930639475584,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 9240
+    },
+    {
+      "epoch": 0.25497953003583534,
+      "grad_norm": 0.0022108100820332766,
+      "learning_rate": 0.001,
+      "loss": 0.4129,
+      "step": 9241
+    },
+    {
+      "epoch": 0.25500712223689975,
+      "grad_norm": 0.003723006695508957,
+      "learning_rate": 0.001,
+      "loss": 0.4371,
+      "step": 9242
+    },
+    {
+      "epoch": 0.2550347144379641,
+      "grad_norm": 0.00470739183947444,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 9243
+    },
+    {
+      "epoch": 0.25506230663902846,
+      "grad_norm": 0.00285819242708385,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 9244
+    },
+    {
+      "epoch": 0.25508989884009287,
+      "grad_norm": 0.0027255092281848192,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 9245
+    },
+    {
+      "epoch": 0.2551174910411572,
+      "grad_norm": 0.0023286372888833284,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 9246
+    },
+    {
+      "epoch": 0.25514508324222157,
+      "grad_norm": 0.006774048320949078,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 9247
+    },
+    {
+      "epoch": 0.255172675443286,
+      "grad_norm": 0.003444313770160079,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 9248
+    },
+    {
+      "epoch": 0.25520026764435033,
+      "grad_norm": 0.003187762573361397,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 9249
+    },
+    {
+      "epoch": 0.2552278598454147,
+      "grad_norm": 0.00499644735828042,
+      "learning_rate": 0.001,
+      "loss": 0.4378,
+      "step": 9250
+    },
+    {
+      "epoch": 0.25525545204647904,
+      "grad_norm": 0.00488440552726388,
+      "learning_rate": 0.001,
+      "loss": 0.4275,
+      "step": 9251
+    },
+    {
+      "epoch": 0.25528304424754344,
+      "grad_norm": 0.0026155610103160143,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 9252
+    },
+    {
+      "epoch": 0.2553106364486078,
+      "grad_norm": 0.0035269884392619133,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 9253
+    },
+    {
+      "epoch": 0.25533822864967215,
+      "grad_norm": 0.005496086087077856,
+      "learning_rate": 0.001,
+      "loss": 0.3742,
+      "step": 9254
+    },
+    {
+      "epoch": 0.25536582085073656,
+      "grad_norm": 0.003939601127058268,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 9255
+    },
+    {
+      "epoch": 0.2553934130518009,
+      "grad_norm": 0.0052981991320848465,
+      "learning_rate": 0.001,
+      "loss": 0.3673,
+      "step": 9256
+    },
+    {
+      "epoch": 0.25542100525286526,
+      "grad_norm": 0.0025523051153868437,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 9257
+    },
+    {
+      "epoch": 0.25544859745392967,
+      "grad_norm": 0.004392458591610193,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 9258
+    },
+    {
+      "epoch": 0.255476189654994,
+      "grad_norm": 0.005645066034048796,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 9259
+    },
+    {
+      "epoch": 0.2555037818560584,
+      "grad_norm": 0.00333485659211874,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 9260
+    },
+    {
+      "epoch": 0.25553137405712273,
+      "grad_norm": 0.003476017387583852,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 9261
+    },
+    {
+      "epoch": 0.25555896625818714,
+      "grad_norm": 0.007261955179274082,
+      "learning_rate": 0.001,
+      "loss": 0.4203,
+      "step": 9262
+    },
+    {
+      "epoch": 0.2555865584592515,
+      "grad_norm": 0.004596728831529617,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 9263
+    },
+    {
+      "epoch": 0.25561415066031584,
+      "grad_norm": 0.0033560562878847122,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 9264
+    },
+    {
+      "epoch": 0.25564174286138025,
+      "grad_norm": 0.003836827352643013,
+      "learning_rate": 0.001,
+      "loss": 0.3753,
+      "step": 9265
+    },
+    {
+      "epoch": 0.2556693350624446,
+      "grad_norm": 0.0031401992309838533,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 9266
+    },
+    {
+      "epoch": 0.25569692726350896,
+      "grad_norm": 0.003163114422932267,
+      "learning_rate": 0.001,
+      "loss": 0.3708,
+      "step": 9267
+    },
+    {
+      "epoch": 0.25572451946457336,
+      "grad_norm": 0.002614476252347231,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 9268
+    },
+    {
+      "epoch": 0.2557521116656377,
+      "grad_norm": 0.0029321182519197464,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 9269
+    },
+    {
+      "epoch": 0.25577970386670207,
+      "grad_norm": 0.0028678993694484234,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 9270
+    },
+    {
+      "epoch": 0.2558072960677664,
+      "grad_norm": 0.0034806549083441496,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 9271
+    },
+    {
+      "epoch": 0.25583488826883083,
+      "grad_norm": 0.0030649881809949875,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 9272
+    },
+    {
+      "epoch": 0.2558624804698952,
+      "grad_norm": 0.004192110616713762,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 9273
+    },
+    {
+      "epoch": 0.25589007267095953,
+      "grad_norm": 0.0024996623396873474,
+      "learning_rate": 0.001,
+      "loss": 0.3587,
+      "step": 9274
+    },
+    {
+      "epoch": 0.25591766487202394,
+      "grad_norm": 0.002356313867494464,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 9275
+    },
+    {
+      "epoch": 0.2559452570730883,
+      "grad_norm": 0.0032441243529319763,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 9276
+    },
+    {
+      "epoch": 0.25597284927415265,
+      "grad_norm": 0.003050130559131503,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 9277
+    },
+    {
+      "epoch": 0.25600044147521706,
+      "grad_norm": 0.0028984618838876486,
+      "learning_rate": 0.001,
+      "loss": 0.4252,
+      "step": 9278
+    },
+    {
+      "epoch": 0.2560280336762814,
+      "grad_norm": 0.0024825683794915676,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 9279
+    },
+    {
+      "epoch": 0.25605562587734576,
+      "grad_norm": 0.007617509458214045,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 9280
+    },
+    {
+      "epoch": 0.2560832180784101,
+      "grad_norm": 0.002734134206548333,
+      "learning_rate": 0.001,
+      "loss": 0.411,
+      "step": 9281
+    },
+    {
+      "epoch": 0.2561108102794745,
+      "grad_norm": 0.0032497511710971594,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 9282
+    },
+    {
+      "epoch": 0.2561384024805389,
+      "grad_norm": 0.0038412590511143208,
+      "learning_rate": 0.001,
+      "loss": 0.4383,
+      "step": 9283
+    },
+    {
+      "epoch": 0.2561659946816032,
+      "grad_norm": 0.002717244438827038,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 9284
+    },
+    {
+      "epoch": 0.25619358688266763,
+      "grad_norm": 0.002895118435844779,
+      "learning_rate": 0.001,
+      "loss": 0.4379,
+      "step": 9285
+    },
+    {
+      "epoch": 0.256221179083732,
+      "grad_norm": 0.0027081493753939867,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 9286
+    },
+    {
+      "epoch": 0.25624877128479634,
+      "grad_norm": 0.002953689079731703,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 9287
+    },
+    {
+      "epoch": 0.25627636348586075,
+      "grad_norm": 0.0018930652877315879,
+      "learning_rate": 0.001,
+      "loss": 0.4461,
+      "step": 9288
+    },
+    {
+      "epoch": 0.2563039556869251,
+      "grad_norm": 0.0027619446627795696,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 9289
+    },
+    {
+      "epoch": 0.25633154788798945,
+      "grad_norm": 0.0028445671778172255,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 9290
+    },
+    {
+      "epoch": 0.2563591400890538,
+      "grad_norm": 0.004813406616449356,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 9291
+    },
+    {
+      "epoch": 0.2563867322901182,
+      "grad_norm": 0.0027178998570889235,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 9292
+    },
+    {
+      "epoch": 0.25641432449118257,
+      "grad_norm": 0.0027016105595976114,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 9293
+    },
+    {
+      "epoch": 0.2564419166922469,
+      "grad_norm": 0.0038917434867471457,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 9294
+    },
+    {
+      "epoch": 0.2564695088933113,
+      "grad_norm": 0.004160382319241762,
+      "learning_rate": 0.001,
+      "loss": 0.3978,
+      "step": 9295
+    },
+    {
+      "epoch": 0.2564971010943757,
+      "grad_norm": 0.004111155401915312,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 9296
+    },
+    {
+      "epoch": 0.25652469329544003,
+      "grad_norm": 0.00411924347281456,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 9297
+    },
+    {
+      "epoch": 0.25655228549650444,
+      "grad_norm": 0.0026549468748271465,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 9298
+    },
+    {
+      "epoch": 0.2565798776975688,
+      "grad_norm": 0.004998568445444107,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 9299
+    },
+    {
+      "epoch": 0.25660746989863314,
+      "grad_norm": 0.002522986149415374,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 9300
+    },
+    {
+      "epoch": 0.2566350620996975,
+      "grad_norm": 0.0028745641466230154,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 9301
+    },
+    {
+      "epoch": 0.2566626543007619,
+      "grad_norm": 0.0056059998460114,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 9302
+    },
+    {
+      "epoch": 0.25669024650182626,
+      "grad_norm": 0.002385946689173579,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 9303
+    },
+    {
+      "epoch": 0.2567178387028906,
+      "grad_norm": 0.0057910652831196785,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 9304
+    },
+    {
+      "epoch": 0.256745430903955,
+      "grad_norm": 0.003377356566488743,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 9305
+    },
+    {
+      "epoch": 0.25677302310501937,
+      "grad_norm": 0.002434053458273411,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 9306
+    },
+    {
+      "epoch": 0.2568006153060837,
+      "grad_norm": 0.0029004281386733055,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 9307
+    },
+    {
+      "epoch": 0.25682820750714813,
+      "grad_norm": 0.0032230631913989782,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 9308
+    },
+    {
+      "epoch": 0.2568557997082125,
+      "grad_norm": 0.00245184195227921,
+      "learning_rate": 0.001,
+      "loss": 0.4297,
+      "step": 9309
+    },
+    {
+      "epoch": 0.25688339190927684,
+      "grad_norm": 0.0030043041333556175,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 9310
+    },
+    {
+      "epoch": 0.2569109841103412,
+      "grad_norm": 0.007739334367215633,
+      "learning_rate": 0.001,
+      "loss": 0.3723,
+      "step": 9311
+    },
+    {
+      "epoch": 0.2569385763114056,
+      "grad_norm": 0.002734127687290311,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 9312
+    },
+    {
+      "epoch": 0.25696616851246995,
+      "grad_norm": 0.004879895132035017,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 9313
+    },
+    {
+      "epoch": 0.2569937607135343,
+      "grad_norm": 0.002906453562900424,
+      "learning_rate": 0.001,
+      "loss": 0.4332,
+      "step": 9314
+    },
+    {
+      "epoch": 0.2570213529145987,
+      "grad_norm": 0.004463196266442537,
+      "learning_rate": 0.001,
+      "loss": 0.3455,
+      "step": 9315
+    },
+    {
+      "epoch": 0.25704894511566306,
+      "grad_norm": 0.004677020013332367,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 9316
+    },
+    {
+      "epoch": 0.2570765373167274,
+      "grad_norm": 0.006812858395278454,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 9317
+    },
+    {
+      "epoch": 0.25710412951779177,
+      "grad_norm": 0.0026047630235552788,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 9318
+    },
+    {
+      "epoch": 0.2571317217188562,
+      "grad_norm": 0.004368063993752003,
+      "learning_rate": 0.001,
+      "loss": 0.3617,
+      "step": 9319
+    },
+    {
+      "epoch": 0.25715931391992053,
+      "grad_norm": 0.0024550901725888252,
+      "learning_rate": 0.001,
+      "loss": 0.4326,
+      "step": 9320
+    },
+    {
+      "epoch": 0.2571869061209849,
+      "grad_norm": 0.003168846946209669,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 9321
+    },
+    {
+      "epoch": 0.2572144983220493,
+      "grad_norm": 0.004993709735572338,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 9322
+    },
+    {
+      "epoch": 0.25724209052311364,
+      "grad_norm": 0.0027966087218374014,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 9323
+    },
+    {
+      "epoch": 0.257269682724178,
+      "grad_norm": 0.0036490256898105145,
+      "learning_rate": 0.001,
+      "loss": 0.3662,
+      "step": 9324
+    },
+    {
+      "epoch": 0.2572972749252424,
+      "grad_norm": 0.006444889586418867,
+      "learning_rate": 0.001,
+      "loss": 0.3677,
+      "step": 9325
+    },
+    {
+      "epoch": 0.25732486712630676,
+      "grad_norm": 0.005108532030135393,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 9326
+    },
+    {
+      "epoch": 0.2573524593273711,
+      "grad_norm": 0.004126682877540588,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 9327
+    },
+    {
+      "epoch": 0.25738005152843546,
+      "grad_norm": 0.002818217733874917,
+      "learning_rate": 0.001,
+      "loss": 0.4286,
+      "step": 9328
+    },
+    {
+      "epoch": 0.25740764372949987,
+      "grad_norm": 0.0042424676939845085,
+      "learning_rate": 0.001,
+      "loss": 0.4339,
+      "step": 9329
+    },
+    {
+      "epoch": 0.2574352359305642,
+      "grad_norm": 0.003638759721070528,
+      "learning_rate": 0.001,
+      "loss": 0.4323,
+      "step": 9330
+    },
+    {
+      "epoch": 0.2574628281316286,
+      "grad_norm": 0.003619089489802718,
+      "learning_rate": 0.001,
+      "loss": 0.366,
+      "step": 9331
+    },
+    {
+      "epoch": 0.257490420332693,
+      "grad_norm": 0.0029553293716162443,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 9332
+    },
+    {
+      "epoch": 0.25751801253375733,
+      "grad_norm": 0.003728683805093169,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 9333
+    },
+    {
+      "epoch": 0.2575456047348217,
+      "grad_norm": 0.006002207286655903,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 9334
+    },
+    {
+      "epoch": 0.2575731969358861,
+      "grad_norm": 0.005744462367147207,
+      "learning_rate": 0.001,
+      "loss": 0.3832,
+      "step": 9335
+    },
+    {
+      "epoch": 0.25760078913695045,
+      "grad_norm": 0.006803566124290228,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 9336
+    },
+    {
+      "epoch": 0.2576283813380148,
+      "grad_norm": 0.0028065780643373728,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 9337
+    },
+    {
+      "epoch": 0.25765597353907915,
+      "grad_norm": 0.003158635227009654,
+      "learning_rate": 0.001,
+      "loss": 0.4321,
+      "step": 9338
+    },
+    {
+      "epoch": 0.25768356574014356,
+      "grad_norm": 0.0043748971074819565,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 9339
+    },
+    {
+      "epoch": 0.2577111579412079,
+      "grad_norm": 0.0038016666658222675,
+      "learning_rate": 0.001,
+      "loss": 0.4322,
+      "step": 9340
+    },
+    {
+      "epoch": 0.25773875014227227,
+      "grad_norm": 0.005203484557569027,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 9341
+    },
+    {
+      "epoch": 0.2577663423433367,
+      "grad_norm": 0.0031318182591348886,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 9342
+    },
+    {
+      "epoch": 0.257793934544401,
+      "grad_norm": 0.0021927165798842907,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 9343
+    },
+    {
+      "epoch": 0.2578215267454654,
+      "grad_norm": 0.008451412431895733,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 9344
+    },
+    {
+      "epoch": 0.2578491189465298,
+      "grad_norm": 0.012602372094988823,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 9345
+    },
+    {
+      "epoch": 0.25787671114759414,
+      "grad_norm": 0.004658313933759928,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 9346
+    },
+    {
+      "epoch": 0.2579043033486585,
+      "grad_norm": 0.029415573924779892,
+      "learning_rate": 0.001,
+      "loss": 0.3801,
+      "step": 9347
+    },
+    {
+      "epoch": 0.25793189554972284,
+      "grad_norm": 0.005347141530364752,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 9348
+    },
+    {
+      "epoch": 0.25795948775078725,
+      "grad_norm": 0.008220274932682514,
+      "learning_rate": 0.001,
+      "loss": 0.3721,
+      "step": 9349
+    },
+    {
+      "epoch": 0.2579870799518516,
+      "grad_norm": 0.0034864528570324183,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 9350
+    },
+    {
+      "epoch": 0.25801467215291596,
+      "grad_norm": 0.0023793810978531837,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 9351
+    },
+    {
+      "epoch": 0.25804226435398037,
+      "grad_norm": 0.00437847338616848,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 9352
+    },
+    {
+      "epoch": 0.2580698565550447,
+      "grad_norm": 0.011434728279709816,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 9353
+    },
+    {
+      "epoch": 0.25809744875610907,
+      "grad_norm": 0.002974115777760744,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 9354
+    },
+    {
+      "epoch": 0.2581250409571735,
+      "grad_norm": 0.0029822078067809343,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 9355
+    },
+    {
+      "epoch": 0.25815263315823783,
+      "grad_norm": 0.004817003384232521,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 9356
+    },
+    {
+      "epoch": 0.2581802253593022,
+      "grad_norm": 0.005547203589230776,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 9357
+    },
+    {
+      "epoch": 0.25820781756036654,
+      "grad_norm": 0.002377490047365427,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 9358
+    },
+    {
+      "epoch": 0.25823540976143095,
+      "grad_norm": 0.0028770265635102987,
+      "learning_rate": 0.001,
+      "loss": 0.376,
+      "step": 9359
+    },
+    {
+      "epoch": 0.2582630019624953,
+      "grad_norm": 0.003406745847314596,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 9360
+    },
+    {
+      "epoch": 0.25829059416355965,
+      "grad_norm": 0.002956562442705035,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 9361
+    },
+    {
+      "epoch": 0.25831818636462406,
+      "grad_norm": 0.0038173554930835962,
+      "learning_rate": 0.001,
+      "loss": 0.4468,
+      "step": 9362
+    },
+    {
+      "epoch": 0.2583457785656884,
+      "grad_norm": 0.003059924114495516,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 9363
+    },
+    {
+      "epoch": 0.25837337076675276,
+      "grad_norm": 0.002305653179064393,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 9364
+    },
+    {
+      "epoch": 0.25840096296781717,
+      "grad_norm": 0.00254283519461751,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 9365
+    },
+    {
+      "epoch": 0.2584285551688815,
+      "grad_norm": 0.00530956732109189,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 9366
+    },
+    {
+      "epoch": 0.2584561473699459,
+      "grad_norm": 0.0036723476368933916,
+      "learning_rate": 0.001,
+      "loss": 0.3638,
+      "step": 9367
+    },
+    {
+      "epoch": 0.25848373957101023,
+      "grad_norm": 0.008182425051927567,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 9368
+    },
+    {
+      "epoch": 0.25851133177207464,
+      "grad_norm": 0.01312658004462719,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 9369
+    },
+    {
+      "epoch": 0.258538923973139,
+      "grad_norm": 0.0029170040506869555,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 9370
+    },
+    {
+      "epoch": 0.25856651617420334,
+      "grad_norm": 0.00417022779583931,
+      "learning_rate": 0.001,
+      "loss": 0.3712,
+      "step": 9371
+    },
+    {
+      "epoch": 0.25859410837526775,
+      "grad_norm": 0.007455759681761265,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 9372
+    },
+    {
+      "epoch": 0.2586217005763321,
+      "grad_norm": 0.0027683484368026257,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 9373
+    },
+    {
+      "epoch": 0.25864929277739646,
+      "grad_norm": 0.005918456707149744,
+      "learning_rate": 0.001,
+      "loss": 0.3668,
+      "step": 9374
+    },
+    {
+      "epoch": 0.25867688497846086,
+      "grad_norm": 0.002821930916979909,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 9375
+    },
+    {
+      "epoch": 0.2587044771795252,
+      "grad_norm": 0.003820334793999791,
+      "learning_rate": 0.001,
+      "loss": 0.4133,
+      "step": 9376
+    },
+    {
+      "epoch": 0.25873206938058957,
+      "grad_norm": 0.003256513038650155,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 9377
+    },
+    {
+      "epoch": 0.2587596615816539,
+      "grad_norm": 0.010328397154808044,
+      "learning_rate": 0.001,
+      "loss": 0.3832,
+      "step": 9378
+    },
+    {
+      "epoch": 0.25878725378271833,
+      "grad_norm": 0.002478259615600109,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 9379
+    },
+    {
+      "epoch": 0.2588148459837827,
+      "grad_norm": 0.0034857571590691805,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 9380
+    },
+    {
+      "epoch": 0.25884243818484703,
+      "grad_norm": 0.00216446490958333,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 9381
+    },
+    {
+      "epoch": 0.25887003038591144,
+      "grad_norm": 0.0028666919097304344,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 9382
+    },
+    {
+      "epoch": 0.2588976225869758,
+      "grad_norm": 0.002316189929842949,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 9383
+    },
+    {
+      "epoch": 0.25892521478804015,
+      "grad_norm": 0.0033454406075179577,
+      "learning_rate": 0.001,
+      "loss": 0.3701,
+      "step": 9384
+    },
+    {
+      "epoch": 0.25895280698910456,
+      "grad_norm": 0.0029785428196191788,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 9385
+    },
+    {
+      "epoch": 0.2589803991901689,
+      "grad_norm": 0.0043611531145870686,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 9386
+    },
+    {
+      "epoch": 0.25900799139123326,
+      "grad_norm": 0.004450157284736633,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 9387
+    },
+    {
+      "epoch": 0.2590355835922976,
+      "grad_norm": 0.003939083311706781,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 9388
+    },
+    {
+      "epoch": 0.259063175793362,
+      "grad_norm": 0.0035476302728056908,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 9389
+    },
+    {
+      "epoch": 0.2590907679944264,
+      "grad_norm": 0.005521733313798904,
+      "learning_rate": 0.001,
+      "loss": 0.4249,
+      "step": 9390
+    },
+    {
+      "epoch": 0.2591183601954907,
+      "grad_norm": 0.0055731479078531265,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 9391
+    },
+    {
+      "epoch": 0.25914595239655513,
+      "grad_norm": 0.003587324172258377,
+      "learning_rate": 0.001,
+      "loss": 0.374,
+      "step": 9392
+    },
+    {
+      "epoch": 0.2591735445976195,
+      "grad_norm": 0.018497616052627563,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 9393
+    },
+    {
+      "epoch": 0.25920113679868384,
+      "grad_norm": 0.005223001353442669,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 9394
+    },
+    {
+      "epoch": 0.25922872899974825,
+      "grad_norm": 0.0024736267514526844,
+      "learning_rate": 0.001,
+      "loss": 0.3715,
+      "step": 9395
+    },
+    {
+      "epoch": 0.2592563212008126,
+      "grad_norm": 0.0023903397377580404,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 9396
+    },
+    {
+      "epoch": 0.25928391340187695,
+      "grad_norm": 0.0038790139369666576,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 9397
+    },
+    {
+      "epoch": 0.2593115056029413,
+      "grad_norm": 0.0027185981161892414,
+      "learning_rate": 0.001,
+      "loss": 0.3727,
+      "step": 9398
+    },
+    {
+      "epoch": 0.2593390978040057,
+      "grad_norm": 0.007733003702014685,
+      "learning_rate": 0.001,
+      "loss": 0.368,
+      "step": 9399
+    },
+    {
+      "epoch": 0.25936669000507007,
+      "grad_norm": 0.003809612710028887,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 9400
+    },
+    {
+      "epoch": 0.2593942822061344,
+      "grad_norm": 0.004529821220785379,
+      "learning_rate": 0.001,
+      "loss": 0.4428,
+      "step": 9401
+    },
+    {
+      "epoch": 0.2594218744071988,
+      "grad_norm": 0.0025704330764710903,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 9402
+    },
+    {
+      "epoch": 0.2594494666082632,
+      "grad_norm": 0.0027423694264143705,
+      "learning_rate": 0.001,
+      "loss": 0.4307,
+      "step": 9403
+    },
+    {
+      "epoch": 0.25947705880932753,
+      "grad_norm": 0.0036277722101658583,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 9404
+    },
+    {
+      "epoch": 0.25950465101039194,
+      "grad_norm": 0.003103514201939106,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 9405
+    },
+    {
+      "epoch": 0.2595322432114563,
+      "grad_norm": 0.005228511989116669,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 9406
+    },
+    {
+      "epoch": 0.25955983541252065,
+      "grad_norm": 0.0027565350756049156,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 9407
+    },
+    {
+      "epoch": 0.259587427613585,
+      "grad_norm": 0.005467361304908991,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 9408
+    },
+    {
+      "epoch": 0.2596150198146494,
+      "grad_norm": 0.002642360283061862,
+      "learning_rate": 0.001,
+      "loss": 0.3789,
+      "step": 9409
+    },
+    {
+      "epoch": 0.25964261201571376,
+      "grad_norm": 0.00389938335865736,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 9410
+    },
+    {
+      "epoch": 0.2596702042167781,
+      "grad_norm": 0.0047807167284190655,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 9411
+    },
+    {
+      "epoch": 0.2596977964178425,
+      "grad_norm": 0.017033278942108154,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 9412
+    },
+    {
+      "epoch": 0.25972538861890687,
+      "grad_norm": 0.014154274947941303,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 9413
+    },
+    {
+      "epoch": 0.2597529808199712,
+      "grad_norm": 0.009238173253834248,
+      "learning_rate": 0.001,
+      "loss": 0.3483,
+      "step": 9414
+    },
+    {
+      "epoch": 0.2597805730210356,
+      "grad_norm": 0.008399531245231628,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 9415
+    },
+    {
+      "epoch": 0.2598081652221,
+      "grad_norm": 0.002651994815096259,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 9416
+    },
+    {
+      "epoch": 0.25983575742316434,
+      "grad_norm": 0.003266576211899519,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 9417
+    },
+    {
+      "epoch": 0.2598633496242287,
+      "grad_norm": 0.0043223886750638485,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 9418
+    },
+    {
+      "epoch": 0.2598909418252931,
+      "grad_norm": 0.004282352980226278,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 9419
+    },
+    {
+      "epoch": 0.25991853402635745,
+      "grad_norm": 0.006613946054130793,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 9420
+    },
+    {
+      "epoch": 0.2599461262274218,
+      "grad_norm": 0.0021815176587551832,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 9421
+    },
+    {
+      "epoch": 0.2599737184284862,
+      "grad_norm": 0.0028681547846645117,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 9422
+    },
+    {
+      "epoch": 0.26000131062955056,
+      "grad_norm": 0.0035515769850462675,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 9423
+    },
+    {
+      "epoch": 0.2600289028306149,
+      "grad_norm": 0.004106239881366491,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 9424
+    },
+    {
+      "epoch": 0.26005649503167927,
+      "grad_norm": 0.005105991847813129,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 9425
+    },
+    {
+      "epoch": 0.2600840872327437,
+      "grad_norm": 0.004727422725409269,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 9426
+    },
+    {
+      "epoch": 0.26011167943380803,
+      "grad_norm": 0.005350995343178511,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 9427
+    },
+    {
+      "epoch": 0.2601392716348724,
+      "grad_norm": 0.005172072909772396,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 9428
+    },
+    {
+      "epoch": 0.2601668638359368,
+      "grad_norm": 0.004639607388526201,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 9429
+    },
+    {
+      "epoch": 0.26019445603700114,
+      "grad_norm": 0.004586922936141491,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 9430
+    },
+    {
+      "epoch": 0.2602220482380655,
+      "grad_norm": 0.004610531497746706,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 9431
+    },
+    {
+      "epoch": 0.2602496404391299,
+      "grad_norm": 0.0026218774728477,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 9432
+    },
+    {
+      "epoch": 0.26027723264019426,
+      "grad_norm": 0.006149280350655317,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 9433
+    },
+    {
+      "epoch": 0.2603048248412586,
+      "grad_norm": 0.0048711239360272884,
+      "learning_rate": 0.001,
+      "loss": 0.4503,
+      "step": 9434
+    },
+    {
+      "epoch": 0.26033241704232296,
+      "grad_norm": 0.03840470314025879,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 9435
+    },
+    {
+      "epoch": 0.26036000924338737,
+      "grad_norm": 0.0037160839419811964,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 9436
+    },
+    {
+      "epoch": 0.2603876014444517,
+      "grad_norm": 0.0032652821391820908,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 9437
+    },
+    {
+      "epoch": 0.2604151936455161,
+      "grad_norm": 0.007966517470777035,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 9438
+    },
+    {
+      "epoch": 0.2604427858465805,
+      "grad_norm": 0.006473064888268709,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 9439
+    },
+    {
+      "epoch": 0.26047037804764483,
+      "grad_norm": 0.005094799678772688,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 9440
+    },
+    {
+      "epoch": 0.2604979702487092,
+      "grad_norm": 0.049805864691734314,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 9441
+    },
+    {
+      "epoch": 0.2605255624497736,
+      "grad_norm": 0.0036678947508335114,
+      "learning_rate": 0.001,
+      "loss": 0.4396,
+      "step": 9442
+    },
+    {
+      "epoch": 0.26055315465083795,
+      "grad_norm": 0.004897846840322018,
+      "learning_rate": 0.001,
+      "loss": 0.366,
+      "step": 9443
+    },
+    {
+      "epoch": 0.2605807468519023,
+      "grad_norm": 0.00426288740709424,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 9444
+    },
+    {
+      "epoch": 0.26060833905296665,
+      "grad_norm": 0.009089482948184013,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 9445
+    },
+    {
+      "epoch": 0.26063593125403106,
+      "grad_norm": 0.00743220467120409,
+      "learning_rate": 0.001,
+      "loss": 0.3701,
+      "step": 9446
+    },
+    {
+      "epoch": 0.2606635234550954,
+      "grad_norm": 0.004399383440613747,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 9447
+    },
+    {
+      "epoch": 0.26069111565615977,
+      "grad_norm": 0.0023268854711204767,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 9448
+    },
+    {
+      "epoch": 0.2607187078572242,
+      "grad_norm": 0.003956654574722052,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 9449
+    },
+    {
+      "epoch": 0.2607463000582885,
+      "grad_norm": 0.00389308063313365,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 9450
+    },
+    {
+      "epoch": 0.2607738922593529,
+      "grad_norm": 0.0036460221745073795,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 9451
+    },
+    {
+      "epoch": 0.2608014844604173,
+      "grad_norm": 0.003133069258183241,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 9452
+    },
+    {
+      "epoch": 0.26082907666148164,
+      "grad_norm": 0.0027508933562785387,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 9453
+    },
+    {
+      "epoch": 0.260856668862546,
+      "grad_norm": 0.003370395628735423,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 9454
+    },
+    {
+      "epoch": 0.26088426106361035,
+      "grad_norm": 0.0036432910710573196,
+      "learning_rate": 0.001,
+      "loss": 0.4114,
+      "step": 9455
+    },
+    {
+      "epoch": 0.26091185326467475,
+      "grad_norm": 0.004142069257795811,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 9456
+    },
+    {
+      "epoch": 0.2609394454657391,
+      "grad_norm": 0.003075930755585432,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 9457
+    },
+    {
+      "epoch": 0.26096703766680346,
+      "grad_norm": 0.002679844619706273,
+      "learning_rate": 0.001,
+      "loss": 0.371,
+      "step": 9458
+    },
+    {
+      "epoch": 0.26099462986786787,
+      "grad_norm": 0.0034837801940739155,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 9459
+    },
+    {
+      "epoch": 0.2610222220689322,
+      "grad_norm": 0.002534442814067006,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 9460
+    },
+    {
+      "epoch": 0.26104981426999657,
+      "grad_norm": 0.005891683977097273,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 9461
+    },
+    {
+      "epoch": 0.261077406471061,
+      "grad_norm": 0.005468081682920456,
+      "learning_rate": 0.001,
+      "loss": 0.353,
+      "step": 9462
+    },
+    {
+      "epoch": 0.26110499867212533,
+      "grad_norm": 0.002956255106255412,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 9463
+    },
+    {
+      "epoch": 0.2611325908731897,
+      "grad_norm": 0.003849986707791686,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 9464
+    },
+    {
+      "epoch": 0.26116018307425404,
+      "grad_norm": 0.002705160528421402,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 9465
+    },
+    {
+      "epoch": 0.26118777527531845,
+      "grad_norm": 0.002549894852563739,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 9466
+    },
+    {
+      "epoch": 0.2612153674763828,
+      "grad_norm": 0.003938885871320963,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 9467
+    },
+    {
+      "epoch": 0.26124295967744715,
+      "grad_norm": 0.002657047938555479,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 9468
+    },
+    {
+      "epoch": 0.26127055187851156,
+      "grad_norm": 0.004167834762483835,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 9469
+    },
+    {
+      "epoch": 0.2612981440795759,
+      "grad_norm": 0.0034376648254692554,
+      "learning_rate": 0.001,
+      "loss": 0.3597,
+      "step": 9470
+    },
+    {
+      "epoch": 0.26132573628064026,
+      "grad_norm": 0.0034596873447299004,
+      "learning_rate": 0.001,
+      "loss": 0.4028,
+      "step": 9471
+    },
+    {
+      "epoch": 0.26135332848170467,
+      "grad_norm": 0.002702021738514304,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 9472
+    },
+    {
+      "epoch": 0.261380920682769,
+      "grad_norm": 0.0027853166684508324,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 9473
+    },
+    {
+      "epoch": 0.2614085128838334,
+      "grad_norm": 0.0032341419719159603,
+      "learning_rate": 0.001,
+      "loss": 0.4335,
+      "step": 9474
+    },
+    {
+      "epoch": 0.26143610508489773,
+      "grad_norm": 0.0034284372813999653,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 9475
+    },
+    {
+      "epoch": 0.26146369728596214,
+      "grad_norm": 0.003640861948952079,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 9476
+    },
+    {
+      "epoch": 0.2614912894870265,
+      "grad_norm": 0.002866639755666256,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 9477
+    },
+    {
+      "epoch": 0.26151888168809084,
+      "grad_norm": 0.0023725773207843304,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 9478
+    },
+    {
+      "epoch": 0.26154647388915525,
+      "grad_norm": 0.0032590448390692472,
+      "learning_rate": 0.001,
+      "loss": 0.3543,
+      "step": 9479
+    },
+    {
+      "epoch": 0.2615740660902196,
+      "grad_norm": 0.004378579091280699,
+      "learning_rate": 0.001,
+      "loss": 0.3743,
+      "step": 9480
+    },
+    {
+      "epoch": 0.26160165829128396,
+      "grad_norm": 0.0037949122488498688,
+      "learning_rate": 0.001,
+      "loss": 0.4333,
+      "step": 9481
+    },
+    {
+      "epoch": 0.26162925049234836,
+      "grad_norm": 0.002791197504848242,
+      "learning_rate": 0.001,
+      "loss": 0.3499,
+      "step": 9482
+    },
+    {
+      "epoch": 0.2616568426934127,
+      "grad_norm": 0.0038585460279136896,
+      "learning_rate": 0.001,
+      "loss": 0.3701,
+      "step": 9483
+    },
+    {
+      "epoch": 0.26168443489447707,
+      "grad_norm": 0.002629459137097001,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 9484
+    },
+    {
+      "epoch": 0.2617120270955414,
+      "grad_norm": 0.0032473283354192972,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 9485
+    },
+    {
+      "epoch": 0.26173961929660583,
+      "grad_norm": 0.0029964474961161613,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 9486
+    },
+    {
+      "epoch": 0.2617672114976702,
+      "grad_norm": 0.0031551928259432316,
+      "learning_rate": 0.001,
+      "loss": 0.4245,
+      "step": 9487
+    },
+    {
+      "epoch": 0.26179480369873454,
+      "grad_norm": 0.00244794599711895,
+      "learning_rate": 0.001,
+      "loss": 0.3962,
+      "step": 9488
+    },
+    {
+      "epoch": 0.26182239589979894,
+      "grad_norm": 0.003302481258288026,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 9489
+    },
+    {
+      "epoch": 0.2618499881008633,
+      "grad_norm": 0.0027688101399689913,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 9490
+    },
+    {
+      "epoch": 0.26187758030192765,
+      "grad_norm": 0.0031254065688699484,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 9491
+    },
+    {
+      "epoch": 0.26190517250299206,
+      "grad_norm": 0.00236527225933969,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 9492
+    },
+    {
+      "epoch": 0.2619327647040564,
+      "grad_norm": 0.0028193267062306404,
+      "learning_rate": 0.001,
+      "loss": 0.3627,
+      "step": 9493
+    },
+    {
+      "epoch": 0.26196035690512076,
+      "grad_norm": 0.002419235184788704,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 9494
+    },
+    {
+      "epoch": 0.2619879491061851,
+      "grad_norm": 0.006053110118955374,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 9495
+    },
+    {
+      "epoch": 0.2620155413072495,
+      "grad_norm": 0.004557557869702578,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 9496
+    },
+    {
+      "epoch": 0.2620431335083139,
+      "grad_norm": 0.002896302379667759,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 9497
+    },
+    {
+      "epoch": 0.2620707257093782,
+      "grad_norm": 0.002858472056686878,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 9498
+    },
+    {
+      "epoch": 0.26209831791044264,
+      "grad_norm": 0.004048534668982029,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 9499
+    },
+    {
+      "epoch": 0.262125910111507,
+      "grad_norm": 0.003960879985243082,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 9500
+    },
+    {
+      "epoch": 0.262125910111507,
+      "eval_runtime": 25.194,
+      "eval_samples_per_second": 1.27,
+      "eval_steps_per_second": 0.159,
+      "step": 9500
+    },
+    {
+      "epoch": 0.26215350231257134,
+      "grad_norm": 0.0026859226636588573,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 9501
+    },
+    {
+      "epoch": 0.26218109451363575,
+      "grad_norm": 0.004340642131865025,
+      "learning_rate": 0.001,
+      "loss": 0.3599,
+      "step": 9502
+    },
+    {
+      "epoch": 0.2622086867147001,
+      "grad_norm": 0.003308952320367098,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 9503
+    },
+    {
+      "epoch": 0.26223627891576445,
+      "grad_norm": 0.003607766004279256,
+      "learning_rate": 0.001,
+      "loss": 0.4285,
+      "step": 9504
+    },
+    {
+      "epoch": 0.2622638711168288,
+      "grad_norm": 0.004522815812379122,
+      "learning_rate": 0.001,
+      "loss": 0.3602,
+      "step": 9505
+    },
+    {
+      "epoch": 0.2622914633178932,
+      "grad_norm": 0.004235134460031986,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 9506
+    },
+    {
+      "epoch": 0.26231905551895757,
+      "grad_norm": 0.010735023766756058,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 9507
+    },
+    {
+      "epoch": 0.2623466477200219,
+      "grad_norm": 0.004076390992850065,
+      "learning_rate": 0.001,
+      "loss": 0.4218,
+      "step": 9508
+    },
+    {
+      "epoch": 0.2623742399210863,
+      "grad_norm": 0.007312237750738859,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 9509
+    },
+    {
+      "epoch": 0.2624018321221507,
+      "grad_norm": 0.007191124372184277,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 9510
+    },
+    {
+      "epoch": 0.26242942432321503,
+      "grad_norm": 0.0035255979746580124,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 9511
+    },
+    {
+      "epoch": 0.2624570165242794,
+      "grad_norm": 0.004981547594070435,
+      "learning_rate": 0.001,
+      "loss": 0.4575,
+      "step": 9512
+    },
+    {
+      "epoch": 0.2624846087253438,
+      "grad_norm": 0.006985159125179052,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 9513
+    },
+    {
+      "epoch": 0.26251220092640815,
+      "grad_norm": 0.005956006236374378,
+      "learning_rate": 0.001,
+      "loss": 0.3624,
+      "step": 9514
+    },
+    {
+      "epoch": 0.2625397931274725,
+      "grad_norm": 0.005447957664728165,
+      "learning_rate": 0.001,
+      "loss": 0.4185,
+      "step": 9515
+    },
+    {
+      "epoch": 0.2625673853285369,
+      "grad_norm": 0.007735871244221926,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 9516
+    },
+    {
+      "epoch": 0.26259497752960126,
+      "grad_norm": 0.004465447273105383,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 9517
+    },
+    {
+      "epoch": 0.2626225697306656,
+      "grad_norm": 0.0033752690069377422,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 9518
+    },
+    {
+      "epoch": 0.26265016193173,
+      "grad_norm": 0.006316347047686577,
+      "learning_rate": 0.001,
+      "loss": 0.4389,
+      "step": 9519
+    },
+    {
+      "epoch": 0.26267775413279437,
+      "grad_norm": 0.019447481259703636,
+      "learning_rate": 0.001,
+      "loss": 0.3625,
+      "step": 9520
+    },
+    {
+      "epoch": 0.2627053463338587,
+      "grad_norm": 0.0039499313570559025,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 9521
+    },
+    {
+      "epoch": 0.2627329385349231,
+      "grad_norm": 0.005296787712723017,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 9522
+    },
+    {
+      "epoch": 0.2627605307359875,
+      "grad_norm": 0.0028091988060623407,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 9523
+    },
+    {
+      "epoch": 0.26278812293705184,
+      "grad_norm": 0.004335075616836548,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 9524
+    },
+    {
+      "epoch": 0.2628157151381162,
+      "grad_norm": 0.0030884994193911552,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 9525
+    },
+    {
+      "epoch": 0.2628433073391806,
+      "grad_norm": 0.004353479482233524,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 9526
+    },
+    {
+      "epoch": 0.26287089954024495,
+      "grad_norm": 0.004048333968967199,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 9527
+    },
+    {
+      "epoch": 0.2628984917413093,
+      "grad_norm": 0.0034877778962254524,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 9528
+    },
+    {
+      "epoch": 0.2629260839423737,
+      "grad_norm": 0.0037483058404177427,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 9529
+    },
+    {
+      "epoch": 0.26295367614343806,
+      "grad_norm": 0.005867067724466324,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 9530
+    },
+    {
+      "epoch": 0.2629812683445024,
+      "grad_norm": 0.004038537386804819,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 9531
+    },
+    {
+      "epoch": 0.26300886054556677,
+      "grad_norm": 0.005649103317409754,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 9532
+    },
+    {
+      "epoch": 0.2630364527466312,
+      "grad_norm": 0.0033814937341958284,
+      "learning_rate": 0.001,
+      "loss": 0.3688,
+      "step": 9533
+    },
+    {
+      "epoch": 0.26306404494769553,
+      "grad_norm": 0.0035686511546373367,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 9534
+    },
+    {
+      "epoch": 0.2630916371487599,
+      "grad_norm": 0.004245868884027004,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 9535
+    },
+    {
+      "epoch": 0.2631192293498243,
+      "grad_norm": 0.006375094410032034,
+      "learning_rate": 0.001,
+      "loss": 0.3639,
+      "step": 9536
+    },
+    {
+      "epoch": 0.26314682155088864,
+      "grad_norm": 0.008856172673404217,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 9537
+    },
+    {
+      "epoch": 0.263174413751953,
+      "grad_norm": 0.005304283462464809,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 9538
+    },
+    {
+      "epoch": 0.2632020059530174,
+      "grad_norm": 0.003037388902157545,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 9539
+    },
+    {
+      "epoch": 0.26322959815408176,
+      "grad_norm": 0.00462722172960639,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 9540
+    },
+    {
+      "epoch": 0.2632571903551461,
+      "grad_norm": 0.009490979835391045,
+      "learning_rate": 0.001,
+      "loss": 0.3635,
+      "step": 9541
+    },
+    {
+      "epoch": 0.26328478255621046,
+      "grad_norm": 0.0032088488806039095,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 9542
+    },
+    {
+      "epoch": 0.26331237475727487,
+      "grad_norm": 0.006467896047979593,
+      "learning_rate": 0.001,
+      "loss": 0.3777,
+      "step": 9543
+    },
+    {
+      "epoch": 0.2633399669583392,
+      "grad_norm": 0.005221541505306959,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 9544
+    },
+    {
+      "epoch": 0.2633675591594036,
+      "grad_norm": 0.0027723468374460936,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 9545
+    },
+    {
+      "epoch": 0.263395151360468,
+      "grad_norm": 0.0038048315327614546,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 9546
+    },
+    {
+      "epoch": 0.26342274356153234,
+      "grad_norm": 0.004077599383890629,
+      "learning_rate": 0.001,
+      "loss": 0.438,
+      "step": 9547
+    },
+    {
+      "epoch": 0.2634503357625967,
+      "grad_norm": 0.0045372615568339825,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 9548
+    },
+    {
+      "epoch": 0.2634779279636611,
+      "grad_norm": 0.0027856752276420593,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 9549
+    },
+    {
+      "epoch": 0.26350552016472545,
+      "grad_norm": 0.0026442657690495253,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 9550
+    },
+    {
+      "epoch": 0.2635331123657898,
+      "grad_norm": 0.003073024796321988,
+      "learning_rate": 0.001,
+      "loss": 0.4273,
+      "step": 9551
+    },
+    {
+      "epoch": 0.26356070456685415,
+      "grad_norm": 0.0030135842971503735,
+      "learning_rate": 0.001,
+      "loss": 0.4209,
+      "step": 9552
+    },
+    {
+      "epoch": 0.26358829676791856,
+      "grad_norm": 0.004977943375706673,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 9553
+    },
+    {
+      "epoch": 0.2636158889689829,
+      "grad_norm": 0.0030254484154284,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 9554
+    },
+    {
+      "epoch": 0.26364348117004727,
+      "grad_norm": 0.003810320282354951,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 9555
+    },
+    {
+      "epoch": 0.2636710733711117,
+      "grad_norm": 0.002874776953831315,
+      "learning_rate": 0.001,
+      "loss": 0.4319,
+      "step": 9556
+    },
+    {
+      "epoch": 0.263698665572176,
+      "grad_norm": 0.00934076588600874,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 9557
+    },
+    {
+      "epoch": 0.2637262577732404,
+      "grad_norm": 0.0022208630107343197,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 9558
+    },
+    {
+      "epoch": 0.2637538499743048,
+      "grad_norm": 0.002492060884833336,
+      "learning_rate": 0.001,
+      "loss": 0.4334,
+      "step": 9559
+    },
+    {
+      "epoch": 0.26378144217536914,
+      "grad_norm": 0.005849645473062992,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 9560
+    },
+    {
+      "epoch": 0.2638090343764335,
+      "grad_norm": 0.004015061073005199,
+      "learning_rate": 0.001,
+      "loss": 0.3674,
+      "step": 9561
+    },
+    {
+      "epoch": 0.26383662657749785,
+      "grad_norm": 0.0033112233504652977,
+      "learning_rate": 0.001,
+      "loss": 0.3804,
+      "step": 9562
+    },
+    {
+      "epoch": 0.26386421877856225,
+      "grad_norm": 0.002749896375462413,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 9563
+    },
+    {
+      "epoch": 0.2638918109796266,
+      "grad_norm": 0.0027315288316458464,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 9564
+    },
+    {
+      "epoch": 0.26391940318069096,
+      "grad_norm": 0.008834882639348507,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 9565
+    },
+    {
+      "epoch": 0.26394699538175537,
+      "grad_norm": 0.01456514373421669,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 9566
+    },
+    {
+      "epoch": 0.2639745875828197,
+      "grad_norm": 0.0028064576908946037,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 9567
+    },
+    {
+      "epoch": 0.2640021797838841,
+      "grad_norm": 0.00802356656640768,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 9568
+    },
+    {
+      "epoch": 0.2640297719849485,
+      "grad_norm": 0.0031302161514759064,
+      "learning_rate": 0.001,
+      "loss": 0.4031,
+      "step": 9569
+    },
+    {
+      "epoch": 0.26405736418601283,
+      "grad_norm": 0.0035431873984634876,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 9570
+    },
+    {
+      "epoch": 0.2640849563870772,
+      "grad_norm": 0.002664680825546384,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 9571
+    },
+    {
+      "epoch": 0.26411254858814154,
+      "grad_norm": 0.006673896219581366,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 9572
+    },
+    {
+      "epoch": 0.26414014078920595,
+      "grad_norm": 0.003176551777869463,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 9573
+    },
+    {
+      "epoch": 0.2641677329902703,
+      "grad_norm": 0.01657606102526188,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 9574
+    },
+    {
+      "epoch": 0.26419532519133465,
+      "grad_norm": 0.0069385310634970665,
+      "learning_rate": 0.001,
+      "loss": 0.4318,
+      "step": 9575
+    },
+    {
+      "epoch": 0.26422291739239906,
+      "grad_norm": 0.009939515963196754,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 9576
+    },
+    {
+      "epoch": 0.2642505095934634,
+      "grad_norm": 0.002801012946292758,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 9577
+    },
+    {
+      "epoch": 0.26427810179452776,
+      "grad_norm": 0.003218452911823988,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 9578
+    },
+    {
+      "epoch": 0.2643056939955922,
+      "grad_norm": 0.0029445989057421684,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 9579
+    },
+    {
+      "epoch": 0.2643332861966565,
+      "grad_norm": 0.003089828649535775,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 9580
+    },
+    {
+      "epoch": 0.2643608783977209,
+      "grad_norm": 0.0025015585124492645,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 9581
+    },
+    {
+      "epoch": 0.26438847059878523,
+      "grad_norm": 0.0021664395462721586,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 9582
+    },
+    {
+      "epoch": 0.26441606279984964,
+      "grad_norm": 0.003756561316549778,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 9583
+    },
+    {
+      "epoch": 0.264443655000914,
+      "grad_norm": 0.003459861734881997,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 9584
+    },
+    {
+      "epoch": 0.26447124720197834,
+      "grad_norm": 0.0028741243295371532,
+      "learning_rate": 0.001,
+      "loss": 0.4028,
+      "step": 9585
+    },
+    {
+      "epoch": 0.26449883940304275,
+      "grad_norm": 0.00595883559435606,
+      "learning_rate": 0.001,
+      "loss": 0.4392,
+      "step": 9586
+    },
+    {
+      "epoch": 0.2645264316041071,
+      "grad_norm": 0.011750889010727406,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 9587
+    },
+    {
+      "epoch": 0.26455402380517146,
+      "grad_norm": 0.005314968526363373,
+      "learning_rate": 0.001,
+      "loss": 0.354,
+      "step": 9588
+    },
+    {
+      "epoch": 0.26458161600623586,
+      "grad_norm": 0.007636393420398235,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 9589
+    },
+    {
+      "epoch": 0.2646092082073002,
+      "grad_norm": 0.003898282302543521,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 9590
+    },
+    {
+      "epoch": 0.26463680040836457,
+      "grad_norm": 0.0038661775179207325,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 9591
+    },
+    {
+      "epoch": 0.2646643926094289,
+      "grad_norm": 0.006659380160272121,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 9592
+    },
+    {
+      "epoch": 0.26469198481049333,
+      "grad_norm": 0.0044357809238135815,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 9593
+    },
+    {
+      "epoch": 0.2647195770115577,
+      "grad_norm": 0.0028593826573342085,
+      "learning_rate": 0.001,
+      "loss": 0.3733,
+      "step": 9594
+    },
+    {
+      "epoch": 0.26474716921262204,
+      "grad_norm": 0.003539180150255561,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 9595
+    },
+    {
+      "epoch": 0.26477476141368644,
+      "grad_norm": 0.002942041726782918,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 9596
+    },
+    {
+      "epoch": 0.2648023536147508,
+      "grad_norm": 0.0028285589069128036,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 9597
+    },
+    {
+      "epoch": 0.26482994581581515,
+      "grad_norm": 0.0025839281734079123,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 9598
+    },
+    {
+      "epoch": 0.2648575380168795,
+      "grad_norm": 0.00478973425924778,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 9599
+    },
+    {
+      "epoch": 0.2648851302179439,
+      "grad_norm": 0.00439714128151536,
+      "learning_rate": 0.001,
+      "loss": 0.3596,
+      "step": 9600
+    },
+    {
+      "epoch": 0.26491272241900826,
+      "grad_norm": 0.0023904817644506693,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 9601
+    },
+    {
+      "epoch": 0.2649403146200726,
+      "grad_norm": 0.0025250171311199665,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 9602
+    },
+    {
+      "epoch": 0.264967906821137,
+      "grad_norm": 0.0065780081786215305,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 9603
+    },
+    {
+      "epoch": 0.2649954990222014,
+      "grad_norm": 0.0026748040691018105,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 9604
+    },
+    {
+      "epoch": 0.2650230912232657,
+      "grad_norm": 0.0031709959730505943,
+      "learning_rate": 0.001,
+      "loss": 0.4583,
+      "step": 9605
+    },
+    {
+      "epoch": 0.26505068342433014,
+      "grad_norm": 0.0026485987473279238,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 9606
+    },
+    {
+      "epoch": 0.2650782756253945,
+      "grad_norm": 0.0029246548656374216,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 9607
+    },
+    {
+      "epoch": 0.26510586782645884,
+      "grad_norm": 0.0031324047595262527,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 9608
+    },
+    {
+      "epoch": 0.2651334600275232,
+      "grad_norm": 0.0024871290661394596,
+      "learning_rate": 0.001,
+      "loss": 0.4121,
+      "step": 9609
+    },
+    {
+      "epoch": 0.2651610522285876,
+      "grad_norm": 0.00248337653465569,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 9610
+    },
+    {
+      "epoch": 0.26518864442965195,
+      "grad_norm": 0.002592930570244789,
+      "learning_rate": 0.001,
+      "loss": 0.4398,
+      "step": 9611
+    },
+    {
+      "epoch": 0.2652162366307163,
+      "grad_norm": 0.0031219925731420517,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 9612
+    },
+    {
+      "epoch": 0.2652438288317807,
+      "grad_norm": 0.002096372190862894,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 9613
+    },
+    {
+      "epoch": 0.26527142103284507,
+      "grad_norm": 0.0021825828589498997,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 9614
+    },
+    {
+      "epoch": 0.2652990132339094,
+      "grad_norm": 0.0035961021203547716,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 9615
+    },
+    {
+      "epoch": 0.26532660543497383,
+      "grad_norm": 0.003122879657894373,
+      "learning_rate": 0.001,
+      "loss": 0.4624,
+      "step": 9616
+    },
+    {
+      "epoch": 0.2653541976360382,
+      "grad_norm": 0.004705473314970732,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 9617
+    },
+    {
+      "epoch": 0.26538178983710253,
+      "grad_norm": 0.0024439608678221703,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 9618
+    },
+    {
+      "epoch": 0.2654093820381669,
+      "grad_norm": 0.0027413556817919016,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 9619
+    },
+    {
+      "epoch": 0.2654369742392313,
+      "grad_norm": 0.004719111602753401,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 9620
+    },
+    {
+      "epoch": 0.26546456644029565,
+      "grad_norm": 0.003787653986364603,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 9621
+    },
+    {
+      "epoch": 0.26549215864136,
+      "grad_norm": 0.007587883621454239,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 9622
+    },
+    {
+      "epoch": 0.2655197508424244,
+      "grad_norm": 0.0023858651984483004,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 9623
+    },
+    {
+      "epoch": 0.26554734304348876,
+      "grad_norm": 0.005965366493910551,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 9624
+    },
+    {
+      "epoch": 0.2655749352445531,
+      "grad_norm": 0.003585060592740774,
+      "learning_rate": 0.001,
+      "loss": 0.4503,
+      "step": 9625
+    },
+    {
+      "epoch": 0.2656025274456175,
+      "grad_norm": 0.006620787549763918,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 9626
+    },
+    {
+      "epoch": 0.2656301196466819,
+      "grad_norm": 0.002199475420638919,
+      "learning_rate": 0.001,
+      "loss": 0.4479,
+      "step": 9627
+    },
+    {
+      "epoch": 0.2656577118477462,
+      "grad_norm": 0.003352593397721648,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 9628
+    },
+    {
+      "epoch": 0.2656853040488106,
+      "grad_norm": 0.00883631780743599,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 9629
+    },
+    {
+      "epoch": 0.265712896249875,
+      "grad_norm": 0.0033785421401262283,
+      "learning_rate": 0.001,
+      "loss": 0.4285,
+      "step": 9630
+    },
+    {
+      "epoch": 0.26574048845093934,
+      "grad_norm": 0.00507304398342967,
+      "learning_rate": 0.001,
+      "loss": 0.3795,
+      "step": 9631
+    },
+    {
+      "epoch": 0.2657680806520037,
+      "grad_norm": 0.010923560708761215,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 9632
+    },
+    {
+      "epoch": 0.2657956728530681,
+      "grad_norm": 0.003360697068274021,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 9633
+    },
+    {
+      "epoch": 0.26582326505413245,
+      "grad_norm": 0.003309650346636772,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 9634
+    },
+    {
+      "epoch": 0.2658508572551968,
+      "grad_norm": 0.003346043173223734,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 9635
+    },
+    {
+      "epoch": 0.2658784494562612,
+      "grad_norm": 0.002929107751697302,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 9636
+    },
+    {
+      "epoch": 0.26590604165732556,
+      "grad_norm": 0.0035214154049754143,
+      "learning_rate": 0.001,
+      "loss": 0.4382,
+      "step": 9637
+    },
+    {
+      "epoch": 0.2659336338583899,
+      "grad_norm": 0.004796061664819717,
+      "learning_rate": 0.001,
+      "loss": 0.3684,
+      "step": 9638
+    },
+    {
+      "epoch": 0.26596122605945427,
+      "grad_norm": 0.002335017779842019,
+      "learning_rate": 0.001,
+      "loss": 0.4417,
+      "step": 9639
+    },
+    {
+      "epoch": 0.2659888182605187,
+      "grad_norm": 0.002536866581067443,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 9640
+    },
+    {
+      "epoch": 0.26601641046158303,
+      "grad_norm": 0.0021245151292532682,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 9641
+    },
+    {
+      "epoch": 0.2660440026626474,
+      "grad_norm": 0.002907783491536975,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 9642
+    },
+    {
+      "epoch": 0.2660715948637118,
+      "grad_norm": 0.00233438890427351,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 9643
+    },
+    {
+      "epoch": 0.26609918706477614,
+      "grad_norm": 0.002872066106647253,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 9644
+    },
+    {
+      "epoch": 0.2661267792658405,
+      "grad_norm": 0.002991445129737258,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 9645
+    },
+    {
+      "epoch": 0.2661543714669049,
+      "grad_norm": 0.004284476395696402,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 9646
+    },
+    {
+      "epoch": 0.26618196366796926,
+      "grad_norm": 0.0026576148811727762,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 9647
+    },
+    {
+      "epoch": 0.2662095558690336,
+      "grad_norm": 0.0023093062918633223,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 9648
+    },
+    {
+      "epoch": 0.26623714807009796,
+      "grad_norm": 0.0027488505002111197,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 9649
+    },
+    {
+      "epoch": 0.26626474027116237,
+      "grad_norm": 0.002484068041667342,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 9650
+    },
+    {
+      "epoch": 0.2662923324722267,
+      "grad_norm": 0.003056896850466728,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 9651
+    },
+    {
+      "epoch": 0.2663199246732911,
+      "grad_norm": 0.0031155794858932495,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 9652
+    },
+    {
+      "epoch": 0.2663475168743555,
+      "grad_norm": 0.002967652166262269,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 9653
+    },
+    {
+      "epoch": 0.26637510907541984,
+      "grad_norm": 0.004259769804775715,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 9654
+    },
+    {
+      "epoch": 0.2664027012764842,
+      "grad_norm": 0.0025000995956361294,
+      "learning_rate": 0.001,
+      "loss": 0.4392,
+      "step": 9655
+    },
+    {
+      "epoch": 0.2664302934775486,
+      "grad_norm": 0.0023649544455111027,
+      "learning_rate": 0.001,
+      "loss": 0.4357,
+      "step": 9656
+    },
+    {
+      "epoch": 0.26645788567861295,
+      "grad_norm": 0.005546201951801777,
+      "learning_rate": 0.001,
+      "loss": 0.3846,
+      "step": 9657
+    },
+    {
+      "epoch": 0.2664854778796773,
+      "grad_norm": 0.005744127556681633,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 9658
+    },
+    {
+      "epoch": 0.26651307008074165,
+      "grad_norm": 0.004911984317004681,
+      "learning_rate": 0.001,
+      "loss": 0.4594,
+      "step": 9659
+    },
+    {
+      "epoch": 0.26654066228180606,
+      "grad_norm": 0.016794539988040924,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 9660
+    },
+    {
+      "epoch": 0.2665682544828704,
+      "grad_norm": 0.004329861607402563,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 9661
+    },
+    {
+      "epoch": 0.26659584668393477,
+      "grad_norm": 0.002347084926441312,
+      "learning_rate": 0.001,
+      "loss": 0.4268,
+      "step": 9662
+    },
+    {
+      "epoch": 0.2666234388849992,
+      "grad_norm": 0.003959767986088991,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 9663
+    },
+    {
+      "epoch": 0.26665103108606353,
+      "grad_norm": 0.0030292284209281206,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 9664
+    },
+    {
+      "epoch": 0.2666786232871279,
+      "grad_norm": 0.0031279707327485085,
+      "learning_rate": 0.001,
+      "loss": 0.4021,
+      "step": 9665
+    },
+    {
+      "epoch": 0.2667062154881923,
+      "grad_norm": 0.0038146632723510265,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 9666
+    },
+    {
+      "epoch": 0.26673380768925664,
+      "grad_norm": 0.002462350530549884,
+      "learning_rate": 0.001,
+      "loss": 0.4424,
+      "step": 9667
+    },
+    {
+      "epoch": 0.266761399890321,
+      "grad_norm": 0.0025544725358486176,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 9668
+    },
+    {
+      "epoch": 0.26678899209138535,
+      "grad_norm": 0.008749466389417648,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 9669
+    },
+    {
+      "epoch": 0.26681658429244975,
+      "grad_norm": 0.004481594543904066,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 9670
+    },
+    {
+      "epoch": 0.2668441764935141,
+      "grad_norm": 0.0027323602698743343,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 9671
+    },
+    {
+      "epoch": 0.26687176869457846,
+      "grad_norm": 0.005246414337307215,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 9672
+    },
+    {
+      "epoch": 0.26689936089564287,
+      "grad_norm": 0.0037931909319013357,
+      "learning_rate": 0.001,
+      "loss": 0.361,
+      "step": 9673
+    },
+    {
+      "epoch": 0.2669269530967072,
+      "grad_norm": 0.0028204875998198986,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 9674
+    },
+    {
+      "epoch": 0.2669545452977716,
+      "grad_norm": 0.002369094407185912,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 9675
+    },
+    {
+      "epoch": 0.266982137498836,
+      "grad_norm": 0.0028346215840429068,
+      "learning_rate": 0.001,
+      "loss": 0.4193,
+      "step": 9676
+    },
+    {
+      "epoch": 0.26700972969990033,
+      "grad_norm": 0.002556850900873542,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 9677
+    },
+    {
+      "epoch": 0.2670373219009647,
+      "grad_norm": 0.0030708981212228537,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 9678
+    },
+    {
+      "epoch": 0.26706491410202904,
+      "grad_norm": 0.003195315832272172,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 9679
+    },
+    {
+      "epoch": 0.26709250630309345,
+      "grad_norm": 0.0025848529767245054,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 9680
+    },
+    {
+      "epoch": 0.2671200985041578,
+      "grad_norm": 0.0033659334294497967,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 9681
+    },
+    {
+      "epoch": 0.26714769070522215,
+      "grad_norm": 0.023203924298286438,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 9682
+    },
+    {
+      "epoch": 0.26717528290628656,
+      "grad_norm": 0.016148481518030167,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 9683
+    },
+    {
+      "epoch": 0.2672028751073509,
+      "grad_norm": 0.003742114407941699,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 9684
+    },
+    {
+      "epoch": 0.26723046730841526,
+      "grad_norm": 0.003448138013482094,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 9685
+    },
+    {
+      "epoch": 0.2672580595094797,
+      "grad_norm": 0.004335745703428984,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 9686
+    },
+    {
+      "epoch": 0.267285651710544,
+      "grad_norm": 0.0025457171723246574,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 9687
+    },
+    {
+      "epoch": 0.2673132439116084,
+      "grad_norm": 0.0024251220747828484,
+      "learning_rate": 0.001,
+      "loss": 0.4321,
+      "step": 9688
+    },
+    {
+      "epoch": 0.26734083611267273,
+      "grad_norm": 0.00228850357234478,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 9689
+    },
+    {
+      "epoch": 0.26736842831373714,
+      "grad_norm": 0.0042780437506735325,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 9690
+    },
+    {
+      "epoch": 0.2673960205148015,
+      "grad_norm": 0.003844790393486619,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 9691
+    },
+    {
+      "epoch": 0.26742361271586584,
+      "grad_norm": 0.005739064887166023,
+      "learning_rate": 0.001,
+      "loss": 0.4403,
+      "step": 9692
+    },
+    {
+      "epoch": 0.26745120491693025,
+      "grad_norm": 0.0037738836836069822,
+      "learning_rate": 0.001,
+      "loss": 0.3757,
+      "step": 9693
+    },
+    {
+      "epoch": 0.2674787971179946,
+      "grad_norm": 0.006218787748366594,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 9694
+    },
+    {
+      "epoch": 0.26750638931905896,
+      "grad_norm": 0.002998180454596877,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 9695
+    },
+    {
+      "epoch": 0.2675339815201233,
+      "grad_norm": 0.008532087318599224,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 9696
+    },
+    {
+      "epoch": 0.2675615737211877,
+      "grad_norm": 0.002803490497171879,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 9697
+    },
+    {
+      "epoch": 0.26758916592225207,
+      "grad_norm": 0.003659422043710947,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 9698
+    },
+    {
+      "epoch": 0.2676167581233164,
+      "grad_norm": 0.004317516926676035,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 9699
+    },
+    {
+      "epoch": 0.26764435032438083,
+      "grad_norm": 0.004090850241482258,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 9700
+    },
+    {
+      "epoch": 0.2676719425254452,
+      "grad_norm": 0.005415142513811588,
+      "learning_rate": 0.001,
+      "loss": 0.4334,
+      "step": 9701
+    },
+    {
+      "epoch": 0.26769953472650954,
+      "grad_norm": 0.0036655161529779434,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 9702
+    },
+    {
+      "epoch": 0.26772712692757394,
+      "grad_norm": 0.008789146319031715,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 9703
+    },
+    {
+      "epoch": 0.2677547191286383,
+      "grad_norm": 0.004173342138528824,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 9704
+    },
+    {
+      "epoch": 0.26778231132970265,
+      "grad_norm": 0.0031271290499716997,
+      "learning_rate": 0.001,
+      "loss": 0.424,
+      "step": 9705
+    },
+    {
+      "epoch": 0.267809903530767,
+      "grad_norm": 0.0030106983613222837,
+      "learning_rate": 0.001,
+      "loss": 0.3698,
+      "step": 9706
+    },
+    {
+      "epoch": 0.2678374957318314,
+      "grad_norm": 0.004071452189236879,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 9707
+    },
+    {
+      "epoch": 0.26786508793289576,
+      "grad_norm": 0.002735607326030731,
+      "learning_rate": 0.001,
+      "loss": 0.4445,
+      "step": 9708
+    },
+    {
+      "epoch": 0.2678926801339601,
+      "grad_norm": 0.0036248862743377686,
+      "learning_rate": 0.001,
+      "loss": 0.3786,
+      "step": 9709
+    },
+    {
+      "epoch": 0.2679202723350245,
+      "grad_norm": 0.004495333880186081,
+      "learning_rate": 0.001,
+      "loss": 0.4288,
+      "step": 9710
+    },
+    {
+      "epoch": 0.2679478645360889,
+      "grad_norm": 0.003465922549366951,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 9711
+    },
+    {
+      "epoch": 0.26797545673715323,
+      "grad_norm": 0.0050760298036038876,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 9712
+    },
+    {
+      "epoch": 0.26800304893821764,
+      "grad_norm": 0.0025128766428679228,
+      "learning_rate": 0.001,
+      "loss": 0.4293,
+      "step": 9713
+    },
+    {
+      "epoch": 0.268030641139282,
+      "grad_norm": 0.0030928144697099924,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 9714
+    },
+    {
+      "epoch": 0.26805823334034634,
+      "grad_norm": 0.0031707047019153833,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 9715
+    },
+    {
+      "epoch": 0.2680858255414107,
+      "grad_norm": 0.0074263643473386765,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 9716
+    },
+    {
+      "epoch": 0.2681134177424751,
+      "grad_norm": 0.004428863059729338,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 9717
+    },
+    {
+      "epoch": 0.26814100994353945,
+      "grad_norm": 0.005300541874021292,
+      "learning_rate": 0.001,
+      "loss": 0.421,
+      "step": 9718
+    },
+    {
+      "epoch": 0.2681686021446038,
+      "grad_norm": 0.013318234123289585,
+      "learning_rate": 0.001,
+      "loss": 0.3608,
+      "step": 9719
+    },
+    {
+      "epoch": 0.2681961943456682,
+      "grad_norm": 0.0030859310645610094,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 9720
+    },
+    {
+      "epoch": 0.26822378654673257,
+      "grad_norm": 0.00345474760979414,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 9721
+    },
+    {
+      "epoch": 0.2682513787477969,
+      "grad_norm": 0.006549532990902662,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 9722
+    },
+    {
+      "epoch": 0.26827897094886133,
+      "grad_norm": 0.013787861913442612,
+      "learning_rate": 0.001,
+      "loss": 0.4332,
+      "step": 9723
+    },
+    {
+      "epoch": 0.2683065631499257,
+      "grad_norm": 0.00343018164858222,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 9724
+    },
+    {
+      "epoch": 0.26833415535099003,
+      "grad_norm": 0.002628603018820286,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 9725
+    },
+    {
+      "epoch": 0.2683617475520544,
+      "grad_norm": 0.004094823729246855,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 9726
+    },
+    {
+      "epoch": 0.2683893397531188,
+      "grad_norm": 0.002913734642788768,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 9727
+    },
+    {
+      "epoch": 0.26841693195418315,
+      "grad_norm": 0.002792428247630596,
+      "learning_rate": 0.001,
+      "loss": 0.3806,
+      "step": 9728
+    },
+    {
+      "epoch": 0.2684445241552475,
+      "grad_norm": 0.0024083973839879036,
+      "learning_rate": 0.001,
+      "loss": 0.4336,
+      "step": 9729
+    },
+    {
+      "epoch": 0.2684721163563119,
+      "grad_norm": 0.002965459832921624,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 9730
+    },
+    {
+      "epoch": 0.26849970855737626,
+      "grad_norm": 0.01316264271736145,
+      "learning_rate": 0.001,
+      "loss": 0.4539,
+      "step": 9731
+    },
+    {
+      "epoch": 0.2685273007584406,
+      "grad_norm": 0.002591692376881838,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 9732
+    },
+    {
+      "epoch": 0.268554892959505,
+      "grad_norm": 0.002965483581647277,
+      "learning_rate": 0.001,
+      "loss": 0.3652,
+      "step": 9733
+    },
+    {
+      "epoch": 0.2685824851605694,
+      "grad_norm": 0.004762920085340738,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 9734
+    },
+    {
+      "epoch": 0.2686100773616337,
+      "grad_norm": 0.0021797195076942444,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 9735
+    },
+    {
+      "epoch": 0.2686376695626981,
+      "grad_norm": 0.0029116403311491013,
+      "learning_rate": 0.001,
+      "loss": 0.3816,
+      "step": 9736
+    },
+    {
+      "epoch": 0.2686652617637625,
+      "grad_norm": 0.003725858870893717,
+      "learning_rate": 0.001,
+      "loss": 0.3588,
+      "step": 9737
+    },
+    {
+      "epoch": 0.26869285396482684,
+      "grad_norm": 0.004337244667112827,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 9738
+    },
+    {
+      "epoch": 0.2687204461658912,
+      "grad_norm": 0.0027076283004134893,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 9739
+    },
+    {
+      "epoch": 0.2687480383669556,
+      "grad_norm": 0.0021667422261089087,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 9740
+    },
+    {
+      "epoch": 0.26877563056801995,
+      "grad_norm": 0.0021627771202474833,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 9741
+    },
+    {
+      "epoch": 0.2688032227690843,
+      "grad_norm": 0.0031696807127445936,
+      "learning_rate": 0.001,
+      "loss": 0.3744,
+      "step": 9742
+    },
+    {
+      "epoch": 0.2688308149701487,
+      "grad_norm": 0.0033821675460785627,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 9743
+    },
+    {
+      "epoch": 0.26885840717121307,
+      "grad_norm": 0.0039416286163032055,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 9744
+    },
+    {
+      "epoch": 0.2688859993722774,
+      "grad_norm": 0.003314357250928879,
+      "learning_rate": 0.001,
+      "loss": 0.3387,
+      "step": 9745
+    },
+    {
+      "epoch": 0.26891359157334177,
+      "grad_norm": 0.0031994907185435295,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 9746
+    },
+    {
+      "epoch": 0.2689411837744062,
+      "grad_norm": 0.006605618633329868,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 9747
+    },
+    {
+      "epoch": 0.26896877597547053,
+      "grad_norm": 0.0040326593443751335,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 9748
+    },
+    {
+      "epoch": 0.2689963681765349,
+      "grad_norm": 0.004000399261713028,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 9749
+    },
+    {
+      "epoch": 0.2690239603775993,
+      "grad_norm": 0.005057289730757475,
+      "learning_rate": 0.001,
+      "loss": 0.4383,
+      "step": 9750
+    },
+    {
+      "epoch": 0.26905155257866364,
+      "grad_norm": 0.006951628718525171,
+      "learning_rate": 0.001,
+      "loss": 0.4397,
+      "step": 9751
+    },
+    {
+      "epoch": 0.269079144779728,
+      "grad_norm": 0.012159796431660652,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 9752
+    },
+    {
+      "epoch": 0.2691067369807924,
+      "grad_norm": 0.012929181568324566,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 9753
+    },
+    {
+      "epoch": 0.26913432918185676,
+      "grad_norm": 0.028701601549983025,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 9754
+    },
+    {
+      "epoch": 0.2691619213829211,
+      "grad_norm": 0.00337967392988503,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 9755
+    },
+    {
+      "epoch": 0.26918951358398546,
+      "grad_norm": 0.004440324380993843,
+      "learning_rate": 0.001,
+      "loss": 0.4315,
+      "step": 9756
+    },
+    {
+      "epoch": 0.26921710578504987,
+      "grad_norm": 0.009946012869477272,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 9757
+    },
+    {
+      "epoch": 0.2692446979861142,
+      "grad_norm": 0.0047539942897856236,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 9758
+    },
+    {
+      "epoch": 0.2692722901871786,
+      "grad_norm": 0.002247569151222706,
+      "learning_rate": 0.001,
+      "loss": 0.4414,
+      "step": 9759
+    },
+    {
+      "epoch": 0.269299882388243,
+      "grad_norm": 0.004046887159347534,
+      "learning_rate": 0.001,
+      "loss": 0.3617,
+      "step": 9760
+    },
+    {
+      "epoch": 0.26932747458930734,
+      "grad_norm": 0.0023645355831831694,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 9761
+    },
+    {
+      "epoch": 0.2693550667903717,
+      "grad_norm": 0.006252918858081102,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 9762
+    },
+    {
+      "epoch": 0.2693826589914361,
+      "grad_norm": 0.004668123554438353,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 9763
+    },
+    {
+      "epoch": 0.26941025119250045,
+      "grad_norm": 0.005385317839682102,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 9764
+    },
+    {
+      "epoch": 0.2694378433935648,
+      "grad_norm": 0.004070324823260307,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 9765
+    },
+    {
+      "epoch": 0.26946543559462915,
+      "grad_norm": 0.002345689572393894,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 9766
+    },
+    {
+      "epoch": 0.26949302779569356,
+      "grad_norm": 0.004202275536954403,
+      "learning_rate": 0.001,
+      "loss": 0.4266,
+      "step": 9767
+    },
+    {
+      "epoch": 0.2695206199967579,
+      "grad_norm": 0.003104708855971694,
+      "learning_rate": 0.001,
+      "loss": 0.3805,
+      "step": 9768
+    },
+    {
+      "epoch": 0.26954821219782227,
+      "grad_norm": 0.003290161257609725,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 9769
+    },
+    {
+      "epoch": 0.2695758043988867,
+      "grad_norm": 0.0024769490119069815,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 9770
+    },
+    {
+      "epoch": 0.26960339659995103,
+      "grad_norm": 0.002376476302742958,
+      "learning_rate": 0.001,
+      "loss": 0.3567,
+      "step": 9771
+    },
+    {
+      "epoch": 0.2696309888010154,
+      "grad_norm": 0.00581009779125452,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 9772
+    },
+    {
+      "epoch": 0.2696585810020798,
+      "grad_norm": 0.005047387443482876,
+      "learning_rate": 0.001,
+      "loss": 0.4187,
+      "step": 9773
+    },
+    {
+      "epoch": 0.26968617320314414,
+      "grad_norm": 0.002559883752837777,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 9774
+    },
+    {
+      "epoch": 0.2697137654042085,
+      "grad_norm": 0.002292527351528406,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 9775
+    },
+    {
+      "epoch": 0.26974135760527285,
+      "grad_norm": 0.002428713021799922,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 9776
+    },
+    {
+      "epoch": 0.26976894980633725,
+      "grad_norm": 0.0029613785445690155,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 9777
+    },
+    {
+      "epoch": 0.2697965420074016,
+      "grad_norm": 0.00751551054418087,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 9778
+    },
+    {
+      "epoch": 0.26982413420846596,
+      "grad_norm": 0.0034091894049197435,
+      "learning_rate": 0.001,
+      "loss": 0.4048,
+      "step": 9779
+    },
+    {
+      "epoch": 0.26985172640953037,
+      "grad_norm": 0.003550524590536952,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 9780
+    },
+    {
+      "epoch": 0.2698793186105947,
+      "grad_norm": 0.019099680706858635,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 9781
+    },
+    {
+      "epoch": 0.2699069108116591,
+      "grad_norm": 0.013346359133720398,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 9782
+    },
+    {
+      "epoch": 0.2699345030127235,
+      "grad_norm": 0.0031708034221082926,
+      "learning_rate": 0.001,
+      "loss": 0.4188,
+      "step": 9783
+    },
+    {
+      "epoch": 0.26996209521378783,
+      "grad_norm": 0.009070201776921749,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 9784
+    },
+    {
+      "epoch": 0.2699896874148522,
+      "grad_norm": 0.003416346851736307,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 9785
+    },
+    {
+      "epoch": 0.27001727961591654,
+      "grad_norm": 0.0025843738112598658,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 9786
+    },
+    {
+      "epoch": 0.27004487181698095,
+      "grad_norm": 0.024039220064878464,
+      "learning_rate": 0.001,
+      "loss": 0.3799,
+      "step": 9787
+    },
+    {
+      "epoch": 0.2700724640180453,
+      "grad_norm": 0.00454790610820055,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 9788
+    },
+    {
+      "epoch": 0.27010005621910965,
+      "grad_norm": 0.0040943981148302555,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 9789
+    },
+    {
+      "epoch": 0.27012764842017406,
+      "grad_norm": 0.01507536880671978,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 9790
+    },
+    {
+      "epoch": 0.2701552406212384,
+      "grad_norm": 0.007708291057497263,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 9791
+    },
+    {
+      "epoch": 0.27018283282230277,
+      "grad_norm": 0.002290240256115794,
+      "learning_rate": 0.001,
+      "loss": 0.4356,
+      "step": 9792
+    },
+    {
+      "epoch": 0.2702104250233671,
+      "grad_norm": 0.006000965368002653,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 9793
+    },
+    {
+      "epoch": 0.2702380172244315,
+      "grad_norm": 0.0025645066052675247,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 9794
+    },
+    {
+      "epoch": 0.2702656094254959,
+      "grad_norm": 0.004800410475581884,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 9795
+    },
+    {
+      "epoch": 0.27029320162656023,
+      "grad_norm": 0.002543856855481863,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 9796
+    },
+    {
+      "epoch": 0.27032079382762464,
+      "grad_norm": 0.0024924466852098703,
+      "learning_rate": 0.001,
+      "loss": 0.4508,
+      "step": 9797
+    },
+    {
+      "epoch": 0.270348386028689,
+      "grad_norm": 0.017285561189055443,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 9798
+    },
+    {
+      "epoch": 0.27037597822975334,
+      "grad_norm": 0.0034071647096425295,
+      "learning_rate": 0.001,
+      "loss": 0.4031,
+      "step": 9799
+    },
+    {
+      "epoch": 0.27040357043081775,
+      "grad_norm": 0.0026173749938607216,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 9800
+    },
+    {
+      "epoch": 0.2704311626318821,
+      "grad_norm": 0.0027858337853103876,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 9801
+    },
+    {
+      "epoch": 0.27045875483294646,
+      "grad_norm": 0.003459551138803363,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 9802
+    },
+    {
+      "epoch": 0.2704863470340108,
+      "grad_norm": 0.0028464416973292828,
+      "learning_rate": 0.001,
+      "loss": 0.3334,
+      "step": 9803
+    },
+    {
+      "epoch": 0.2705139392350752,
+      "grad_norm": 0.0025196147616952658,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 9804
+    },
+    {
+      "epoch": 0.27054153143613957,
+      "grad_norm": 0.002594658173620701,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 9805
+    },
+    {
+      "epoch": 0.2705691236372039,
+      "grad_norm": 0.0055243829265236855,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 9806
+    },
+    {
+      "epoch": 0.27059671583826833,
+      "grad_norm": 0.003879512194544077,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 9807
+    },
+    {
+      "epoch": 0.2706243080393327,
+      "grad_norm": 0.0029453851748257875,
+      "learning_rate": 0.001,
+      "loss": 0.3562,
+      "step": 9808
+    },
+    {
+      "epoch": 0.27065190024039704,
+      "grad_norm": 0.004673081450164318,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 9809
+    },
+    {
+      "epoch": 0.27067949244146144,
+      "grad_norm": 0.002838741522282362,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 9810
+    },
+    {
+      "epoch": 0.2707070846425258,
+      "grad_norm": 0.002747466554865241,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 9811
+    },
+    {
+      "epoch": 0.27073467684359015,
+      "grad_norm": 0.003146926872432232,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 9812
+    },
+    {
+      "epoch": 0.2707622690446545,
+      "grad_norm": 0.007232631091028452,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 9813
+    },
+    {
+      "epoch": 0.2707898612457189,
+      "grad_norm": 0.0033664864022284746,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 9814
+    },
+    {
+      "epoch": 0.27081745344678326,
+      "grad_norm": 0.002574306447058916,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 9815
+    },
+    {
+      "epoch": 0.2708450456478476,
+      "grad_norm": 0.0030514101963490248,
+      "learning_rate": 0.001,
+      "loss": 0.3745,
+      "step": 9816
+    },
+    {
+      "epoch": 0.270872637848912,
+      "grad_norm": 0.0026982761919498444,
+      "learning_rate": 0.001,
+      "loss": 0.4389,
+      "step": 9817
+    },
+    {
+      "epoch": 0.2709002300499764,
+      "grad_norm": 0.0034780215937644243,
+      "learning_rate": 0.001,
+      "loss": 0.4446,
+      "step": 9818
+    },
+    {
+      "epoch": 0.27092782225104073,
+      "grad_norm": 0.0022655725479125977,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 9819
+    },
+    {
+      "epoch": 0.27095541445210514,
+      "grad_norm": 0.0031723659485578537,
+      "learning_rate": 0.001,
+      "loss": 0.3482,
+      "step": 9820
+    },
+    {
+      "epoch": 0.2709830066531695,
+      "grad_norm": 0.004059515427798033,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 9821
+    },
+    {
+      "epoch": 0.27101059885423384,
+      "grad_norm": 0.003187219612300396,
+      "learning_rate": 0.001,
+      "loss": 0.3748,
+      "step": 9822
+    },
+    {
+      "epoch": 0.2710381910552982,
+      "grad_norm": 0.002115658251568675,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 9823
+    },
+    {
+      "epoch": 0.2710657832563626,
+      "grad_norm": 0.004135797265917063,
+      "learning_rate": 0.001,
+      "loss": 0.4439,
+      "step": 9824
+    },
+    {
+      "epoch": 0.27109337545742695,
+      "grad_norm": 0.0046222819946706295,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 9825
+    },
+    {
+      "epoch": 0.2711209676584913,
+      "grad_norm": 0.00371656590141356,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 9826
+    },
+    {
+      "epoch": 0.2711485598595557,
+      "grad_norm": 0.0033854972571134567,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 9827
+    },
+    {
+      "epoch": 0.27117615206062007,
+      "grad_norm": 0.0034048438537865877,
+      "learning_rate": 0.001,
+      "loss": 0.3669,
+      "step": 9828
+    },
+    {
+      "epoch": 0.2712037442616844,
+      "grad_norm": 0.005359988659620285,
+      "learning_rate": 0.001,
+      "loss": 0.3695,
+      "step": 9829
+    },
+    {
+      "epoch": 0.27123133646274883,
+      "grad_norm": 0.0037438932340592146,
+      "learning_rate": 0.001,
+      "loss": 0.3743,
+      "step": 9830
+    },
+    {
+      "epoch": 0.2712589286638132,
+      "grad_norm": 0.002518385648727417,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 9831
+    },
+    {
+      "epoch": 0.27128652086487753,
+      "grad_norm": 0.0029225589241832495,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 9832
+    },
+    {
+      "epoch": 0.2713141130659419,
+      "grad_norm": 0.003086130367591977,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 9833
+    },
+    {
+      "epoch": 0.2713417052670063,
+      "grad_norm": 0.005153126548975706,
+      "learning_rate": 0.001,
+      "loss": 0.37,
+      "step": 9834
+    },
+    {
+      "epoch": 0.27136929746807065,
+      "grad_norm": 0.0035248089116066694,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 9835
+    },
+    {
+      "epoch": 0.271396889669135,
+      "grad_norm": 0.006128870882093906,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 9836
+    },
+    {
+      "epoch": 0.2714244818701994,
+      "grad_norm": 0.003302632598206401,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 9837
+    },
+    {
+      "epoch": 0.27145207407126376,
+      "grad_norm": 0.004240743815898895,
+      "learning_rate": 0.001,
+      "loss": 0.4279,
+      "step": 9838
+    },
+    {
+      "epoch": 0.2714796662723281,
+      "grad_norm": 0.014292960986495018,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 9839
+    },
+    {
+      "epoch": 0.2715072584733925,
+      "grad_norm": 0.002007813658565283,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 9840
+    },
+    {
+      "epoch": 0.2715348506744569,
+      "grad_norm": 0.004010828211903572,
+      "learning_rate": 0.001,
+      "loss": 0.3683,
+      "step": 9841
+    },
+    {
+      "epoch": 0.2715624428755212,
+      "grad_norm": 0.00426107831299305,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 9842
+    },
+    {
+      "epoch": 0.2715900350765856,
+      "grad_norm": 0.0019332681549713016,
+      "learning_rate": 0.001,
+      "loss": 0.4438,
+      "step": 9843
+    },
+    {
+      "epoch": 0.27161762727765,
+      "grad_norm": 0.0026610195636749268,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 9844
+    },
+    {
+      "epoch": 0.27164521947871434,
+      "grad_norm": 0.0026648433413356543,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 9845
+    },
+    {
+      "epoch": 0.2716728116797787,
+      "grad_norm": 0.0038692099042236805,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 9846
+    },
+    {
+      "epoch": 0.2717004038808431,
+      "grad_norm": 0.0028008136432617903,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 9847
+    },
+    {
+      "epoch": 0.27172799608190745,
+      "grad_norm": 0.0038147938903421164,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 9848
+    },
+    {
+      "epoch": 0.2717555882829718,
+      "grad_norm": 0.0787762999534607,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 9849
+    },
+    {
+      "epoch": 0.2717831804840362,
+      "grad_norm": 0.004193917848169804,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 9850
+    },
+    {
+      "epoch": 0.27181077268510057,
+      "grad_norm": 0.0029792841523885727,
+      "learning_rate": 0.001,
+      "loss": 0.3763,
+      "step": 9851
+    },
+    {
+      "epoch": 0.2718383648861649,
+      "grad_norm": 0.002425075275823474,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 9852
+    },
+    {
+      "epoch": 0.27186595708722927,
+      "grad_norm": 0.0051060812547802925,
+      "learning_rate": 0.001,
+      "loss": 0.3555,
+      "step": 9853
+    },
+    {
+      "epoch": 0.2718935492882937,
+      "grad_norm": 0.024291977286338806,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 9854
+    },
+    {
+      "epoch": 0.27192114148935803,
+      "grad_norm": 0.005702044349163771,
+      "learning_rate": 0.001,
+      "loss": 0.4261,
+      "step": 9855
+    },
+    {
+      "epoch": 0.2719487336904224,
+      "grad_norm": 0.0037687926087528467,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 9856
+    },
+    {
+      "epoch": 0.2719763258914868,
+      "grad_norm": 0.007429871242493391,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 9857
+    },
+    {
+      "epoch": 0.27200391809255114,
+      "grad_norm": 0.00363955763168633,
+      "learning_rate": 0.001,
+      "loss": 0.3385,
+      "step": 9858
+    },
+    {
+      "epoch": 0.2720315102936155,
+      "grad_norm": 0.00260857748799026,
+      "learning_rate": 0.001,
+      "loss": 0.4518,
+      "step": 9859
+    },
+    {
+      "epoch": 0.2720591024946799,
+      "grad_norm": 0.002581104403361678,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 9860
+    },
+    {
+      "epoch": 0.27208669469574426,
+      "grad_norm": 0.0025525682140141726,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 9861
+    },
+    {
+      "epoch": 0.2721142868968086,
+      "grad_norm": 0.00219151028431952,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 9862
+    },
+    {
+      "epoch": 0.27214187909787296,
+      "grad_norm": 0.0028392940293997526,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 9863
+    },
+    {
+      "epoch": 0.27216947129893737,
+      "grad_norm": 0.005825422238558531,
+      "learning_rate": 0.001,
+      "loss": 0.3668,
+      "step": 9864
+    },
+    {
+      "epoch": 0.2721970635000017,
+      "grad_norm": 0.00334284920245409,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 9865
+    },
+    {
+      "epoch": 0.2722246557010661,
+      "grad_norm": 0.0023583846632391214,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 9866
+    },
+    {
+      "epoch": 0.2722522479021305,
+      "grad_norm": 0.004898847080767155,
+      "learning_rate": 0.001,
+      "loss": 0.4439,
+      "step": 9867
+    },
+    {
+      "epoch": 0.27227984010319484,
+      "grad_norm": 0.004352132324129343,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 9868
+    },
+    {
+      "epoch": 0.2723074323042592,
+      "grad_norm": 0.003136218059808016,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 9869
+    },
+    {
+      "epoch": 0.2723350245053236,
+      "grad_norm": 0.003824204672127962,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 9870
+    },
+    {
+      "epoch": 0.27236261670638795,
+      "grad_norm": 0.0033238916657865047,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 9871
+    },
+    {
+      "epoch": 0.2723902089074523,
+      "grad_norm": 0.005290450528264046,
+      "learning_rate": 0.001,
+      "loss": 0.3666,
+      "step": 9872
+    },
+    {
+      "epoch": 0.27241780110851666,
+      "grad_norm": 0.0039006490260362625,
+      "learning_rate": 0.001,
+      "loss": 0.3767,
+      "step": 9873
+    },
+    {
+      "epoch": 0.27244539330958106,
+      "grad_norm": 0.006280507426708937,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 9874
+    },
+    {
+      "epoch": 0.2724729855106454,
+      "grad_norm": 0.010091300122439861,
+      "learning_rate": 0.001,
+      "loss": 0.3789,
+      "step": 9875
+    },
+    {
+      "epoch": 0.27250057771170977,
+      "grad_norm": 0.012804842554032803,
+      "learning_rate": 0.001,
+      "loss": 0.3659,
+      "step": 9876
+    },
+    {
+      "epoch": 0.2725281699127742,
+      "grad_norm": 0.005947540979832411,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 9877
+    },
+    {
+      "epoch": 0.27255576211383853,
+      "grad_norm": 0.007248189765959978,
+      "learning_rate": 0.001,
+      "loss": 0.4269,
+      "step": 9878
+    },
+    {
+      "epoch": 0.2725833543149029,
+      "grad_norm": 0.00791088119149208,
+      "learning_rate": 0.001,
+      "loss": 0.3778,
+      "step": 9879
+    },
+    {
+      "epoch": 0.2726109465159673,
+      "grad_norm": 0.011914456263184547,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 9880
+    },
+    {
+      "epoch": 0.27263853871703164,
+      "grad_norm": 0.005593923386186361,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 9881
+    },
+    {
+      "epoch": 0.272666130918096,
+      "grad_norm": 0.0036203390918672085,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 9882
+    },
+    {
+      "epoch": 0.27269372311916035,
+      "grad_norm": 0.0030619618482887745,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 9883
+    },
+    {
+      "epoch": 0.27272131532022476,
+      "grad_norm": 0.002730242908000946,
+      "learning_rate": 0.001,
+      "loss": 0.3723,
+      "step": 9884
+    },
+    {
+      "epoch": 0.2727489075212891,
+      "grad_norm": 0.004023353569209576,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 9885
+    },
+    {
+      "epoch": 0.27277649972235346,
+      "grad_norm": 0.0054342905059456825,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 9886
+    },
+    {
+      "epoch": 0.27280409192341787,
+      "grad_norm": 0.004413217771798372,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 9887
+    },
+    {
+      "epoch": 0.2728316841244822,
+      "grad_norm": 0.0039059999398887157,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 9888
+    },
+    {
+      "epoch": 0.2728592763255466,
+      "grad_norm": 0.003089727135375142,
+      "learning_rate": 0.001,
+      "loss": 0.4385,
+      "step": 9889
+    },
+    {
+      "epoch": 0.2728868685266109,
+      "grad_norm": 0.002594609744846821,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 9890
+    },
+    {
+      "epoch": 0.27291446072767533,
+      "grad_norm": 0.004234743770211935,
+      "learning_rate": 0.001,
+      "loss": 0.3582,
+      "step": 9891
+    },
+    {
+      "epoch": 0.2729420529287397,
+      "grad_norm": 0.002928930101916194,
+      "learning_rate": 0.001,
+      "loss": 0.3743,
+      "step": 9892
+    },
+    {
+      "epoch": 0.27296964512980404,
+      "grad_norm": 0.002950299996882677,
+      "learning_rate": 0.001,
+      "loss": 0.4439,
+      "step": 9893
+    },
+    {
+      "epoch": 0.27299723733086845,
+      "grad_norm": 0.002796403132379055,
+      "learning_rate": 0.001,
+      "loss": 0.4327,
+      "step": 9894
+    },
+    {
+      "epoch": 0.2730248295319328,
+      "grad_norm": 0.0047624544240534306,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 9895
+    },
+    {
+      "epoch": 0.27305242173299715,
+      "grad_norm": 0.002439835574477911,
+      "learning_rate": 0.001,
+      "loss": 0.4364,
+      "step": 9896
+    },
+    {
+      "epoch": 0.27308001393406156,
+      "grad_norm": 0.002326715039089322,
+      "learning_rate": 0.001,
+      "loss": 0.4155,
+      "step": 9897
+    },
+    {
+      "epoch": 0.2731076061351259,
+      "grad_norm": 0.00256901397369802,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 9898
+    },
+    {
+      "epoch": 0.27313519833619027,
+      "grad_norm": 0.002579151652753353,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 9899
+    },
+    {
+      "epoch": 0.2731627905372546,
+      "grad_norm": 0.00309234787710011,
+      "learning_rate": 0.001,
+      "loss": 0.3471,
+      "step": 9900
+    },
+    {
+      "epoch": 0.273190382738319,
+      "grad_norm": 0.0036706968676298857,
+      "learning_rate": 0.001,
+      "loss": 0.3779,
+      "step": 9901
+    },
+    {
+      "epoch": 0.2732179749393834,
+      "grad_norm": 0.0024913379456847906,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 9902
+    },
+    {
+      "epoch": 0.27324556714044773,
+      "grad_norm": 0.004764418583363295,
+      "learning_rate": 0.001,
+      "loss": 0.432,
+      "step": 9903
+    },
+    {
+      "epoch": 0.27327315934151214,
+      "grad_norm": 0.0027252251747995615,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 9904
+    },
+    {
+      "epoch": 0.2733007515425765,
+      "grad_norm": 0.0033297077752649784,
+      "learning_rate": 0.001,
+      "loss": 0.4315,
+      "step": 9905
+    },
+    {
+      "epoch": 0.27332834374364084,
+      "grad_norm": 0.0033139032311737537,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 9906
+    },
+    {
+      "epoch": 0.27335593594470525,
+      "grad_norm": 0.004935343284159899,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 9907
+    },
+    {
+      "epoch": 0.2733835281457696,
+      "grad_norm": 0.0046791452914476395,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 9908
+    },
+    {
+      "epoch": 0.27341112034683396,
+      "grad_norm": 0.004267543088644743,
+      "learning_rate": 0.001,
+      "loss": 0.4563,
+      "step": 9909
+    },
+    {
+      "epoch": 0.2734387125478983,
+      "grad_norm": 0.002623759675770998,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 9910
+    },
+    {
+      "epoch": 0.2734663047489627,
+      "grad_norm": 0.002590791555121541,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 9911
+    },
+    {
+      "epoch": 0.27349389695002707,
+      "grad_norm": 0.002841070294380188,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 9912
+    },
+    {
+      "epoch": 0.2735214891510914,
+      "grad_norm": 0.009606373496353626,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 9913
+    },
+    {
+      "epoch": 0.27354908135215583,
+      "grad_norm": 0.0030742899980396032,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 9914
+    },
+    {
+      "epoch": 0.2735766735532202,
+      "grad_norm": 0.00670670298859477,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 9915
+    },
+    {
+      "epoch": 0.27360426575428454,
+      "grad_norm": 0.002301518339663744,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 9916
+    },
+    {
+      "epoch": 0.27363185795534894,
+      "grad_norm": 0.00346144987270236,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 9917
+    },
+    {
+      "epoch": 0.2736594501564133,
+      "grad_norm": 0.0030207394156605005,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 9918
+    },
+    {
+      "epoch": 0.27368704235747765,
+      "grad_norm": 0.005349605809897184,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 9919
+    },
+    {
+      "epoch": 0.273714634558542,
+      "grad_norm": 0.004348041955381632,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 9920
+    },
+    {
+      "epoch": 0.2737422267596064,
+      "grad_norm": 0.0030986126512289047,
+      "learning_rate": 0.001,
+      "loss": 0.4256,
+      "step": 9921
+    },
+    {
+      "epoch": 0.27376981896067076,
+      "grad_norm": 0.0026239063590765,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 9922
+    },
+    {
+      "epoch": 0.2737974111617351,
+      "grad_norm": 0.0023042093962430954,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 9923
+    },
+    {
+      "epoch": 0.2738250033627995,
+      "grad_norm": 0.007671200204640627,
+      "learning_rate": 0.001,
+      "loss": 0.3721,
+      "step": 9924
+    },
+    {
+      "epoch": 0.2738525955638639,
+      "grad_norm": 0.00476741511374712,
+      "learning_rate": 0.001,
+      "loss": 0.367,
+      "step": 9925
+    },
+    {
+      "epoch": 0.27388018776492823,
+      "grad_norm": 0.004138735588639975,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 9926
+    },
+    {
+      "epoch": 0.27390777996599264,
+      "grad_norm": 0.007656489033252001,
+      "learning_rate": 0.001,
+      "loss": 0.3586,
+      "step": 9927
+    },
+    {
+      "epoch": 0.273935372167057,
+      "grad_norm": 0.003876802045851946,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 9928
+    },
+    {
+      "epoch": 0.27396296436812134,
+      "grad_norm": 0.010240191593766212,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 9929
+    },
+    {
+      "epoch": 0.2739905565691857,
+      "grad_norm": 0.005146909970790148,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 9930
+    },
+    {
+      "epoch": 0.2740181487702501,
+      "grad_norm": 0.00523938424885273,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 9931
+    },
+    {
+      "epoch": 0.27404574097131446,
+      "grad_norm": 0.00330050359480083,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 9932
+    },
+    {
+      "epoch": 0.2740733331723788,
+      "grad_norm": 0.00420812563970685,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 9933
+    },
+    {
+      "epoch": 0.2741009253734432,
+      "grad_norm": 0.00954358745366335,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 9934
+    },
+    {
+      "epoch": 0.27412851757450757,
+      "grad_norm": 0.005676385015249252,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 9935
+    },
+    {
+      "epoch": 0.2741561097755719,
+      "grad_norm": 0.00360414432361722,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 9936
+    },
+    {
+      "epoch": 0.27418370197663633,
+      "grad_norm": 0.002916174242272973,
+      "learning_rate": 0.001,
+      "loss": 0.4373,
+      "step": 9937
+    },
+    {
+      "epoch": 0.2742112941777007,
+      "grad_norm": 0.0041490960866212845,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 9938
+    },
+    {
+      "epoch": 0.27423888637876503,
+      "grad_norm": 0.03103417344391346,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 9939
+    },
+    {
+      "epoch": 0.2742664785798294,
+      "grad_norm": 0.00923039298504591,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 9940
+    },
+    {
+      "epoch": 0.2742940707808938,
+      "grad_norm": 0.0029965273570269346,
+      "learning_rate": 0.001,
+      "loss": 0.439,
+      "step": 9941
+    },
+    {
+      "epoch": 0.27432166298195815,
+      "grad_norm": 0.003088477300480008,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 9942
+    },
+    {
+      "epoch": 0.2743492551830225,
+      "grad_norm": 0.0032554971985518932,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 9943
+    },
+    {
+      "epoch": 0.2743768473840869,
+      "grad_norm": 0.004142871592193842,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 9944
+    },
+    {
+      "epoch": 0.27440443958515126,
+      "grad_norm": 0.003104181494563818,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 9945
+    },
+    {
+      "epoch": 0.2744320317862156,
+      "grad_norm": 0.0025867957156151533,
+      "learning_rate": 0.001,
+      "loss": 0.4087,
+      "step": 9946
+    },
+    {
+      "epoch": 0.27445962398728,
+      "grad_norm": 0.002821350237354636,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 9947
+    },
+    {
+      "epoch": 0.2744872161883444,
+      "grad_norm": 0.003803239669650793,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 9948
+    },
+    {
+      "epoch": 0.2745148083894087,
+      "grad_norm": 0.002869410440325737,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 9949
+    },
+    {
+      "epoch": 0.2745424005904731,
+      "grad_norm": 0.004910988733172417,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 9950
+    },
+    {
+      "epoch": 0.2745699927915375,
+      "grad_norm": 0.00325010740198195,
+      "learning_rate": 0.001,
+      "loss": 0.4318,
+      "step": 9951
+    },
+    {
+      "epoch": 0.27459758499260184,
+      "grad_norm": 0.0021431315690279007,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 9952
+    },
+    {
+      "epoch": 0.2746251771936662,
+      "grad_norm": 0.003677117172628641,
+      "learning_rate": 0.001,
+      "loss": 0.4193,
+      "step": 9953
+    },
+    {
+      "epoch": 0.2746527693947306,
+      "grad_norm": 0.002277594292536378,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 9954
+    },
+    {
+      "epoch": 0.27468036159579495,
+      "grad_norm": 0.004701568745076656,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 9955
+    },
+    {
+      "epoch": 0.2747079537968593,
+      "grad_norm": 0.002484973520040512,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 9956
+    },
+    {
+      "epoch": 0.2747355459979237,
+      "grad_norm": 0.0033851531334221363,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 9957
+    },
+    {
+      "epoch": 0.27476313819898807,
+      "grad_norm": 0.0025149055290967226,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 9958
+    },
+    {
+      "epoch": 0.2747907304000524,
+      "grad_norm": 0.0030154273845255375,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 9959
+    },
+    {
+      "epoch": 0.27481832260111677,
+      "grad_norm": 0.010692945681512356,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 9960
+    },
+    {
+      "epoch": 0.2748459148021812,
+      "grad_norm": 0.0041275848634541035,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 9961
+    },
+    {
+      "epoch": 0.27487350700324553,
+      "grad_norm": 0.0029004793614149094,
+      "learning_rate": 0.001,
+      "loss": 0.4236,
+      "step": 9962
+    },
+    {
+      "epoch": 0.2749010992043099,
+      "grad_norm": 0.009717054665088654,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 9963
+    },
+    {
+      "epoch": 0.2749286914053743,
+      "grad_norm": 0.00394978653639555,
+      "learning_rate": 0.001,
+      "loss": 0.3602,
+      "step": 9964
+    },
+    {
+      "epoch": 0.27495628360643865,
+      "grad_norm": 0.0029732300899922848,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 9965
+    },
+    {
+      "epoch": 0.274983875807503,
+      "grad_norm": 0.002955190371721983,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 9966
+    },
+    {
+      "epoch": 0.2750114680085674,
+      "grad_norm": 0.0038855448365211487,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 9967
+    },
+    {
+      "epoch": 0.27503906020963176,
+      "grad_norm": 0.002460125368088484,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 9968
+    },
+    {
+      "epoch": 0.2750666524106961,
+      "grad_norm": 0.002936316654086113,
+      "learning_rate": 0.001,
+      "loss": 0.3468,
+      "step": 9969
+    },
+    {
+      "epoch": 0.27509424461176046,
+      "grad_norm": 0.003299242816865444,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 9970
+    },
+    {
+      "epoch": 0.27512183681282487,
+      "grad_norm": 0.0037559715565294027,
+      "learning_rate": 0.001,
+      "loss": 0.362,
+      "step": 9971
+    },
+    {
+      "epoch": 0.2751494290138892,
+      "grad_norm": 0.0035624271258711815,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 9972
+    },
+    {
+      "epoch": 0.2751770212149536,
+      "grad_norm": 0.00262457481585443,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 9973
+    },
+    {
+      "epoch": 0.275204613416018,
+      "grad_norm": 0.0050546471029520035,
+      "learning_rate": 0.001,
+      "loss": 0.3629,
+      "step": 9974
+    },
+    {
+      "epoch": 0.27523220561708234,
+      "grad_norm": 0.004298574756830931,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 9975
+    },
+    {
+      "epoch": 0.2752597978181467,
+      "grad_norm": 0.0042538209818303585,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 9976
+    },
+    {
+      "epoch": 0.27528739001921104,
+      "grad_norm": 0.003947499208152294,
+      "learning_rate": 0.001,
+      "loss": 0.3789,
+      "step": 9977
+    },
+    {
+      "epoch": 0.27531498222027545,
+      "grad_norm": 0.0038300196174532175,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 9978
+    },
+    {
+      "epoch": 0.2753425744213398,
+      "grad_norm": 0.005689695011824369,
+      "learning_rate": 0.001,
+      "loss": 0.3856,
+      "step": 9979
+    },
+    {
+      "epoch": 0.27537016662240416,
+      "grad_norm": 0.0022208907175809145,
+      "learning_rate": 0.001,
+      "loss": 0.3766,
+      "step": 9980
+    },
+    {
+      "epoch": 0.27539775882346856,
+      "grad_norm": 0.0022134091705083847,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 9981
+    },
+    {
+      "epoch": 0.2754253510245329,
+      "grad_norm": 0.0045527201145887375,
+      "learning_rate": 0.001,
+      "loss": 0.3707,
+      "step": 9982
+    },
+    {
+      "epoch": 0.27545294322559727,
+      "grad_norm": 0.004349110182374716,
+      "learning_rate": 0.001,
+      "loss": 0.4218,
+      "step": 9983
+    },
+    {
+      "epoch": 0.2754805354266617,
+      "grad_norm": 0.0028283181600272655,
+      "learning_rate": 0.001,
+      "loss": 0.4415,
+      "step": 9984
+    },
+    {
+      "epoch": 0.27550812762772603,
+      "grad_norm": 0.0024474281817674637,
+      "learning_rate": 0.001,
+      "loss": 0.368,
+      "step": 9985
+    },
+    {
+      "epoch": 0.2755357198287904,
+      "grad_norm": 0.0033072589430958033,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 9986
+    },
+    {
+      "epoch": 0.27556331202985473,
+      "grad_norm": 0.002837040927261114,
+      "learning_rate": 0.001,
+      "loss": 0.369,
+      "step": 9987
+    },
+    {
+      "epoch": 0.27559090423091914,
+      "grad_norm": 0.014132903888821602,
+      "learning_rate": 0.001,
+      "loss": 0.3712,
+      "step": 9988
+    },
+    {
+      "epoch": 0.2756184964319835,
+      "grad_norm": 0.002668624045327306,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 9989
+    },
+    {
+      "epoch": 0.27564608863304785,
+      "grad_norm": 0.003035599598661065,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 9990
+    },
+    {
+      "epoch": 0.27567368083411226,
+      "grad_norm": 0.0033727851696312428,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 9991
+    },
+    {
+      "epoch": 0.2757012730351766,
+      "grad_norm": 0.0022625040728598833,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 9992
+    },
+    {
+      "epoch": 0.27572886523624096,
+      "grad_norm": 0.004006414674222469,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 9993
+    },
+    {
+      "epoch": 0.27575645743730537,
+      "grad_norm": 0.002454617992043495,
+      "learning_rate": 0.001,
+      "loss": 0.424,
+      "step": 9994
+    },
+    {
+      "epoch": 0.2757840496383697,
+      "grad_norm": 0.003350407350808382,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 9995
+    },
+    {
+      "epoch": 0.2758116418394341,
+      "grad_norm": 0.004660541657358408,
+      "learning_rate": 0.001,
+      "loss": 0.3739,
+      "step": 9996
+    },
+    {
+      "epoch": 0.2758392340404984,
+      "grad_norm": 0.002644259948283434,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 9997
+    },
+    {
+      "epoch": 0.27586682624156283,
+      "grad_norm": 0.004398273769766092,
+      "learning_rate": 0.001,
+      "loss": 0.3655,
+      "step": 9998
+    },
+    {
+      "epoch": 0.2758944184426272,
+      "grad_norm": 0.013148190453648567,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 9999
+    },
+    {
+      "epoch": 0.27592201064369154,
+      "grad_norm": 0.002878269413486123,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 10000
+    },
+    {
+      "epoch": 0.27592201064369154,
+      "eval_runtime": 24.4086,
+      "eval_samples_per_second": 1.311,
+      "eval_steps_per_second": 0.164,
+      "step": 10000
+    },
+    {
+      "epoch": 0.27594960284475595,
+      "grad_norm": 0.0022266616579145193,
+      "learning_rate": 0.001,
+      "loss": 0.4605,
+      "step": 10001
+    },
+    {
+      "epoch": 0.2759771950458203,
+      "grad_norm": 0.0019374669063836336,
+      "learning_rate": 0.001,
+      "loss": 0.4417,
+      "step": 10002
+    },
+    {
+      "epoch": 0.27600478724688465,
+      "grad_norm": 0.0026133570354431868,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 10003
+    },
+    {
+      "epoch": 0.27603237944794906,
+      "grad_norm": 0.003653626423329115,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 10004
+    },
+    {
+      "epoch": 0.2760599716490134,
+      "grad_norm": 0.002634751843288541,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 10005
+    },
+    {
+      "epoch": 0.27608756385007777,
+      "grad_norm": 0.012561113573610783,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 10006
+    },
+    {
+      "epoch": 0.2761151560511421,
+      "grad_norm": 0.01040496677160263,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 10007
+    },
+    {
+      "epoch": 0.2761427482522065,
+      "grad_norm": 0.005907583516091108,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 10008
+    },
+    {
+      "epoch": 0.2761703404532709,
+      "grad_norm": 0.0033373169135302305,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 10009
+    },
+    {
+      "epoch": 0.27619793265433523,
+      "grad_norm": 0.003724615555256605,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 10010
+    },
+    {
+      "epoch": 0.27622552485539964,
+      "grad_norm": 0.0035398202016949654,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 10011
+    },
+    {
+      "epoch": 0.276253117056464,
+      "grad_norm": 0.003467197297140956,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 10012
+    },
+    {
+      "epoch": 0.27628070925752835,
+      "grad_norm": 0.004464291967451572,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 10013
+    },
+    {
+      "epoch": 0.27630830145859275,
+      "grad_norm": 0.0036106444895267487,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 10014
+    },
+    {
+      "epoch": 0.2763358936596571,
+      "grad_norm": 0.004417051561176777,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 10015
+    },
+    {
+      "epoch": 0.27636348586072146,
+      "grad_norm": 0.0054782601073384285,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 10016
+    },
+    {
+      "epoch": 0.2763910780617858,
+      "grad_norm": 0.004640686791390181,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 10017
+    },
+    {
+      "epoch": 0.2764186702628502,
+      "grad_norm": 0.005945026874542236,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 10018
+    },
+    {
+      "epoch": 0.27644626246391457,
+      "grad_norm": 0.002607600996270776,
+      "learning_rate": 0.001,
+      "loss": 0.3993,
+      "step": 10019
+    },
+    {
+      "epoch": 0.2764738546649789,
+      "grad_norm": 0.0032611433416604996,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 10020
+    },
+    {
+      "epoch": 0.27650144686604333,
+      "grad_norm": 0.0037529822438955307,
+      "learning_rate": 0.001,
+      "loss": 0.372,
+      "step": 10021
+    },
+    {
+      "epoch": 0.2765290390671077,
+      "grad_norm": 0.002805948257446289,
+      "learning_rate": 0.001,
+      "loss": 0.4327,
+      "step": 10022
+    },
+    {
+      "epoch": 0.27655663126817204,
+      "grad_norm": 0.003274905029684305,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 10023
+    },
+    {
+      "epoch": 0.27658422346923645,
+      "grad_norm": 0.00208392646163702,
+      "learning_rate": 0.001,
+      "loss": 0.4326,
+      "step": 10024
+    },
+    {
+      "epoch": 0.2766118156703008,
+      "grad_norm": 0.007121518719941378,
+      "learning_rate": 0.001,
+      "loss": 0.3495,
+      "step": 10025
+    },
+    {
+      "epoch": 0.27663940787136515,
+      "grad_norm": 0.0040301973931491375,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 10026
+    },
+    {
+      "epoch": 0.2766670000724295,
+      "grad_norm": 0.0036494045052677393,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 10027
+    },
+    {
+      "epoch": 0.2766945922734939,
+      "grad_norm": 0.0036077816039323807,
+      "learning_rate": 0.001,
+      "loss": 0.3922,
+      "step": 10028
+    },
+    {
+      "epoch": 0.27672218447455826,
+      "grad_norm": 0.010453345254063606,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 10029
+    },
+    {
+      "epoch": 0.2767497766756226,
+      "grad_norm": 0.004115840420126915,
+      "learning_rate": 0.001,
+      "loss": 0.4289,
+      "step": 10030
+    },
+    {
+      "epoch": 0.276777368876687,
+      "grad_norm": 0.0033005299046635628,
+      "learning_rate": 0.001,
+      "loss": 0.4526,
+      "step": 10031
+    },
+    {
+      "epoch": 0.2768049610777514,
+      "grad_norm": 0.010283264331519604,
+      "learning_rate": 0.001,
+      "loss": 0.3695,
+      "step": 10032
+    },
+    {
+      "epoch": 0.27683255327881573,
+      "grad_norm": 0.0024423354770988226,
+      "learning_rate": 0.001,
+      "loss": 0.4325,
+      "step": 10033
+    },
+    {
+      "epoch": 0.27686014547988014,
+      "grad_norm": 0.002194769913330674,
+      "learning_rate": 0.001,
+      "loss": 0.3853,
+      "step": 10034
+    },
+    {
+      "epoch": 0.2768877376809445,
+      "grad_norm": 0.00277558621019125,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 10035
+    },
+    {
+      "epoch": 0.27691532988200884,
+      "grad_norm": 0.0023043700493872166,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 10036
+    },
+    {
+      "epoch": 0.2769429220830732,
+      "grad_norm": 0.0019729414489120245,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 10037
+    },
+    {
+      "epoch": 0.2769705142841376,
+      "grad_norm": 0.0028671245090663433,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 10038
+    },
+    {
+      "epoch": 0.27699810648520196,
+      "grad_norm": 0.0039373342879116535,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 10039
+    },
+    {
+      "epoch": 0.2770256986862663,
+      "grad_norm": 0.009182474575936794,
+      "learning_rate": 0.001,
+      "loss": 0.3806,
+      "step": 10040
+    },
+    {
+      "epoch": 0.2770532908873307,
+      "grad_norm": 0.0029172920621931553,
+      "learning_rate": 0.001,
+      "loss": 0.3708,
+      "step": 10041
+    },
+    {
+      "epoch": 0.27708088308839507,
+      "grad_norm": 0.0023737072478979826,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 10042
+    },
+    {
+      "epoch": 0.2771084752894594,
+      "grad_norm": 0.002812950173392892,
+      "learning_rate": 0.001,
+      "loss": 0.3643,
+      "step": 10043
+    },
+    {
+      "epoch": 0.27713606749052383,
+      "grad_norm": 0.002222485141828656,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 10044
+    },
+    {
+      "epoch": 0.2771636596915882,
+      "grad_norm": 0.0037747088354080915,
+      "learning_rate": 0.001,
+      "loss": 0.4252,
+      "step": 10045
+    },
+    {
+      "epoch": 0.27719125189265253,
+      "grad_norm": 0.00997960101813078,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 10046
+    },
+    {
+      "epoch": 0.2772188440937169,
+      "grad_norm": 0.0026916342321783304,
+      "learning_rate": 0.001,
+      "loss": 0.4323,
+      "step": 10047
+    },
+    {
+      "epoch": 0.2772464362947813,
+      "grad_norm": 0.002582460641860962,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 10048
+    },
+    {
+      "epoch": 0.27727402849584565,
+      "grad_norm": 0.0027102259919047356,
+      "learning_rate": 0.001,
+      "loss": 0.4339,
+      "step": 10049
+    },
+    {
+      "epoch": 0.27730162069691,
+      "grad_norm": 0.0034718411043286324,
+      "learning_rate": 0.001,
+      "loss": 0.4187,
+      "step": 10050
+    },
+    {
+      "epoch": 0.2773292128979744,
+      "grad_norm": 0.0027413431089371443,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 10051
+    },
+    {
+      "epoch": 0.27735680509903876,
+      "grad_norm": 0.003133729798719287,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 10052
+    },
+    {
+      "epoch": 0.2773843973001031,
+      "grad_norm": 0.002180175157263875,
+      "learning_rate": 0.001,
+      "loss": 0.4286,
+      "step": 10053
+    },
+    {
+      "epoch": 0.2774119895011675,
+      "grad_norm": 0.0031354373786598444,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 10054
+    },
+    {
+      "epoch": 0.2774395817022319,
+      "grad_norm": 0.0018822277197614312,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 10055
+    },
+    {
+      "epoch": 0.2774671739032962,
+      "grad_norm": 0.013822532258927822,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 10056
+    },
+    {
+      "epoch": 0.2774947661043606,
+      "grad_norm": 0.0037403854075819254,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 10057
+    },
+    {
+      "epoch": 0.277522358305425,
+      "grad_norm": 0.005285004619508982,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 10058
+    },
+    {
+      "epoch": 0.27754995050648934,
+      "grad_norm": 0.0025517537724226713,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 10059
+    },
+    {
+      "epoch": 0.2775775427075537,
+      "grad_norm": 0.005149201024323702,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 10060
+    },
+    {
+      "epoch": 0.2776051349086181,
+      "grad_norm": 0.012182818725705147,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 10061
+    },
+    {
+      "epoch": 0.27763272710968245,
+      "grad_norm": 0.003710835939273238,
+      "learning_rate": 0.001,
+      "loss": 0.3726,
+      "step": 10062
+    },
+    {
+      "epoch": 0.2776603193107468,
+      "grad_norm": 0.005408014170825481,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 10063
+    },
+    {
+      "epoch": 0.2776879115118112,
+      "grad_norm": 0.0025339778512716293,
+      "learning_rate": 0.001,
+      "loss": 0.3499,
+      "step": 10064
+    },
+    {
+      "epoch": 0.27771550371287557,
+      "grad_norm": 0.0028167536947876215,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 10065
+    },
+    {
+      "epoch": 0.2777430959139399,
+      "grad_norm": 0.0029440028592944145,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 10066
+    },
+    {
+      "epoch": 0.27777068811500427,
+      "grad_norm": 0.0024950318038463593,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 10067
+    },
+    {
+      "epoch": 0.2777982803160687,
+      "grad_norm": 0.002304122317582369,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 10068
+    },
+    {
+      "epoch": 0.27782587251713303,
+      "grad_norm": 0.003218509955331683,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 10069
+    },
+    {
+      "epoch": 0.2778534647181974,
+      "grad_norm": 0.003081993665546179,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 10070
+    },
+    {
+      "epoch": 0.2778810569192618,
+      "grad_norm": 0.003274043556302786,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 10071
+    },
+    {
+      "epoch": 0.27790864912032615,
+      "grad_norm": 0.011977877467870712,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 10072
+    },
+    {
+      "epoch": 0.2779362413213905,
+      "grad_norm": 0.004248856566846371,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 10073
+    },
+    {
+      "epoch": 0.27796383352245485,
+      "grad_norm": 0.0030563895124942064,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 10074
+    },
+    {
+      "epoch": 0.27799142572351926,
+      "grad_norm": 0.0027819809038192034,
+      "learning_rate": 0.001,
+      "loss": 0.3906,
+      "step": 10075
+    },
+    {
+      "epoch": 0.2780190179245836,
+      "grad_norm": 0.003203164553269744,
+      "learning_rate": 0.001,
+      "loss": 0.4168,
+      "step": 10076
+    },
+    {
+      "epoch": 0.27804661012564796,
+      "grad_norm": 0.003680006368085742,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 10077
+    },
+    {
+      "epoch": 0.27807420232671237,
+      "grad_norm": 0.0054307919926941395,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 10078
+    },
+    {
+      "epoch": 0.2781017945277767,
+      "grad_norm": 0.00977605115622282,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 10079
+    },
+    {
+      "epoch": 0.2781293867288411,
+      "grad_norm": 0.003044104902073741,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 10080
+    },
+    {
+      "epoch": 0.2781569789299055,
+      "grad_norm": 0.009494941681623459,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 10081
+    },
+    {
+      "epoch": 0.27818457113096984,
+      "grad_norm": 0.04275614395737648,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 10082
+    },
+    {
+      "epoch": 0.2782121633320342,
+      "grad_norm": 0.0026185614988207817,
+      "learning_rate": 0.001,
+      "loss": 0.4145,
+      "step": 10083
+    },
+    {
+      "epoch": 0.27823975553309854,
+      "grad_norm": 0.003999069333076477,
+      "learning_rate": 0.001,
+      "loss": 0.4262,
+      "step": 10084
+    },
+    {
+      "epoch": 0.27826734773416295,
+      "grad_norm": 0.00245444243773818,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 10085
+    },
+    {
+      "epoch": 0.2782949399352273,
+      "grad_norm": 0.00522113312035799,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 10086
+    },
+    {
+      "epoch": 0.27832253213629166,
+      "grad_norm": 0.0037140315398573875,
+      "learning_rate": 0.001,
+      "loss": 0.3612,
+      "step": 10087
+    },
+    {
+      "epoch": 0.27835012433735606,
+      "grad_norm": 0.0034693137276917696,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 10088
+    },
+    {
+      "epoch": 0.2783777165384204,
+      "grad_norm": 0.00494216475635767,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 10089
+    },
+    {
+      "epoch": 0.27840530873948477,
+      "grad_norm": 0.0024820046965032816,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 10090
+    },
+    {
+      "epoch": 0.2784329009405492,
+      "grad_norm": 0.0024154249113053083,
+      "learning_rate": 0.001,
+      "loss": 0.3744,
+      "step": 10091
+    },
+    {
+      "epoch": 0.27846049314161353,
+      "grad_norm": 0.0025213605258613825,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 10092
+    },
+    {
+      "epoch": 0.2784880853426779,
+      "grad_norm": 0.003560342825949192,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 10093
+    },
+    {
+      "epoch": 0.27851567754374223,
+      "grad_norm": 0.0023579909466207027,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 10094
+    },
+    {
+      "epoch": 0.27854326974480664,
+      "grad_norm": 0.0032963070552796125,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 10095
+    },
+    {
+      "epoch": 0.278570861945871,
+      "grad_norm": 0.0023323865607380867,
+      "learning_rate": 0.001,
+      "loss": 0.4111,
+      "step": 10096
+    },
+    {
+      "epoch": 0.27859845414693535,
+      "grad_norm": 0.0035246131010353565,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 10097
+    },
+    {
+      "epoch": 0.27862604634799976,
+      "grad_norm": 0.004129378125071526,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 10098
+    },
+    {
+      "epoch": 0.2786536385490641,
+      "grad_norm": 0.0030813373159617186,
+      "learning_rate": 0.001,
+      "loss": 0.3724,
+      "step": 10099
+    },
+    {
+      "epoch": 0.27868123075012846,
+      "grad_norm": 0.002692109439522028,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 10100
+    },
+    {
+      "epoch": 0.27870882295119287,
+      "grad_norm": 0.0024737734347581863,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 10101
+    },
+    {
+      "epoch": 0.2787364151522572,
+      "grad_norm": 0.0027061791624873877,
+      "learning_rate": 0.001,
+      "loss": 0.3762,
+      "step": 10102
+    },
+    {
+      "epoch": 0.2787640073533216,
+      "grad_norm": 0.002315426943823695,
+      "learning_rate": 0.001,
+      "loss": 0.3813,
+      "step": 10103
+    },
+    {
+      "epoch": 0.2787915995543859,
+      "grad_norm": 0.005130481906235218,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 10104
+    },
+    {
+      "epoch": 0.27881919175545034,
+      "grad_norm": 0.006558484397828579,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 10105
+    },
+    {
+      "epoch": 0.2788467839565147,
+      "grad_norm": 0.004167409613728523,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 10106
+    },
+    {
+      "epoch": 0.27887437615757904,
+      "grad_norm": 0.0057708099484443665,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 10107
+    },
+    {
+      "epoch": 0.27890196835864345,
+      "grad_norm": 0.0028495537117123604,
+      "learning_rate": 0.001,
+      "loss": 0.3616,
+      "step": 10108
+    },
+    {
+      "epoch": 0.2789295605597078,
+      "grad_norm": 0.002703807782381773,
+      "learning_rate": 0.001,
+      "loss": 0.4447,
+      "step": 10109
+    },
+    {
+      "epoch": 0.27895715276077215,
+      "grad_norm": 0.0030382112599909306,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 10110
+    },
+    {
+      "epoch": 0.27898474496183656,
+      "grad_norm": 0.0030904843006283045,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 10111
+    },
+    {
+      "epoch": 0.2790123371629009,
+      "grad_norm": 0.003971985075622797,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 10112
+    },
+    {
+      "epoch": 0.27903992936396527,
+      "grad_norm": 0.002467118203639984,
+      "learning_rate": 0.001,
+      "loss": 0.4372,
+      "step": 10113
+    },
+    {
+      "epoch": 0.2790675215650296,
+      "grad_norm": 0.006953855976462364,
+      "learning_rate": 0.001,
+      "loss": 0.4285,
+      "step": 10114
+    },
+    {
+      "epoch": 0.279095113766094,
+      "grad_norm": 0.0055779386311769485,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 10115
+    },
+    {
+      "epoch": 0.2791227059671584,
+      "grad_norm": 0.004655706230551004,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 10116
+    },
+    {
+      "epoch": 0.27915029816822273,
+      "grad_norm": 0.004876443184912205,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 10117
+    },
+    {
+      "epoch": 0.27917789036928714,
+      "grad_norm": 0.004932073410600424,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 10118
+    },
+    {
+      "epoch": 0.2792054825703515,
+      "grad_norm": 0.007771969772875309,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 10119
+    },
+    {
+      "epoch": 0.27923307477141585,
+      "grad_norm": 0.005660747177898884,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 10120
+    },
+    {
+      "epoch": 0.27926066697248025,
+      "grad_norm": 0.0034438660368323326,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 10121
+    },
+    {
+      "epoch": 0.2792882591735446,
+      "grad_norm": 0.003013996873050928,
+      "learning_rate": 0.001,
+      "loss": 0.444,
+      "step": 10122
+    },
+    {
+      "epoch": 0.27931585137460896,
+      "grad_norm": 0.0033822916448116302,
+      "learning_rate": 0.001,
+      "loss": 0.3583,
+      "step": 10123
+    },
+    {
+      "epoch": 0.2793434435756733,
+      "grad_norm": 0.0029740110039711,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 10124
+    },
+    {
+      "epoch": 0.2793710357767377,
+      "grad_norm": 0.005846134852617979,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 10125
+    },
+    {
+      "epoch": 0.27939862797780207,
+      "grad_norm": 0.0038693509995937347,
+      "learning_rate": 0.001,
+      "loss": 0.4188,
+      "step": 10126
+    },
+    {
+      "epoch": 0.2794262201788664,
+      "grad_norm": 0.023018015548586845,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 10127
+    },
+    {
+      "epoch": 0.27945381237993083,
+      "grad_norm": 0.004444723483175039,
+      "learning_rate": 0.001,
+      "loss": 0.3736,
+      "step": 10128
+    },
+    {
+      "epoch": 0.2794814045809952,
+      "grad_norm": 0.00677674962207675,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 10129
+    },
+    {
+      "epoch": 0.27950899678205954,
+      "grad_norm": 0.004375552758574486,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 10130
+    },
+    {
+      "epoch": 0.27953658898312395,
+      "grad_norm": 0.003335924819111824,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 10131
+    },
+    {
+      "epoch": 0.2795641811841883,
+      "grad_norm": 0.0037330735940486193,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 10132
+    },
+    {
+      "epoch": 0.27959177338525265,
+      "grad_norm": 0.004045455250889063,
+      "learning_rate": 0.001,
+      "loss": 0.3541,
+      "step": 10133
+    },
+    {
+      "epoch": 0.279619365586317,
+      "grad_norm": 0.004944518208503723,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 10134
+    },
+    {
+      "epoch": 0.2796469577873814,
+      "grad_norm": 0.003303113393485546,
+      "learning_rate": 0.001,
+      "loss": 0.4458,
+      "step": 10135
+    },
+    {
+      "epoch": 0.27967454998844576,
+      "grad_norm": 0.002501634182408452,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 10136
+    },
+    {
+      "epoch": 0.2797021421895101,
+      "grad_norm": 0.008220963180065155,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 10137
+    },
+    {
+      "epoch": 0.2797297343905745,
+      "grad_norm": 0.015138577669858932,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 10138
+    },
+    {
+      "epoch": 0.2797573265916389,
+      "grad_norm": 0.00455261766910553,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 10139
+    },
+    {
+      "epoch": 0.27978491879270323,
+      "grad_norm": 0.003458748571574688,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 10140
+    },
+    {
+      "epoch": 0.27981251099376764,
+      "grad_norm": 0.0061928508803248405,
+      "learning_rate": 0.001,
+      "loss": 0.4298,
+      "step": 10141
+    },
+    {
+      "epoch": 0.279840103194832,
+      "grad_norm": 0.004049024078994989,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 10142
+    },
+    {
+      "epoch": 0.27986769539589634,
+      "grad_norm": 0.0035144255962222815,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 10143
+    },
+    {
+      "epoch": 0.2798952875969607,
+      "grad_norm": 0.004885702393949032,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 10144
+    },
+    {
+      "epoch": 0.2799228797980251,
+      "grad_norm": 0.004949494265019894,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 10145
+    },
+    {
+      "epoch": 0.27995047199908946,
+      "grad_norm": 0.003847777610644698,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 10146
+    },
+    {
+      "epoch": 0.2799780642001538,
+      "grad_norm": 0.0029670281801372766,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 10147
+    },
+    {
+      "epoch": 0.2800056564012182,
+      "grad_norm": 0.0038127326406538486,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 10148
+    },
+    {
+      "epoch": 0.28003324860228257,
+      "grad_norm": 0.002967024687677622,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 10149
+    },
+    {
+      "epoch": 0.2800608408033469,
+      "grad_norm": 0.004500363487750292,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 10150
+    },
+    {
+      "epoch": 0.28008843300441133,
+      "grad_norm": 0.0036577654536813498,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 10151
+    },
+    {
+      "epoch": 0.2801160252054757,
+      "grad_norm": 0.00338082667440176,
+      "learning_rate": 0.001,
+      "loss": 0.4216,
+      "step": 10152
+    },
+    {
+      "epoch": 0.28014361740654004,
+      "grad_norm": 0.004477594047784805,
+      "learning_rate": 0.001,
+      "loss": 0.3779,
+      "step": 10153
+    },
+    {
+      "epoch": 0.2801712096076044,
+      "grad_norm": 0.005331623367965221,
+      "learning_rate": 0.001,
+      "loss": 0.3674,
+      "step": 10154
+    },
+    {
+      "epoch": 0.2801988018086688,
+      "grad_norm": 0.0034174472093582153,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 10155
+    },
+    {
+      "epoch": 0.28022639400973315,
+      "grad_norm": 0.004699235316365957,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 10156
+    },
+    {
+      "epoch": 0.2802539862107975,
+      "grad_norm": 0.0027989267837256193,
+      "learning_rate": 0.001,
+      "loss": 0.3757,
+      "step": 10157
+    },
+    {
+      "epoch": 0.2802815784118619,
+      "grad_norm": 0.003362043295055628,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 10158
+    },
+    {
+      "epoch": 0.28030917061292626,
+      "grad_norm": 0.0024922748561948538,
+      "learning_rate": 0.001,
+      "loss": 0.4422,
+      "step": 10159
+    },
+    {
+      "epoch": 0.2803367628139906,
+      "grad_norm": 0.0043379804119467735,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 10160
+    },
+    {
+      "epoch": 0.280364355015055,
+      "grad_norm": 0.0037757758982479572,
+      "learning_rate": 0.001,
+      "loss": 0.4115,
+      "step": 10161
+    },
+    {
+      "epoch": 0.2803919472161194,
+      "grad_norm": 0.003047212492674589,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 10162
+    },
+    {
+      "epoch": 0.2804195394171837,
+      "grad_norm": 0.0027652375865727663,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 10163
+    },
+    {
+      "epoch": 0.2804471316182481,
+      "grad_norm": 0.0034420695155858994,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 10164
+    },
+    {
+      "epoch": 0.2804747238193125,
+      "grad_norm": 0.0024248689878731966,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 10165
+    },
+    {
+      "epoch": 0.28050231602037684,
+      "grad_norm": 0.0045005762949585915,
+      "learning_rate": 0.001,
+      "loss": 0.3774,
+      "step": 10166
+    },
+    {
+      "epoch": 0.2805299082214412,
+      "grad_norm": 0.002247242256999016,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 10167
+    },
+    {
+      "epoch": 0.2805575004225056,
+      "grad_norm": 0.004416101146489382,
+      "learning_rate": 0.001,
+      "loss": 0.4311,
+      "step": 10168
+    },
+    {
+      "epoch": 0.28058509262356995,
+      "grad_norm": 0.003067211015149951,
+      "learning_rate": 0.001,
+      "loss": 0.3713,
+      "step": 10169
+    },
+    {
+      "epoch": 0.2806126848246343,
+      "grad_norm": 0.0030722361989319324,
+      "learning_rate": 0.001,
+      "loss": 0.446,
+      "step": 10170
+    },
+    {
+      "epoch": 0.28064027702569866,
+      "grad_norm": 0.003344991710036993,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 10171
+    },
+    {
+      "epoch": 0.28066786922676307,
+      "grad_norm": 0.0028220931999385357,
+      "learning_rate": 0.001,
+      "loss": 0.4381,
+      "step": 10172
+    },
+    {
+      "epoch": 0.2806954614278274,
+      "grad_norm": 0.0032366979867219925,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 10173
+    },
+    {
+      "epoch": 0.28072305362889177,
+      "grad_norm": 0.004435013514012098,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 10174
+    },
+    {
+      "epoch": 0.2807506458299562,
+      "grad_norm": 0.002561743138357997,
+      "learning_rate": 0.001,
+      "loss": 0.4298,
+      "step": 10175
+    },
+    {
+      "epoch": 0.28077823803102053,
+      "grad_norm": 0.0032213281374424696,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 10176
+    },
+    {
+      "epoch": 0.2808058302320849,
+      "grad_norm": 0.0033837456721812487,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 10177
+    },
+    {
+      "epoch": 0.2808334224331493,
+      "grad_norm": 0.0026573874056339264,
+      "learning_rate": 0.001,
+      "loss": 0.4523,
+      "step": 10178
+    },
+    {
+      "epoch": 0.28086101463421365,
+      "grad_norm": 0.004164142068475485,
+      "learning_rate": 0.001,
+      "loss": 0.3699,
+      "step": 10179
+    },
+    {
+      "epoch": 0.280888606835278,
+      "grad_norm": 0.005075597669929266,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 10180
+    },
+    {
+      "epoch": 0.28091619903634235,
+      "grad_norm": 0.007850431837141514,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 10181
+    },
+    {
+      "epoch": 0.28094379123740676,
+      "grad_norm": 0.0037918195594102144,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 10182
+    },
+    {
+      "epoch": 0.2809713834384711,
+      "grad_norm": 0.004340451210737228,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 10183
+    },
+    {
+      "epoch": 0.28099897563953546,
+      "grad_norm": 0.003013703739270568,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 10184
+    },
+    {
+      "epoch": 0.2810265678405999,
+      "grad_norm": 0.004570307210087776,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 10185
+    },
+    {
+      "epoch": 0.2810541600416642,
+      "grad_norm": 0.0028304762672632933,
+      "learning_rate": 0.001,
+      "loss": 0.3712,
+      "step": 10186
+    },
+    {
+      "epoch": 0.2810817522427286,
+      "grad_norm": 0.003872218308970332,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 10187
+    },
+    {
+      "epoch": 0.281109344443793,
+      "grad_norm": 0.003236171556636691,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 10188
+    },
+    {
+      "epoch": 0.28113693664485734,
+      "grad_norm": 0.00254503614269197,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 10189
+    },
+    {
+      "epoch": 0.2811645288459217,
+      "grad_norm": 0.0019429237581789494,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 10190
+    },
+    {
+      "epoch": 0.28119212104698604,
+      "grad_norm": 0.003968310542404652,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 10191
+    },
+    {
+      "epoch": 0.28121971324805045,
+      "grad_norm": 0.0034456211142241955,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 10192
+    },
+    {
+      "epoch": 0.2812473054491148,
+      "grad_norm": 0.00856733787804842,
+      "learning_rate": 0.001,
+      "loss": 0.3771,
+      "step": 10193
+    },
+    {
+      "epoch": 0.28127489765017916,
+      "grad_norm": 0.00205041142180562,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 10194
+    },
+    {
+      "epoch": 0.28130248985124356,
+      "grad_norm": 0.0054842522367835045,
+      "learning_rate": 0.001,
+      "loss": 0.3786,
+      "step": 10195
+    },
+    {
+      "epoch": 0.2813300820523079,
+      "grad_norm": 0.0026425907853990793,
+      "learning_rate": 0.001,
+      "loss": 0.415,
+      "step": 10196
+    },
+    {
+      "epoch": 0.28135767425337227,
+      "grad_norm": 0.0030602633487433195,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 10197
+    },
+    {
+      "epoch": 0.2813852664544367,
+      "grad_norm": 0.00611920328810811,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 10198
+    },
+    {
+      "epoch": 0.28141285865550103,
+      "grad_norm": 0.0028095582965761423,
+      "learning_rate": 0.001,
+      "loss": 0.4303,
+      "step": 10199
+    },
+    {
+      "epoch": 0.2814404508565654,
+      "grad_norm": 0.002753883833065629,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 10200
+    },
+    {
+      "epoch": 0.28146804305762974,
+      "grad_norm": 0.0056452443823218346,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 10201
+    },
+    {
+      "epoch": 0.28149563525869414,
+      "grad_norm": 0.004540273919701576,
+      "learning_rate": 0.001,
+      "loss": 0.4322,
+      "step": 10202
+    },
+    {
+      "epoch": 0.2815232274597585,
+      "grad_norm": 0.0029490333981812,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 10203
+    },
+    {
+      "epoch": 0.28155081966082285,
+      "grad_norm": 0.0022447453811764717,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 10204
+    },
+    {
+      "epoch": 0.28157841186188726,
+      "grad_norm": 0.004265769850462675,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 10205
+    },
+    {
+      "epoch": 0.2816060040629516,
+      "grad_norm": 0.007797228638082743,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 10206
+    },
+    {
+      "epoch": 0.28163359626401596,
+      "grad_norm": 0.0029594229999929667,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 10207
+    },
+    {
+      "epoch": 0.28166118846508037,
+      "grad_norm": 0.006314094644039869,
+      "learning_rate": 0.001,
+      "loss": 0.3699,
+      "step": 10208
+    },
+    {
+      "epoch": 0.2816887806661447,
+      "grad_norm": 0.002639149548485875,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 10209
+    },
+    {
+      "epoch": 0.2817163728672091,
+      "grad_norm": 0.0027635907754302025,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 10210
+    },
+    {
+      "epoch": 0.2817439650682734,
+      "grad_norm": 0.006702120881527662,
+      "learning_rate": 0.001,
+      "loss": 0.3668,
+      "step": 10211
+    },
+    {
+      "epoch": 0.28177155726933784,
+      "grad_norm": 0.008459938690066338,
+      "learning_rate": 0.001,
+      "loss": 0.4373,
+      "step": 10212
+    },
+    {
+      "epoch": 0.2817991494704022,
+      "grad_norm": 0.005659305490553379,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 10213
+    },
+    {
+      "epoch": 0.28182674167146654,
+      "grad_norm": 0.002557772444561124,
+      "learning_rate": 0.001,
+      "loss": 0.4227,
+      "step": 10214
+    },
+    {
+      "epoch": 0.28185433387253095,
+      "grad_norm": 0.0043907626532018185,
+      "learning_rate": 0.001,
+      "loss": 0.3726,
+      "step": 10215
+    },
+    {
+      "epoch": 0.2818819260735953,
+      "grad_norm": 0.004590220283716917,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 10216
+    },
+    {
+      "epoch": 0.28190951827465965,
+      "grad_norm": 0.05343657732009888,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 10217
+    },
+    {
+      "epoch": 0.28193711047572406,
+      "grad_norm": 0.008130939677357674,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 10218
+    },
+    {
+      "epoch": 0.2819647026767884,
+      "grad_norm": 0.002707968931645155,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 10219
+    },
+    {
+      "epoch": 0.28199229487785277,
+      "grad_norm": 0.002904132939875126,
+      "learning_rate": 0.001,
+      "loss": 0.3735,
+      "step": 10220
+    },
+    {
+      "epoch": 0.2820198870789171,
+      "grad_norm": 0.002991893794387579,
+      "learning_rate": 0.001,
+      "loss": 0.419,
+      "step": 10221
+    },
+    {
+      "epoch": 0.28204747927998153,
+      "grad_norm": 0.0030220167245715857,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 10222
+    },
+    {
+      "epoch": 0.2820750714810459,
+      "grad_norm": 0.002117662690579891,
+      "learning_rate": 0.001,
+      "loss": 0.4007,
+      "step": 10223
+    },
+    {
+      "epoch": 0.28210266368211023,
+      "grad_norm": 0.003019734052941203,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 10224
+    },
+    {
+      "epoch": 0.28213025588317464,
+      "grad_norm": 0.003011964727193117,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 10225
+    },
+    {
+      "epoch": 0.282157848084239,
+      "grad_norm": 0.0034174530301243067,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 10226
+    },
+    {
+      "epoch": 0.28218544028530335,
+      "grad_norm": 0.0024363279808312654,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 10227
+    },
+    {
+      "epoch": 0.28221303248636775,
+      "grad_norm": 0.002497728681191802,
+      "learning_rate": 0.001,
+      "loss": 0.4095,
+      "step": 10228
+    },
+    {
+      "epoch": 0.2822406246874321,
+      "grad_norm": 0.0034284652210772038,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 10229
+    },
+    {
+      "epoch": 0.28226821688849646,
+      "grad_norm": 0.002253099577501416,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 10230
+    },
+    {
+      "epoch": 0.2822958090895608,
+      "grad_norm": 0.003588828956708312,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 10231
+    },
+    {
+      "epoch": 0.2823234012906252,
+      "grad_norm": 0.0027390895411372185,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 10232
+    },
+    {
+      "epoch": 0.2823509934916896,
+      "grad_norm": 0.002828077645972371,
+      "learning_rate": 0.001,
+      "loss": 0.3915,
+      "step": 10233
+    },
+    {
+      "epoch": 0.2823785856927539,
+      "grad_norm": 0.006078117527067661,
+      "learning_rate": 0.001,
+      "loss": 0.3746,
+      "step": 10234
+    },
+    {
+      "epoch": 0.28240617789381833,
+      "grad_norm": 0.004316994454711676,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 10235
+    },
+    {
+      "epoch": 0.2824337700948827,
+      "grad_norm": 0.0036562427412718534,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 10236
+    },
+    {
+      "epoch": 0.28246136229594704,
+      "grad_norm": 0.004819520283490419,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 10237
+    },
+    {
+      "epoch": 0.28248895449701145,
+      "grad_norm": 0.0036955669056624174,
+      "learning_rate": 0.001,
+      "loss": 0.376,
+      "step": 10238
+    },
+    {
+      "epoch": 0.2825165466980758,
+      "grad_norm": 0.004412720445543528,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 10239
+    },
+    {
+      "epoch": 0.28254413889914015,
+      "grad_norm": 0.0021497290581464767,
+      "learning_rate": 0.001,
+      "loss": 0.4165,
+      "step": 10240
+    },
+    {
+      "epoch": 0.2825717311002045,
+      "grad_norm": 0.003762908047065139,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 10241
+    },
+    {
+      "epoch": 0.2825993233012689,
+      "grad_norm": 0.0028142263181507587,
+      "learning_rate": 0.001,
+      "loss": 0.3492,
+      "step": 10242
+    },
+    {
+      "epoch": 0.28262691550233326,
+      "grad_norm": 0.002374440897256136,
+      "learning_rate": 0.001,
+      "loss": 0.4136,
+      "step": 10243
+    },
+    {
+      "epoch": 0.2826545077033976,
+      "grad_norm": 0.002582703484222293,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 10244
+    },
+    {
+      "epoch": 0.282682099904462,
+      "grad_norm": 0.003294025780633092,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 10245
+    },
+    {
+      "epoch": 0.2827096921055264,
+      "grad_norm": 0.0025567803531885147,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 10246
+    },
+    {
+      "epoch": 0.28273728430659073,
+      "grad_norm": 0.002444926183670759,
+      "learning_rate": 0.001,
+      "loss": 0.4432,
+      "step": 10247
+    },
+    {
+      "epoch": 0.28276487650765514,
+      "grad_norm": 0.002646154025569558,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 10248
+    },
+    {
+      "epoch": 0.2827924687087195,
+      "grad_norm": 0.003058774396777153,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 10249
+    },
+    {
+      "epoch": 0.28282006090978384,
+      "grad_norm": 0.004560539964586496,
+      "learning_rate": 0.001,
+      "loss": 0.3797,
+      "step": 10250
+    },
+    {
+      "epoch": 0.2828476531108482,
+      "grad_norm": 0.005327416118234396,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 10251
+    },
+    {
+      "epoch": 0.2828752453119126,
+      "grad_norm": 0.002578763058409095,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 10252
+    },
+    {
+      "epoch": 0.28290283751297696,
+      "grad_norm": 0.0030044415034353733,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 10253
+    },
+    {
+      "epoch": 0.2829304297140413,
+      "grad_norm": 0.002438440453261137,
+      "learning_rate": 0.001,
+      "loss": 0.4276,
+      "step": 10254
+    },
+    {
+      "epoch": 0.2829580219151057,
+      "grad_norm": 0.002474286826327443,
+      "learning_rate": 0.001,
+      "loss": 0.4097,
+      "step": 10255
+    },
+    {
+      "epoch": 0.28298561411617007,
+      "grad_norm": 0.0036326062399894,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 10256
+    },
+    {
+      "epoch": 0.2830132063172344,
+      "grad_norm": 0.006661332678049803,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 10257
+    },
+    {
+      "epoch": 0.2830407985182988,
+      "grad_norm": 0.005072845611721277,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 10258
+    },
+    {
+      "epoch": 0.2830683907193632,
+      "grad_norm": 0.004820408299565315,
+      "learning_rate": 0.001,
+      "loss": 0.373,
+      "step": 10259
+    },
+    {
+      "epoch": 0.28309598292042754,
+      "grad_norm": 0.00312256021425128,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 10260
+    },
+    {
+      "epoch": 0.2831235751214919,
+      "grad_norm": 0.0029510208405554295,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 10261
+    },
+    {
+      "epoch": 0.2831511673225563,
+      "grad_norm": 0.0040268669836223125,
+      "learning_rate": 0.001,
+      "loss": 0.3614,
+      "step": 10262
+    },
+    {
+      "epoch": 0.28317875952362065,
+      "grad_norm": 0.006543302442878485,
+      "learning_rate": 0.001,
+      "loss": 0.384,
+      "step": 10263
+    },
+    {
+      "epoch": 0.283206351724685,
+      "grad_norm": 0.003965241368860006,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 10264
+    },
+    {
+      "epoch": 0.2832339439257494,
+      "grad_norm": 0.0032414360903203487,
+      "learning_rate": 0.001,
+      "loss": 0.3433,
+      "step": 10265
+    },
+    {
+      "epoch": 0.28326153612681376,
+      "grad_norm": 0.002474940847605467,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 10266
+    },
+    {
+      "epoch": 0.2832891283278781,
+      "grad_norm": 0.0031417065765708685,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 10267
+    },
+    {
+      "epoch": 0.28331672052894247,
+      "grad_norm": 0.0028232133481651545,
+      "learning_rate": 0.001,
+      "loss": 0.4599,
+      "step": 10268
+    },
+    {
+      "epoch": 0.2833443127300069,
+      "grad_norm": 0.002691833535209298,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 10269
+    },
+    {
+      "epoch": 0.28337190493107123,
+      "grad_norm": 0.005066230893135071,
+      "learning_rate": 0.001,
+      "loss": 0.3908,
+      "step": 10270
+    },
+    {
+      "epoch": 0.2833994971321356,
+      "grad_norm": 0.004286530893296003,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 10271
+    },
+    {
+      "epoch": 0.2834270893332,
+      "grad_norm": 0.004262128844857216,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 10272
+    },
+    {
+      "epoch": 0.28345468153426434,
+      "grad_norm": 0.0027960515581071377,
+      "learning_rate": 0.001,
+      "loss": 0.4211,
+      "step": 10273
+    },
+    {
+      "epoch": 0.2834822737353287,
+      "grad_norm": 0.004192339722067118,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 10274
+    },
+    {
+      "epoch": 0.2835098659363931,
+      "grad_norm": 0.0032848825212568045,
+      "learning_rate": 0.001,
+      "loss": 0.362,
+      "step": 10275
+    },
+    {
+      "epoch": 0.28353745813745745,
+      "grad_norm": 0.0027345793787389994,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 10276
+    },
+    {
+      "epoch": 0.2835650503385218,
+      "grad_norm": 0.0030612831469625235,
+      "learning_rate": 0.001,
+      "loss": 0.3779,
+      "step": 10277
+    },
+    {
+      "epoch": 0.28359264253958616,
+      "grad_norm": 0.002645805710926652,
+      "learning_rate": 0.001,
+      "loss": 0.3881,
+      "step": 10278
+    },
+    {
+      "epoch": 0.28362023474065057,
+      "grad_norm": 0.0025237167719751596,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 10279
+    },
+    {
+      "epoch": 0.2836478269417149,
+      "grad_norm": 0.0036604590713977814,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 10280
+    },
+    {
+      "epoch": 0.2836754191427793,
+      "grad_norm": 0.007117830216884613,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 10281
+    },
+    {
+      "epoch": 0.2837030113438437,
+      "grad_norm": 0.05505690723657608,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 10282
+    },
+    {
+      "epoch": 0.28373060354490803,
+      "grad_norm": 0.006176114547997713,
+      "learning_rate": 0.001,
+      "loss": 0.352,
+      "step": 10283
+    },
+    {
+      "epoch": 0.2837581957459724,
+      "grad_norm": 0.003028157399967313,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 10284
+    },
+    {
+      "epoch": 0.2837857879470368,
+      "grad_norm": 0.0032485162373632193,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 10285
+    },
+    {
+      "epoch": 0.28381338014810115,
+      "grad_norm": 0.002797128167003393,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 10286
+    },
+    {
+      "epoch": 0.2838409723491655,
+      "grad_norm": 0.003049092134460807,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 10287
+    },
+    {
+      "epoch": 0.28386856455022985,
+      "grad_norm": 0.0031948923133313656,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 10288
+    },
+    {
+      "epoch": 0.28389615675129426,
+      "grad_norm": 0.003836139803752303,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 10289
+    },
+    {
+      "epoch": 0.2839237489523586,
+      "grad_norm": 0.002338884864002466,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 10290
+    },
+    {
+      "epoch": 0.28395134115342296,
+      "grad_norm": 0.009832640178501606,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 10291
+    },
+    {
+      "epoch": 0.2839789333544874,
+      "grad_norm": 0.0026350358966737986,
+      "learning_rate": 0.001,
+      "loss": 0.43,
+      "step": 10292
+    },
+    {
+      "epoch": 0.2840065255555517,
+      "grad_norm": 0.003660572227090597,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 10293
+    },
+    {
+      "epoch": 0.2840341177566161,
+      "grad_norm": 0.00275686988607049,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 10294
+    },
+    {
+      "epoch": 0.2840617099576805,
+      "grad_norm": 0.0033077567350119352,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 10295
+    },
+    {
+      "epoch": 0.28408930215874484,
+      "grad_norm": 0.006942362524569035,
+      "learning_rate": 0.001,
+      "loss": 0.354,
+      "step": 10296
+    },
+    {
+      "epoch": 0.2841168943598092,
+      "grad_norm": 0.0036332812160253525,
+      "learning_rate": 0.001,
+      "loss": 0.3707,
+      "step": 10297
+    },
+    {
+      "epoch": 0.28414448656087354,
+      "grad_norm": 0.012020766735076904,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 10298
+    },
+    {
+      "epoch": 0.28417207876193795,
+      "grad_norm": 0.0028270173352211714,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 10299
+    },
+    {
+      "epoch": 0.2841996709630023,
+      "grad_norm": 0.011626332998275757,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 10300
+    },
+    {
+      "epoch": 0.28422726316406666,
+      "grad_norm": 0.004878376144915819,
+      "learning_rate": 0.001,
+      "loss": 0.4427,
+      "step": 10301
+    },
+    {
+      "epoch": 0.28425485536513106,
+      "grad_norm": 0.004530386067926884,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 10302
+    },
+    {
+      "epoch": 0.2842824475661954,
+      "grad_norm": 0.004930171649903059,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 10303
+    },
+    {
+      "epoch": 0.28431003976725977,
+      "grad_norm": 0.009053234942257404,
+      "learning_rate": 0.001,
+      "loss": 0.3703,
+      "step": 10304
+    },
+    {
+      "epoch": 0.2843376319683242,
+      "grad_norm": 0.004205284174531698,
+      "learning_rate": 0.001,
+      "loss": 0.3538,
+      "step": 10305
+    },
+    {
+      "epoch": 0.28436522416938853,
+      "grad_norm": 0.003883173456415534,
+      "learning_rate": 0.001,
+      "loss": 0.4267,
+      "step": 10306
+    },
+    {
+      "epoch": 0.2843928163704529,
+      "grad_norm": 0.0031415647827088833,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 10307
+    },
+    {
+      "epoch": 0.28442040857151724,
+      "grad_norm": 0.003464039647951722,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 10308
+    },
+    {
+      "epoch": 0.28444800077258164,
+      "grad_norm": 0.004332481883466244,
+      "learning_rate": 0.001,
+      "loss": 0.3642,
+      "step": 10309
+    },
+    {
+      "epoch": 0.284475592973646,
+      "grad_norm": 0.00255342829041183,
+      "learning_rate": 0.001,
+      "loss": 0.4375,
+      "step": 10310
+    },
+    {
+      "epoch": 0.28450318517471035,
+      "grad_norm": 0.0028760903514921665,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 10311
+    },
+    {
+      "epoch": 0.28453077737577476,
+      "grad_norm": 0.005956695880740881,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 10312
+    },
+    {
+      "epoch": 0.2845583695768391,
+      "grad_norm": 0.004237044602632523,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 10313
+    },
+    {
+      "epoch": 0.28458596177790346,
+      "grad_norm": 0.005159912630915642,
+      "learning_rate": 0.001,
+      "loss": 0.3598,
+      "step": 10314
+    },
+    {
+      "epoch": 0.28461355397896787,
+      "grad_norm": 0.0030028277542442083,
+      "learning_rate": 0.001,
+      "loss": 0.4357,
+      "step": 10315
+    },
+    {
+      "epoch": 0.2846411461800322,
+      "grad_norm": 0.002339400118216872,
+      "learning_rate": 0.001,
+      "loss": 0.4295,
+      "step": 10316
+    },
+    {
+      "epoch": 0.2846687383810966,
+      "grad_norm": 0.0021588336676359177,
+      "learning_rate": 0.001,
+      "loss": 0.4477,
+      "step": 10317
+    },
+    {
+      "epoch": 0.28469633058216093,
+      "grad_norm": 0.002696117153391242,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 10318
+    },
+    {
+      "epoch": 0.28472392278322534,
+      "grad_norm": 0.0029791551642119884,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 10319
+    },
+    {
+      "epoch": 0.2847515149842897,
+      "grad_norm": 0.00416885782033205,
+      "learning_rate": 0.001,
+      "loss": 0.419,
+      "step": 10320
+    },
+    {
+      "epoch": 0.28477910718535404,
+      "grad_norm": 0.005709108896553516,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 10321
+    },
+    {
+      "epoch": 0.28480669938641845,
+      "grad_norm": 0.002943917643278837,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 10322
+    },
+    {
+      "epoch": 0.2848342915874828,
+      "grad_norm": 0.0025159113574773073,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 10323
+    },
+    {
+      "epoch": 0.28486188378854715,
+      "grad_norm": 0.004362201318144798,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 10324
+    },
+    {
+      "epoch": 0.28488947598961156,
+      "grad_norm": 0.0049796863459050655,
+      "learning_rate": 0.001,
+      "loss": 0.4253,
+      "step": 10325
+    },
+    {
+      "epoch": 0.2849170681906759,
+      "grad_norm": 0.004486286547034979,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 10326
+    },
+    {
+      "epoch": 0.28494466039174027,
+      "grad_norm": 0.006912431679666042,
+      "learning_rate": 0.001,
+      "loss": 0.4178,
+      "step": 10327
+    },
+    {
+      "epoch": 0.2849722525928046,
+      "grad_norm": 0.0031216132920235395,
+      "learning_rate": 0.001,
+      "loss": 0.4226,
+      "step": 10328
+    },
+    {
+      "epoch": 0.28499984479386903,
+      "grad_norm": 0.004195013549178839,
+      "learning_rate": 0.001,
+      "loss": 0.3776,
+      "step": 10329
+    },
+    {
+      "epoch": 0.2850274369949334,
+      "grad_norm": 0.0029478815849870443,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 10330
+    },
+    {
+      "epoch": 0.28505502919599773,
+      "grad_norm": 0.004067423287779093,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 10331
+    },
+    {
+      "epoch": 0.28508262139706214,
+      "grad_norm": 0.004478363785892725,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 10332
+    },
+    {
+      "epoch": 0.2851102135981265,
+      "grad_norm": 0.009275195188820362,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 10333
+    },
+    {
+      "epoch": 0.28513780579919085,
+      "grad_norm": 0.0037428569048643112,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 10334
+    },
+    {
+      "epoch": 0.28516539800025525,
+      "grad_norm": 0.004474049899727106,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 10335
+    },
+    {
+      "epoch": 0.2851929902013196,
+      "grad_norm": 0.004371373914182186,
+      "learning_rate": 0.001,
+      "loss": 0.3571,
+      "step": 10336
+    },
+    {
+      "epoch": 0.28522058240238396,
+      "grad_norm": 0.0037502245977520943,
+      "learning_rate": 0.001,
+      "loss": 0.3624,
+      "step": 10337
+    },
+    {
+      "epoch": 0.2852481746034483,
+      "grad_norm": 0.002311047865077853,
+      "learning_rate": 0.001,
+      "loss": 0.3721,
+      "step": 10338
+    },
+    {
+      "epoch": 0.2852757668045127,
+      "grad_norm": 0.006753645371645689,
+      "learning_rate": 0.001,
+      "loss": 0.3764,
+      "step": 10339
+    },
+    {
+      "epoch": 0.2853033590055771,
+      "grad_norm": 0.0031517634633928537,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 10340
+    },
+    {
+      "epoch": 0.2853309512066414,
+      "grad_norm": 0.0038493776228278875,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 10341
+    },
+    {
+      "epoch": 0.28535854340770583,
+      "grad_norm": 0.0019644973799586296,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 10342
+    },
+    {
+      "epoch": 0.2853861356087702,
+      "grad_norm": 0.0025845207273960114,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 10343
+    },
+    {
+      "epoch": 0.28541372780983454,
+      "grad_norm": 0.0030156485736370087,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 10344
+    },
+    {
+      "epoch": 0.28544132001089895,
+      "grad_norm": 0.0033527929335832596,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 10345
+    },
+    {
+      "epoch": 0.2854689122119633,
+      "grad_norm": 0.0047877393662929535,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 10346
+    },
+    {
+      "epoch": 0.28549650441302765,
+      "grad_norm": 0.002556293271481991,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 10347
+    },
+    {
+      "epoch": 0.285524096614092,
+      "grad_norm": 0.0028672493062913418,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 10348
+    },
+    {
+      "epoch": 0.2855516888151564,
+      "grad_norm": 0.0026332384441047907,
+      "learning_rate": 0.001,
+      "loss": 0.4436,
+      "step": 10349
+    },
+    {
+      "epoch": 0.28557928101622077,
+      "grad_norm": 0.0033898288384079933,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 10350
+    },
+    {
+      "epoch": 0.2856068732172851,
+      "grad_norm": 0.003938847221434116,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 10351
+    },
+    {
+      "epoch": 0.2856344654183495,
+      "grad_norm": 0.004278605338186026,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 10352
+    },
+    {
+      "epoch": 0.2856620576194139,
+      "grad_norm": 0.009001370519399643,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 10353
+    },
+    {
+      "epoch": 0.28568964982047823,
+      "grad_norm": 0.0058325412683188915,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 10354
+    },
+    {
+      "epoch": 0.2857172420215426,
+      "grad_norm": 0.0033330926671624184,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 10355
+    },
+    {
+      "epoch": 0.285744834222607,
+      "grad_norm": 0.0033065094612538815,
+      "learning_rate": 0.001,
+      "loss": 0.4256,
+      "step": 10356
+    },
+    {
+      "epoch": 0.28577242642367134,
+      "grad_norm": 0.002417915500700474,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 10357
+    },
+    {
+      "epoch": 0.2858000186247357,
+      "grad_norm": 0.0036074428353458643,
+      "learning_rate": 0.001,
+      "loss": 0.3477,
+      "step": 10358
+    },
+    {
+      "epoch": 0.2858276108258001,
+      "grad_norm": 0.0028410926461219788,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 10359
+    },
+    {
+      "epoch": 0.28585520302686446,
+      "grad_norm": 0.0024444633163511753,
+      "learning_rate": 0.001,
+      "loss": 0.4072,
+      "step": 10360
+    },
+    {
+      "epoch": 0.2858827952279288,
+      "grad_norm": 0.0035574915818870068,
+      "learning_rate": 0.001,
+      "loss": 0.3702,
+      "step": 10361
+    },
+    {
+      "epoch": 0.2859103874289932,
+      "grad_norm": 0.003090563230216503,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 10362
+    },
+    {
+      "epoch": 0.28593797963005757,
+      "grad_norm": 0.003158135572448373,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 10363
+    },
+    {
+      "epoch": 0.2859655718311219,
+      "grad_norm": 0.004847416654229164,
+      "learning_rate": 0.001,
+      "loss": 0.3727,
+      "step": 10364
+    },
+    {
+      "epoch": 0.2859931640321863,
+      "grad_norm": 0.0035511176101863384,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 10365
+    },
+    {
+      "epoch": 0.2860207562332507,
+      "grad_norm": 0.0049200220964848995,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 10366
+    },
+    {
+      "epoch": 0.28604834843431504,
+      "grad_norm": 0.0024868594482541084,
+      "learning_rate": 0.001,
+      "loss": 0.374,
+      "step": 10367
+    },
+    {
+      "epoch": 0.2860759406353794,
+      "grad_norm": 0.004043731838464737,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 10368
+    },
+    {
+      "epoch": 0.2861035328364438,
+      "grad_norm": 0.0034267944283783436,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 10369
+    },
+    {
+      "epoch": 0.28613112503750815,
+      "grad_norm": 0.003376040840521455,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 10370
+    },
+    {
+      "epoch": 0.2861587172385725,
+      "grad_norm": 0.0024833050556480885,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 10371
+    },
+    {
+      "epoch": 0.2861863094396369,
+      "grad_norm": 0.0030002393759787083,
+      "learning_rate": 0.001,
+      "loss": 0.4271,
+      "step": 10372
+    },
+    {
+      "epoch": 0.28621390164070126,
+      "grad_norm": 0.0029513488989323378,
+      "learning_rate": 0.001,
+      "loss": 0.3705,
+      "step": 10373
+    },
+    {
+      "epoch": 0.2862414938417656,
+      "grad_norm": 0.0033245980739593506,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 10374
+    },
+    {
+      "epoch": 0.28626908604282997,
+      "grad_norm": 0.002725611673668027,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 10375
+    },
+    {
+      "epoch": 0.2862966782438944,
+      "grad_norm": 0.0023878002539277077,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 10376
+    },
+    {
+      "epoch": 0.28632427044495873,
+      "grad_norm": 0.0076403240673244,
+      "learning_rate": 0.001,
+      "loss": 0.4184,
+      "step": 10377
+    },
+    {
+      "epoch": 0.2863518626460231,
+      "grad_norm": 0.00309283216483891,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 10378
+    },
+    {
+      "epoch": 0.2863794548470875,
+      "grad_norm": 0.002792463870719075,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 10379
+    },
+    {
+      "epoch": 0.28640704704815184,
+      "grad_norm": 0.002797079971060157,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 10380
+    },
+    {
+      "epoch": 0.2864346392492162,
+      "grad_norm": 0.0023108189925551414,
+      "learning_rate": 0.001,
+      "loss": 0.4203,
+      "step": 10381
+    },
+    {
+      "epoch": 0.2864622314502806,
+      "grad_norm": 0.0021790589671581984,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 10382
+    },
+    {
+      "epoch": 0.28648982365134495,
+      "grad_norm": 0.003151221200823784,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 10383
+    },
+    {
+      "epoch": 0.2865174158524093,
+      "grad_norm": 0.0043502976186573505,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 10384
+    },
+    {
+      "epoch": 0.28654500805347366,
+      "grad_norm": 0.0038338969461619854,
+      "learning_rate": 0.001,
+      "loss": 0.427,
+      "step": 10385
+    },
+    {
+      "epoch": 0.28657260025453807,
+      "grad_norm": 0.004629583563655615,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 10386
+    },
+    {
+      "epoch": 0.2866001924556024,
+      "grad_norm": 0.008832892403006554,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 10387
+    },
+    {
+      "epoch": 0.2866277846566668,
+      "grad_norm": 0.005081677809357643,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 10388
+    },
+    {
+      "epoch": 0.2866553768577312,
+      "grad_norm": 0.005641726311296225,
+      "learning_rate": 0.001,
+      "loss": 0.3621,
+      "step": 10389
+    },
+    {
+      "epoch": 0.28668296905879553,
+      "grad_norm": 0.005556876305490732,
+      "learning_rate": 0.001,
+      "loss": 0.4187,
+      "step": 10390
+    },
+    {
+      "epoch": 0.2867105612598599,
+      "grad_norm": 0.004483111668378115,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 10391
+    },
+    {
+      "epoch": 0.2867381534609243,
+      "grad_norm": 0.004832763224840164,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 10392
+    },
+    {
+      "epoch": 0.28676574566198865,
+      "grad_norm": 0.0030382846016436815,
+      "learning_rate": 0.001,
+      "loss": 0.4291,
+      "step": 10393
+    },
+    {
+      "epoch": 0.286793337863053,
+      "grad_norm": 0.003834576578810811,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 10394
+    },
+    {
+      "epoch": 0.28682093006411735,
+      "grad_norm": 0.003595273941755295,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 10395
+    },
+    {
+      "epoch": 0.28684852226518176,
+      "grad_norm": 0.006260840687900782,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 10396
+    },
+    {
+      "epoch": 0.2868761144662461,
+      "grad_norm": 0.00411889236420393,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 10397
+    },
+    {
+      "epoch": 0.28690370666731047,
+      "grad_norm": 0.0030989821534603834,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 10398
+    },
+    {
+      "epoch": 0.2869312988683749,
+      "grad_norm": 0.003285297192633152,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 10399
+    },
+    {
+      "epoch": 0.2869588910694392,
+      "grad_norm": 0.0032312078401446342,
+      "learning_rate": 0.001,
+      "loss": 0.4149,
+      "step": 10400
+    },
+    {
+      "epoch": 0.2869864832705036,
+      "grad_norm": 0.004018446430563927,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 10401
+    },
+    {
+      "epoch": 0.287014075471568,
+      "grad_norm": 0.0027517497073858976,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 10402
+    },
+    {
+      "epoch": 0.28704166767263234,
+      "grad_norm": 0.0023782148491591215,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 10403
+    },
+    {
+      "epoch": 0.2870692598736967,
+      "grad_norm": 0.002795133274048567,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 10404
+    },
+    {
+      "epoch": 0.28709685207476104,
+      "grad_norm": 0.0036653687711805105,
+      "learning_rate": 0.001,
+      "loss": 0.3434,
+      "step": 10405
+    },
+    {
+      "epoch": 0.28712444427582545,
+      "grad_norm": 0.003916740883141756,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 10406
+    },
+    {
+      "epoch": 0.2871520364768898,
+      "grad_norm": 0.004016435705125332,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 10407
+    },
+    {
+      "epoch": 0.28717962867795416,
+      "grad_norm": 0.0035471522714942694,
+      "learning_rate": 0.001,
+      "loss": 0.4374,
+      "step": 10408
+    },
+    {
+      "epoch": 0.28720722087901857,
+      "grad_norm": 0.010290498845279217,
+      "learning_rate": 0.001,
+      "loss": 0.4291,
+      "step": 10409
+    },
+    {
+      "epoch": 0.2872348130800829,
+      "grad_norm": 0.004986819811165333,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 10410
+    },
+    {
+      "epoch": 0.28726240528114727,
+      "grad_norm": 0.003227797569707036,
+      "learning_rate": 0.001,
+      "loss": 0.3876,
+      "step": 10411
+    },
+    {
+      "epoch": 0.2872899974822117,
+      "grad_norm": 0.0036358062643557787,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 10412
+    },
+    {
+      "epoch": 0.28731758968327603,
+      "grad_norm": 0.002349398098886013,
+      "learning_rate": 0.001,
+      "loss": 0.4514,
+      "step": 10413
+    },
+    {
+      "epoch": 0.2873451818843404,
+      "grad_norm": 0.004284038674086332,
+      "learning_rate": 0.001,
+      "loss": 0.3808,
+      "step": 10414
+    },
+    {
+      "epoch": 0.28737277408540474,
+      "grad_norm": 0.0026736604049801826,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 10415
+    },
+    {
+      "epoch": 0.28740036628646914,
+      "grad_norm": 0.002364154439419508,
+      "learning_rate": 0.001,
+      "loss": 0.4356,
+      "step": 10416
+    },
+    {
+      "epoch": 0.2874279584875335,
+      "grad_norm": 0.0029544299468398094,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 10417
+    },
+    {
+      "epoch": 0.28745555068859785,
+      "grad_norm": 0.0052476017735898495,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 10418
+    },
+    {
+      "epoch": 0.28748314288966226,
+      "grad_norm": 0.0021597561426460743,
+      "learning_rate": 0.001,
+      "loss": 0.4266,
+      "step": 10419
+    },
+    {
+      "epoch": 0.2875107350907266,
+      "grad_norm": 0.006407409440726042,
+      "learning_rate": 0.001,
+      "loss": 0.4276,
+      "step": 10420
+    },
+    {
+      "epoch": 0.28753832729179096,
+      "grad_norm": 0.004578462336212397,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 10421
+    },
+    {
+      "epoch": 0.28756591949285537,
+      "grad_norm": 0.0024405980948358774,
+      "learning_rate": 0.001,
+      "loss": 0.4355,
+      "step": 10422
+    },
+    {
+      "epoch": 0.2875935116939197,
+      "grad_norm": 0.0061929537914693356,
+      "learning_rate": 0.001,
+      "loss": 0.3499,
+      "step": 10423
+    },
+    {
+      "epoch": 0.2876211038949841,
+      "grad_norm": 0.002528023673221469,
+      "learning_rate": 0.001,
+      "loss": 0.4066,
+      "step": 10424
+    },
+    {
+      "epoch": 0.28764869609604843,
+      "grad_norm": 0.005064224358648062,
+      "learning_rate": 0.001,
+      "loss": 0.4323,
+      "step": 10425
+    },
+    {
+      "epoch": 0.28767628829711284,
+      "grad_norm": 0.0034656894858926535,
+      "learning_rate": 0.001,
+      "loss": 0.436,
+      "step": 10426
+    },
+    {
+      "epoch": 0.2877038804981772,
+      "grad_norm": 0.013309495523571968,
+      "learning_rate": 0.001,
+      "loss": 0.3463,
+      "step": 10427
+    },
+    {
+      "epoch": 0.28773147269924154,
+      "grad_norm": 0.0033926607575267553,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 10428
+    },
+    {
+      "epoch": 0.28775906490030595,
+      "grad_norm": 0.003299700329080224,
+      "learning_rate": 0.001,
+      "loss": 0.3592,
+      "step": 10429
+    },
+    {
+      "epoch": 0.2877866571013703,
+      "grad_norm": 0.00263997376896441,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 10430
+    },
+    {
+      "epoch": 0.28781424930243465,
+      "grad_norm": 0.002664315514266491,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 10431
+    },
+    {
+      "epoch": 0.28784184150349906,
+      "grad_norm": 0.003565206192433834,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 10432
+    },
+    {
+      "epoch": 0.2878694337045634,
+      "grad_norm": 0.0028061075136065483,
+      "learning_rate": 0.001,
+      "loss": 0.4317,
+      "step": 10433
+    },
+    {
+      "epoch": 0.28789702590562777,
+      "grad_norm": 0.005139282438904047,
+      "learning_rate": 0.001,
+      "loss": 0.3856,
+      "step": 10434
+    },
+    {
+      "epoch": 0.2879246181066921,
+      "grad_norm": 0.003619102295488119,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 10435
+    },
+    {
+      "epoch": 0.28795221030775653,
+      "grad_norm": 0.0031147655099630356,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 10436
+    },
+    {
+      "epoch": 0.2879798025088209,
+      "grad_norm": 0.0035453352611511946,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 10437
+    },
+    {
+      "epoch": 0.28800739470988523,
+      "grad_norm": 0.013630262576043606,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 10438
+    },
+    {
+      "epoch": 0.28803498691094964,
+      "grad_norm": 0.006643341388553381,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 10439
+    },
+    {
+      "epoch": 0.288062579112014,
+      "grad_norm": 0.0038181054405868053,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 10440
+    },
+    {
+      "epoch": 0.28809017131307835,
+      "grad_norm": 0.00438200868666172,
+      "learning_rate": 0.001,
+      "loss": 0.4341,
+      "step": 10441
+    },
+    {
+      "epoch": 0.28811776351414276,
+      "grad_norm": 0.009693934582173824,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 10442
+    },
+    {
+      "epoch": 0.2881453557152071,
+      "grad_norm": 0.0037193638272583485,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 10443
+    },
+    {
+      "epoch": 0.28817294791627146,
+      "grad_norm": 0.005814881529659033,
+      "learning_rate": 0.001,
+      "loss": 0.4172,
+      "step": 10444
+    },
+    {
+      "epoch": 0.2882005401173358,
+      "grad_norm": 0.003509842325001955,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 10445
+    },
+    {
+      "epoch": 0.2882281323184002,
+      "grad_norm": 0.048714328557252884,
+      "learning_rate": 0.001,
+      "loss": 0.4366,
+      "step": 10446
+    },
+    {
+      "epoch": 0.2882557245194646,
+      "grad_norm": 0.003179010935127735,
+      "learning_rate": 0.001,
+      "loss": 0.3806,
+      "step": 10447
+    },
+    {
+      "epoch": 0.2882833167205289,
+      "grad_norm": 0.004650244489312172,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 10448
+    },
+    {
+      "epoch": 0.28831090892159333,
+      "grad_norm": 0.010246813297271729,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 10449
+    },
+    {
+      "epoch": 0.2883385011226577,
+      "grad_norm": 0.0024331400636583567,
+      "learning_rate": 0.001,
+      "loss": 0.437,
+      "step": 10450
+    },
+    {
+      "epoch": 0.28836609332372204,
+      "grad_norm": 0.0032423040829598904,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 10451
+    },
+    {
+      "epoch": 0.2883936855247864,
+      "grad_norm": 0.0032161688432097435,
+      "learning_rate": 0.001,
+      "loss": 0.3756,
+      "step": 10452
+    },
+    {
+      "epoch": 0.2884212777258508,
+      "grad_norm": 0.009974083863198757,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 10453
+    },
+    {
+      "epoch": 0.28844886992691515,
+      "grad_norm": 0.0024438994005322456,
+      "learning_rate": 0.001,
+      "loss": 0.3818,
+      "step": 10454
+    },
+    {
+      "epoch": 0.2884764621279795,
+      "grad_norm": 0.0024701599031686783,
+      "learning_rate": 0.001,
+      "loss": 0.4116,
+      "step": 10455
+    },
+    {
+      "epoch": 0.2885040543290439,
+      "grad_norm": 0.0037678529042750597,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 10456
+    },
+    {
+      "epoch": 0.28853164653010827,
+      "grad_norm": 0.0021011261269450188,
+      "learning_rate": 0.001,
+      "loss": 0.44,
+      "step": 10457
+    },
+    {
+      "epoch": 0.2885592387311726,
+      "grad_norm": 0.0035511725582182407,
+      "learning_rate": 0.001,
+      "loss": 0.3732,
+      "step": 10458
+    },
+    {
+      "epoch": 0.288586830932237,
+      "grad_norm": 0.002597698476165533,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 10459
+    },
+    {
+      "epoch": 0.2886144231333014,
+      "grad_norm": 0.006349804345518351,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 10460
+    },
+    {
+      "epoch": 0.28864201533436573,
+      "grad_norm": 0.003128377255052328,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 10461
+    },
+    {
+      "epoch": 0.2886696075354301,
+      "grad_norm": 0.005628944840282202,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 10462
+    },
+    {
+      "epoch": 0.2886971997364945,
+      "grad_norm": 0.009223297238349915,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 10463
+    },
+    {
+      "epoch": 0.28872479193755884,
+      "grad_norm": 0.002501624170690775,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 10464
+    },
+    {
+      "epoch": 0.2887523841386232,
+      "grad_norm": 0.0031182433012872934,
+      "learning_rate": 0.001,
+      "loss": 0.4293,
+      "step": 10465
+    },
+    {
+      "epoch": 0.2887799763396876,
+      "grad_norm": 0.003218629164621234,
+      "learning_rate": 0.001,
+      "loss": 0.396,
+      "step": 10466
+    },
+    {
+      "epoch": 0.28880756854075196,
+      "grad_norm": 0.0033797677606344223,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 10467
+    },
+    {
+      "epoch": 0.2888351607418163,
+      "grad_norm": 0.005143940914422274,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 10468
+    },
+    {
+      "epoch": 0.2888627529428807,
+      "grad_norm": 0.02191130630671978,
+      "learning_rate": 0.001,
+      "loss": 0.3642,
+      "step": 10469
+    },
+    {
+      "epoch": 0.28889034514394507,
+      "grad_norm": 0.013004349544644356,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 10470
+    },
+    {
+      "epoch": 0.2889179373450094,
+      "grad_norm": 0.0044490983709692955,
+      "learning_rate": 0.001,
+      "loss": 0.411,
+      "step": 10471
+    },
+    {
+      "epoch": 0.2889455295460738,
+      "grad_norm": 0.004338787868618965,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 10472
+    },
+    {
+      "epoch": 0.2889731217471382,
+      "grad_norm": 0.0022168918512761593,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 10473
+    },
+    {
+      "epoch": 0.28900071394820254,
+      "grad_norm": 0.0039255740121006966,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 10474
+    },
+    {
+      "epoch": 0.2890283061492669,
+      "grad_norm": 0.0024870086926966906,
+      "learning_rate": 0.001,
+      "loss": 0.4359,
+      "step": 10475
+    },
+    {
+      "epoch": 0.2890558983503313,
+      "grad_norm": 0.009762310422956944,
+      "learning_rate": 0.001,
+      "loss": 0.4262,
+      "step": 10476
+    },
+    {
+      "epoch": 0.28908349055139565,
+      "grad_norm": 0.0031546815298497677,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 10477
+    },
+    {
+      "epoch": 0.28911108275246,
+      "grad_norm": 0.0028812165837734938,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 10478
+    },
+    {
+      "epoch": 0.2891386749535244,
+      "grad_norm": 0.003111601574346423,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 10479
+    },
+    {
+      "epoch": 0.28916626715458876,
+      "grad_norm": 0.0035458174534142017,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 10480
+    },
+    {
+      "epoch": 0.2891938593556531,
+      "grad_norm": 0.0034652771428227425,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 10481
+    },
+    {
+      "epoch": 0.28922145155671747,
+      "grad_norm": 0.004158776253461838,
+      "learning_rate": 0.001,
+      "loss": 0.3449,
+      "step": 10482
+    },
+    {
+      "epoch": 0.2892490437577819,
+      "grad_norm": 0.0031947793904691935,
+      "learning_rate": 0.001,
+      "loss": 0.4176,
+      "step": 10483
+    },
+    {
+      "epoch": 0.28927663595884623,
+      "grad_norm": 0.0023631020449101925,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 10484
+    },
+    {
+      "epoch": 0.2893042281599106,
+      "grad_norm": 0.0029194431845098734,
+      "learning_rate": 0.001,
+      "loss": 0.3617,
+      "step": 10485
+    },
+    {
+      "epoch": 0.289331820360975,
+      "grad_norm": 0.003637043060734868,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 10486
+    },
+    {
+      "epoch": 0.28935941256203934,
+      "grad_norm": 0.002662140876054764,
+      "learning_rate": 0.001,
+      "loss": 0.3454,
+      "step": 10487
+    },
+    {
+      "epoch": 0.2893870047631037,
+      "grad_norm": 0.002793428720906377,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 10488
+    },
+    {
+      "epoch": 0.2894145969641681,
+      "grad_norm": 0.0031368432100862265,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 10489
+    },
+    {
+      "epoch": 0.28944218916523246,
+      "grad_norm": 0.003831058507785201,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 10490
+    },
+    {
+      "epoch": 0.2894697813662968,
+      "grad_norm": 0.0026088710874319077,
+      "learning_rate": 0.001,
+      "loss": 0.3848,
+      "step": 10491
+    },
+    {
+      "epoch": 0.28949737356736116,
+      "grad_norm": 0.004104997497051954,
+      "learning_rate": 0.001,
+      "loss": 0.431,
+      "step": 10492
+    },
+    {
+      "epoch": 0.28952496576842557,
+      "grad_norm": 0.004247668199241161,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 10493
+    },
+    {
+      "epoch": 0.2895525579694899,
+      "grad_norm": 0.0025502184871584177,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 10494
+    },
+    {
+      "epoch": 0.2895801501705543,
+      "grad_norm": 0.0031428837683051825,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 10495
+    },
+    {
+      "epoch": 0.2896077423716187,
+      "grad_norm": 0.00442470470443368,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 10496
+    },
+    {
+      "epoch": 0.28963533457268303,
+      "grad_norm": 0.0065785483457148075,
+      "learning_rate": 0.001,
+      "loss": 0.4503,
+      "step": 10497
+    },
+    {
+      "epoch": 0.2896629267737474,
+      "grad_norm": 0.0034503082279115915,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 10498
+    },
+    {
+      "epoch": 0.2896905189748118,
+      "grad_norm": 0.0028073948342353106,
+      "learning_rate": 0.001,
+      "loss": 0.3872,
+      "step": 10499
+    },
+    {
+      "epoch": 0.28971811117587615,
+      "grad_norm": 0.006891318131238222,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 10500
+    },
+    {
+      "epoch": 0.28971811117587615,
+      "eval_runtime": 23.9893,
+      "eval_samples_per_second": 1.334,
+      "eval_steps_per_second": 0.167,
+      "step": 10500
+    },
+    {
+      "epoch": 0.2897457033769405,
+      "grad_norm": 0.0038485617842525244,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 10501
+    },
+    {
+      "epoch": 0.28977329557800485,
+      "grad_norm": 0.004269871395081282,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 10502
+    },
+    {
+      "epoch": 0.28980088777906926,
+      "grad_norm": 0.00325970770791173,
+      "learning_rate": 0.001,
+      "loss": 0.4373,
+      "step": 10503
+    },
+    {
+      "epoch": 0.2898284799801336,
+      "grad_norm": 0.0031080457847565413,
+      "learning_rate": 0.001,
+      "loss": 0.4237,
+      "step": 10504
+    },
+    {
+      "epoch": 0.28985607218119797,
+      "grad_norm": 0.003122104099020362,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 10505
+    },
+    {
+      "epoch": 0.2898836643822624,
+      "grad_norm": 0.003600063733756542,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 10506
+    },
+    {
+      "epoch": 0.2899112565833267,
+      "grad_norm": 0.004156127572059631,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 10507
+    },
+    {
+      "epoch": 0.2899388487843911,
+      "grad_norm": 0.0035865693353116512,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 10508
+    },
+    {
+      "epoch": 0.2899664409854555,
+      "grad_norm": 0.004097979050129652,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 10509
+    },
+    {
+      "epoch": 0.28999403318651984,
+      "grad_norm": 0.006173746194690466,
+      "learning_rate": 0.001,
+      "loss": 0.4409,
+      "step": 10510
+    },
+    {
+      "epoch": 0.2900216253875842,
+      "grad_norm": 0.0032481453381478786,
+      "learning_rate": 0.001,
+      "loss": 0.4437,
+      "step": 10511
+    },
+    {
+      "epoch": 0.29004921758864854,
+      "grad_norm": 0.009451840072870255,
+      "learning_rate": 0.001,
+      "loss": 0.3547,
+      "step": 10512
+    },
+    {
+      "epoch": 0.29007680978971295,
+      "grad_norm": 0.0031670001335442066,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 10513
+    },
+    {
+      "epoch": 0.2901044019907773,
+      "grad_norm": 0.0040013547986745834,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 10514
+    },
+    {
+      "epoch": 0.29013199419184166,
+      "grad_norm": 0.01435109507292509,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 10515
+    },
+    {
+      "epoch": 0.29015958639290607,
+      "grad_norm": 0.014031428843736649,
+      "learning_rate": 0.001,
+      "loss": 0.3909,
+      "step": 10516
+    },
+    {
+      "epoch": 0.2901871785939704,
+      "grad_norm": 0.021386094391345978,
+      "learning_rate": 0.001,
+      "loss": 0.4344,
+      "step": 10517
+    },
+    {
+      "epoch": 0.29021477079503477,
+      "grad_norm": 0.01621176116168499,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 10518
+    },
+    {
+      "epoch": 0.2902423629960992,
+      "grad_norm": 0.1645982414484024,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 10519
+    },
+    {
+      "epoch": 0.29026995519716353,
+      "grad_norm": 0.009323320351541042,
+      "learning_rate": 0.001,
+      "loss": 0.3349,
+      "step": 10520
+    },
+    {
+      "epoch": 0.2902975473982279,
+      "grad_norm": 0.010149064473807812,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 10521
+    },
+    {
+      "epoch": 0.29032513959929224,
+      "grad_norm": 0.004645455162972212,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 10522
+    },
+    {
+      "epoch": 0.29035273180035664,
+      "grad_norm": 0.005608486942946911,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 10523
+    },
+    {
+      "epoch": 0.290380324001421,
+      "grad_norm": 0.020055659115314484,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 10524
+    },
+    {
+      "epoch": 0.29040791620248535,
+      "grad_norm": 0.00331861968152225,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 10525
+    },
+    {
+      "epoch": 0.29043550840354976,
+      "grad_norm": 0.0030799147207289934,
+      "learning_rate": 0.001,
+      "loss": 0.4441,
+      "step": 10526
+    },
+    {
+      "epoch": 0.2904631006046141,
+      "grad_norm": 0.003175821155309677,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 10527
+    },
+    {
+      "epoch": 0.29049069280567846,
+      "grad_norm": 0.002744150348007679,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 10528
+    },
+    {
+      "epoch": 0.29051828500674287,
+      "grad_norm": 0.0017547330353409052,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 10529
+    },
+    {
+      "epoch": 0.2905458772078072,
+      "grad_norm": 0.002442681696265936,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 10530
+    },
+    {
+      "epoch": 0.2905734694088716,
+      "grad_norm": 0.0028025461360812187,
+      "learning_rate": 0.001,
+      "loss": 0.3758,
+      "step": 10531
+    },
+    {
+      "epoch": 0.29060106160993593,
+      "grad_norm": 0.003767822403460741,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 10532
+    },
+    {
+      "epoch": 0.29062865381100034,
+      "grad_norm": 0.031069811433553696,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 10533
+    },
+    {
+      "epoch": 0.2906562460120647,
+      "grad_norm": 0.005183485336601734,
+      "learning_rate": 0.001,
+      "loss": 0.3973,
+      "step": 10534
+    },
+    {
+      "epoch": 0.29068383821312904,
+      "grad_norm": 0.004126776475459337,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 10535
+    },
+    {
+      "epoch": 0.29071143041419345,
+      "grad_norm": 0.0029067141003906727,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 10536
+    },
+    {
+      "epoch": 0.2907390226152578,
+      "grad_norm": 0.00455186702311039,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 10537
+    },
+    {
+      "epoch": 0.29076661481632216,
+      "grad_norm": 0.0032677233684808016,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 10538
+    },
+    {
+      "epoch": 0.29079420701738656,
+      "grad_norm": 0.0033605170901864767,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 10539
+    },
+    {
+      "epoch": 0.2908217992184509,
+      "grad_norm": 0.0031016338616609573,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 10540
+    },
+    {
+      "epoch": 0.29084939141951527,
+      "grad_norm": 0.0022766580805182457,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 10541
+    },
+    {
+      "epoch": 0.2908769836205796,
+      "grad_norm": 0.0034821683075278997,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 10542
+    },
+    {
+      "epoch": 0.29090457582164403,
+      "grad_norm": 0.0025649424642324448,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 10543
+    },
+    {
+      "epoch": 0.2909321680227084,
+      "grad_norm": 0.004783533047884703,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 10544
+    },
+    {
+      "epoch": 0.29095976022377273,
+      "grad_norm": 0.007414715830236673,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 10545
+    },
+    {
+      "epoch": 0.29098735242483714,
+      "grad_norm": 0.010034749284386635,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 10546
+    },
+    {
+      "epoch": 0.2910149446259015,
+      "grad_norm": 0.0025835982523858547,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 10547
+    },
+    {
+      "epoch": 0.29104253682696585,
+      "grad_norm": 0.0022810695227235556,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 10548
+    },
+    {
+      "epoch": 0.2910701290280302,
+      "grad_norm": 0.0027416511438786983,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 10549
+    },
+    {
+      "epoch": 0.2910977212290946,
+      "grad_norm": 0.004057244397699833,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 10550
+    },
+    {
+      "epoch": 0.29112531343015896,
+      "grad_norm": 0.00185355672147125,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 10551
+    },
+    {
+      "epoch": 0.2911529056312233,
+      "grad_norm": 0.0033400054089725018,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 10552
+    },
+    {
+      "epoch": 0.2911804978322877,
+      "grad_norm": 0.005987055134028196,
+      "learning_rate": 0.001,
+      "loss": 0.3716,
+      "step": 10553
+    },
+    {
+      "epoch": 0.2912080900333521,
+      "grad_norm": 0.007598403375595808,
+      "learning_rate": 0.001,
+      "loss": 0.3706,
+      "step": 10554
+    },
+    {
+      "epoch": 0.2912356822344164,
+      "grad_norm": 0.00337135954760015,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 10555
+    },
+    {
+      "epoch": 0.29126327443548083,
+      "grad_norm": 0.003520526457577944,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 10556
+    },
+    {
+      "epoch": 0.2912908666365452,
+      "grad_norm": 0.002943966304883361,
+      "learning_rate": 0.001,
+      "loss": 0.3459,
+      "step": 10557
+    },
+    {
+      "epoch": 0.29131845883760954,
+      "grad_norm": 0.004878449719399214,
+      "learning_rate": 0.001,
+      "loss": 0.3735,
+      "step": 10558
+    },
+    {
+      "epoch": 0.2913460510386739,
+      "grad_norm": 0.0023713402915745974,
+      "learning_rate": 0.001,
+      "loss": 0.4028,
+      "step": 10559
+    },
+    {
+      "epoch": 0.2913736432397383,
+      "grad_norm": 0.0031815902329981327,
+      "learning_rate": 0.001,
+      "loss": 0.3708,
+      "step": 10560
+    },
+    {
+      "epoch": 0.29140123544080265,
+      "grad_norm": 0.002473505213856697,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 10561
+    },
+    {
+      "epoch": 0.291428827641867,
+      "grad_norm": 0.002570673357695341,
+      "learning_rate": 0.001,
+      "loss": 0.3712,
+      "step": 10562
+    },
+    {
+      "epoch": 0.2914564198429314,
+      "grad_norm": 0.004323242697864771,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 10563
+    },
+    {
+      "epoch": 0.29148401204399577,
+      "grad_norm": 0.003748292336240411,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 10564
+    },
+    {
+      "epoch": 0.2915116042450601,
+      "grad_norm": 0.0039474996738135815,
+      "learning_rate": 0.001,
+      "loss": 0.424,
+      "step": 10565
+    },
+    {
+      "epoch": 0.2915391964461245,
+      "grad_norm": 0.003185428213328123,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 10566
+    },
+    {
+      "epoch": 0.2915667886471889,
+      "grad_norm": 0.008764101192355156,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 10567
+    },
+    {
+      "epoch": 0.29159438084825323,
+      "grad_norm": 0.006038394290953875,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 10568
+    },
+    {
+      "epoch": 0.2916219730493176,
+      "grad_norm": 0.00498878164216876,
+      "learning_rate": 0.001,
+      "loss": 0.4099,
+      "step": 10569
+    },
+    {
+      "epoch": 0.291649565250382,
+      "grad_norm": 0.002792202867567539,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 10570
+    },
+    {
+      "epoch": 0.29167715745144634,
+      "grad_norm": 0.004675569478422403,
+      "learning_rate": 0.001,
+      "loss": 0.4226,
+      "step": 10571
+    },
+    {
+      "epoch": 0.2917047496525107,
+      "grad_norm": 0.00361837400123477,
+      "learning_rate": 0.001,
+      "loss": 0.3538,
+      "step": 10572
+    },
+    {
+      "epoch": 0.2917323418535751,
+      "grad_norm": 0.00293007493019104,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 10573
+    },
+    {
+      "epoch": 0.29175993405463946,
+      "grad_norm": 0.0027430958580225706,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 10574
+    },
+    {
+      "epoch": 0.2917875262557038,
+      "grad_norm": 0.0023098003584891558,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 10575
+    },
+    {
+      "epoch": 0.2918151184567682,
+      "grad_norm": 0.0030539468862116337,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 10576
+    },
+    {
+      "epoch": 0.29184271065783257,
+      "grad_norm": 0.004506285302340984,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 10577
+    },
+    {
+      "epoch": 0.2918703028588969,
+      "grad_norm": 0.00386527506634593,
+      "learning_rate": 0.001,
+      "loss": 0.4258,
+      "step": 10578
+    },
+    {
+      "epoch": 0.2918978950599613,
+      "grad_norm": 0.0044982582330703735,
+      "learning_rate": 0.001,
+      "loss": 0.3744,
+      "step": 10579
+    },
+    {
+      "epoch": 0.2919254872610257,
+      "grad_norm": 0.005035079549998045,
+      "learning_rate": 0.001,
+      "loss": 0.3732,
+      "step": 10580
+    },
+    {
+      "epoch": 0.29195307946209004,
+      "grad_norm": 0.007419761270284653,
+      "learning_rate": 0.001,
+      "loss": 0.3572,
+      "step": 10581
+    },
+    {
+      "epoch": 0.2919806716631544,
+      "grad_norm": 0.003058833070099354,
+      "learning_rate": 0.001,
+      "loss": 0.4311,
+      "step": 10582
+    },
+    {
+      "epoch": 0.2920082638642188,
+      "grad_norm": 0.008622899651527405,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 10583
+    },
+    {
+      "epoch": 0.29203585606528315,
+      "grad_norm": 0.0038068420253694057,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 10584
+    },
+    {
+      "epoch": 0.2920634482663475,
+      "grad_norm": 0.0027387929148972034,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 10585
+    },
+    {
+      "epoch": 0.2920910404674119,
+      "grad_norm": 0.004177779424935579,
+      "learning_rate": 0.001,
+      "loss": 0.3726,
+      "step": 10586
+    },
+    {
+      "epoch": 0.29211863266847626,
+      "grad_norm": 0.0050719305872917175,
+      "learning_rate": 0.001,
+      "loss": 0.4282,
+      "step": 10587
+    },
+    {
+      "epoch": 0.2921462248695406,
+      "grad_norm": 0.00421450100839138,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 10588
+    },
+    {
+      "epoch": 0.29217381707060497,
+      "grad_norm": 0.031578365713357925,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 10589
+    },
+    {
+      "epoch": 0.2922014092716694,
+      "grad_norm": 0.008609611541032791,
+      "learning_rate": 0.001,
+      "loss": 0.3712,
+      "step": 10590
+    },
+    {
+      "epoch": 0.29222900147273373,
+      "grad_norm": 0.005752986762672663,
+      "learning_rate": 0.001,
+      "loss": 0.38,
+      "step": 10591
+    },
+    {
+      "epoch": 0.2922565936737981,
+      "grad_norm": 0.004760433454066515,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 10592
+    },
+    {
+      "epoch": 0.2922841858748625,
+      "grad_norm": 0.0037690645549446344,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 10593
+    },
+    {
+      "epoch": 0.29231177807592684,
+      "grad_norm": 0.0029250262305140495,
+      "learning_rate": 0.001,
+      "loss": 0.4383,
+      "step": 10594
+    },
+    {
+      "epoch": 0.2923393702769912,
+      "grad_norm": 0.002340389881283045,
+      "learning_rate": 0.001,
+      "loss": 0.4113,
+      "step": 10595
+    },
+    {
+      "epoch": 0.2923669624780556,
+      "grad_norm": 0.002268069889396429,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 10596
+    },
+    {
+      "epoch": 0.29239455467911996,
+      "grad_norm": 0.004916078876703978,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 10597
+    },
+    {
+      "epoch": 0.2924221468801843,
+      "grad_norm": 0.0026404778473079205,
+      "learning_rate": 0.001,
+      "loss": 0.3631,
+      "step": 10598
+    },
+    {
+      "epoch": 0.29244973908124866,
+      "grad_norm": 0.003348248079419136,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 10599
+    },
+    {
+      "epoch": 0.29247733128231307,
+      "grad_norm": 0.0029468834400177,
+      "learning_rate": 0.001,
+      "loss": 0.3734,
+      "step": 10600
+    },
+    {
+      "epoch": 0.2925049234833774,
+      "grad_norm": 0.00501684146001935,
+      "learning_rate": 0.001,
+      "loss": 0.4221,
+      "step": 10601
+    },
+    {
+      "epoch": 0.2925325156844418,
+      "grad_norm": 0.002596599282696843,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 10602
+    },
+    {
+      "epoch": 0.2925601078855062,
+      "grad_norm": 0.004242388531565666,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 10603
+    },
+    {
+      "epoch": 0.29258770008657053,
+      "grad_norm": 0.003474390832707286,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 10604
+    },
+    {
+      "epoch": 0.2926152922876349,
+      "grad_norm": 0.0028386737685650587,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 10605
+    },
+    {
+      "epoch": 0.2926428844886993,
+      "grad_norm": 0.005176781211048365,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 10606
+    },
+    {
+      "epoch": 0.29267047668976365,
+      "grad_norm": 0.002886307192966342,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 10607
+    },
+    {
+      "epoch": 0.292698068890828,
+      "grad_norm": 0.0048707169480621815,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 10608
+    },
+    {
+      "epoch": 0.29272566109189235,
+      "grad_norm": 0.003168995026499033,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 10609
+    },
+    {
+      "epoch": 0.29275325329295676,
+      "grad_norm": 0.00348648545332253,
+      "learning_rate": 0.001,
+      "loss": 0.3736,
+      "step": 10610
+    },
+    {
+      "epoch": 0.2927808454940211,
+      "grad_norm": 0.002645922591909766,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 10611
+    },
+    {
+      "epoch": 0.29280843769508547,
+      "grad_norm": 0.002931213239207864,
+      "learning_rate": 0.001,
+      "loss": 0.3761,
+      "step": 10612
+    },
+    {
+      "epoch": 0.2928360298961499,
+      "grad_norm": 0.003571485634893179,
+      "learning_rate": 0.001,
+      "loss": 0.3484,
+      "step": 10613
+    },
+    {
+      "epoch": 0.2928636220972142,
+      "grad_norm": 0.002395814750343561,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 10614
+    },
+    {
+      "epoch": 0.2928912142982786,
+      "grad_norm": 0.007757443469017744,
+      "learning_rate": 0.001,
+      "loss": 0.3728,
+      "step": 10615
+    },
+    {
+      "epoch": 0.292918806499343,
+      "grad_norm": 0.003894525347277522,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 10616
+    },
+    {
+      "epoch": 0.29294639870040734,
+      "grad_norm": 0.005945149809122086,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 10617
+    },
+    {
+      "epoch": 0.2929739909014717,
+      "grad_norm": 0.005653233267366886,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 10618
+    },
+    {
+      "epoch": 0.29300158310253605,
+      "grad_norm": 0.006182766519486904,
+      "learning_rate": 0.001,
+      "loss": 0.4181,
+      "step": 10619
+    },
+    {
+      "epoch": 0.29302917530360045,
+      "grad_norm": 0.004933160729706287,
+      "learning_rate": 0.001,
+      "loss": 0.4209,
+      "step": 10620
+    },
+    {
+      "epoch": 0.2930567675046648,
+      "grad_norm": 0.004339593928307295,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 10621
+    },
+    {
+      "epoch": 0.29308435970572916,
+      "grad_norm": 0.0027681838255375624,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 10622
+    },
+    {
+      "epoch": 0.29311195190679357,
+      "grad_norm": 0.002985588042065501,
+      "learning_rate": 0.001,
+      "loss": 0.4367,
+      "step": 10623
+    },
+    {
+      "epoch": 0.2931395441078579,
+      "grad_norm": 0.003226140746846795,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 10624
+    },
+    {
+      "epoch": 0.29316713630892227,
+      "grad_norm": 0.00455946521833539,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 10625
+    },
+    {
+      "epoch": 0.2931947285099867,
+      "grad_norm": 0.004617081955075264,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 10626
+    },
+    {
+      "epoch": 0.29322232071105103,
+      "grad_norm": 0.006010159850120544,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 10627
+    },
+    {
+      "epoch": 0.2932499129121154,
+      "grad_norm": 0.0038819455076009035,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 10628
+    },
+    {
+      "epoch": 0.29327750511317974,
+      "grad_norm": 0.0035864575766026974,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 10629
+    },
+    {
+      "epoch": 0.29330509731424415,
+      "grad_norm": 0.002191227860748768,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 10630
+    },
+    {
+      "epoch": 0.2933326895153085,
+      "grad_norm": 0.004282178822904825,
+      "learning_rate": 0.001,
+      "loss": 0.3453,
+      "step": 10631
+    },
+    {
+      "epoch": 0.29336028171637285,
+      "grad_norm": 0.0022272500209510326,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 10632
+    },
+    {
+      "epoch": 0.29338787391743726,
+      "grad_norm": 0.002555177314206958,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 10633
+    },
+    {
+      "epoch": 0.2934154661185016,
+      "grad_norm": 0.006331083830446005,
+      "learning_rate": 0.001,
+      "loss": 0.3625,
+      "step": 10634
+    },
+    {
+      "epoch": 0.29344305831956596,
+      "grad_norm": 0.002670933725312352,
+      "learning_rate": 0.001,
+      "loss": 0.4096,
+      "step": 10635
+    },
+    {
+      "epoch": 0.2934706505206303,
+      "grad_norm": 0.004304058384150267,
+      "learning_rate": 0.001,
+      "loss": 0.3682,
+      "step": 10636
+    },
+    {
+      "epoch": 0.2934982427216947,
+      "grad_norm": 0.0024331146851181984,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 10637
+    },
+    {
+      "epoch": 0.2935258349227591,
+      "grad_norm": 0.003931795712560415,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 10638
+    },
+    {
+      "epoch": 0.29355342712382343,
+      "grad_norm": 0.004206740763038397,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 10639
+    },
+    {
+      "epoch": 0.29358101932488784,
+      "grad_norm": 0.0031811101362109184,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 10640
+    },
+    {
+      "epoch": 0.2936086115259522,
+      "grad_norm": 0.002255258383229375,
+      "learning_rate": 0.001,
+      "loss": 0.3808,
+      "step": 10641
+    },
+    {
+      "epoch": 0.29363620372701654,
+      "grad_norm": 0.002453514840453863,
+      "learning_rate": 0.001,
+      "loss": 0.4223,
+      "step": 10642
+    },
+    {
+      "epoch": 0.29366379592808095,
+      "grad_norm": 0.002259905217215419,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 10643
+    },
+    {
+      "epoch": 0.2936913881291453,
+      "grad_norm": 0.0024826505687087774,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 10644
+    },
+    {
+      "epoch": 0.29371898033020966,
+      "grad_norm": 0.0024130498059093952,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 10645
+    },
+    {
+      "epoch": 0.293746572531274,
+      "grad_norm": 0.002940199337899685,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 10646
+    },
+    {
+      "epoch": 0.2937741647323384,
+      "grad_norm": 0.0037300034891813993,
+      "learning_rate": 0.001,
+      "loss": 0.3452,
+      "step": 10647
+    },
+    {
+      "epoch": 0.29380175693340277,
+      "grad_norm": 0.0023701984900981188,
+      "learning_rate": 0.001,
+      "loss": 0.4345,
+      "step": 10648
+    },
+    {
+      "epoch": 0.2938293491344671,
+      "grad_norm": 0.0023255145642906427,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 10649
+    },
+    {
+      "epoch": 0.29385694133553153,
+      "grad_norm": 0.001929348916746676,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 10650
+    },
+    {
+      "epoch": 0.2938845335365959,
+      "grad_norm": 0.003764538327232003,
+      "learning_rate": 0.001,
+      "loss": 0.4241,
+      "step": 10651
+    },
+    {
+      "epoch": 0.29391212573766023,
+      "grad_norm": 0.0031687645241618156,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 10652
+    },
+    {
+      "epoch": 0.29393971793872464,
+      "grad_norm": 0.0023800362832844257,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 10653
+    },
+    {
+      "epoch": 0.293967310139789,
+      "grad_norm": 0.003322491655126214,
+      "learning_rate": 0.001,
+      "loss": 0.3802,
+      "step": 10654
+    },
+    {
+      "epoch": 0.29399490234085335,
+      "grad_norm": 0.003406172851100564,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 10655
+    },
+    {
+      "epoch": 0.2940224945419177,
+      "grad_norm": 0.0024194566067308187,
+      "learning_rate": 0.001,
+      "loss": 0.3699,
+      "step": 10656
+    },
+    {
+      "epoch": 0.2940500867429821,
+      "grad_norm": 0.002812894992530346,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 10657
+    },
+    {
+      "epoch": 0.29407767894404646,
+      "grad_norm": 0.004144003614783287,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 10658
+    },
+    {
+      "epoch": 0.2941052711451108,
+      "grad_norm": 0.0025214876513928175,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 10659
+    },
+    {
+      "epoch": 0.2941328633461752,
+      "grad_norm": 0.0037252691108733416,
+      "learning_rate": 0.001,
+      "loss": 0.3846,
+      "step": 10660
+    },
+    {
+      "epoch": 0.2941604555472396,
+      "grad_norm": 0.0028469781391322613,
+      "learning_rate": 0.001,
+      "loss": 0.4039,
+      "step": 10661
+    },
+    {
+      "epoch": 0.2941880477483039,
+      "grad_norm": 0.004957746714353561,
+      "learning_rate": 0.001,
+      "loss": 0.3672,
+      "step": 10662
+    },
+    {
+      "epoch": 0.29421563994936833,
+      "grad_norm": 0.0030585613567382097,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 10663
+    },
+    {
+      "epoch": 0.2942432321504327,
+      "grad_norm": 0.003800381440669298,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 10664
+    },
+    {
+      "epoch": 0.29427082435149704,
+      "grad_norm": 0.0074830930680036545,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 10665
+    },
+    {
+      "epoch": 0.2942984165525614,
+      "grad_norm": 0.0026420445647090673,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 10666
+    },
+    {
+      "epoch": 0.2943260087536258,
+      "grad_norm": 0.0033852518536150455,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 10667
+    },
+    {
+      "epoch": 0.29435360095469015,
+      "grad_norm": 0.004352671559900045,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 10668
+    },
+    {
+      "epoch": 0.2943811931557545,
+      "grad_norm": 0.002693094778805971,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 10669
+    },
+    {
+      "epoch": 0.2944087853568189,
+      "grad_norm": 0.0030384764540940523,
+      "learning_rate": 0.001,
+      "loss": 0.3952,
+      "step": 10670
+    },
+    {
+      "epoch": 0.29443637755788327,
+      "grad_norm": 0.0021408821921795607,
+      "learning_rate": 0.001,
+      "loss": 0.4297,
+      "step": 10671
+    },
+    {
+      "epoch": 0.2944639697589476,
+      "grad_norm": 0.0028972120489925146,
+      "learning_rate": 0.001,
+      "loss": 0.3687,
+      "step": 10672
+    },
+    {
+      "epoch": 0.294491561960012,
+      "grad_norm": 0.0024992034304887056,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 10673
+    },
+    {
+      "epoch": 0.2945191541610764,
+      "grad_norm": 0.004567326512187719,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 10674
+    },
+    {
+      "epoch": 0.29454674636214073,
+      "grad_norm": 0.0030367637518793344,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 10675
+    },
+    {
+      "epoch": 0.2945743385632051,
+      "grad_norm": 0.003327249316498637,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 10676
+    },
+    {
+      "epoch": 0.2946019307642695,
+      "grad_norm": 0.00256183254532516,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 10677
+    },
+    {
+      "epoch": 0.29462952296533385,
+      "grad_norm": 0.004928114358335733,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 10678
+    },
+    {
+      "epoch": 0.2946571151663982,
+      "grad_norm": 0.0023404727689921856,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 10679
+    },
+    {
+      "epoch": 0.2946847073674626,
+      "grad_norm": 0.0023969600442796946,
+      "learning_rate": 0.001,
+      "loss": 0.4028,
+      "step": 10680
+    },
+    {
+      "epoch": 0.29471229956852696,
+      "grad_norm": 0.0030103707686066628,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 10681
+    },
+    {
+      "epoch": 0.2947398917695913,
+      "grad_norm": 0.003063908079639077,
+      "learning_rate": 0.001,
+      "loss": 0.4101,
+      "step": 10682
+    },
+    {
+      "epoch": 0.2947674839706557,
+      "grad_norm": 0.0045205047354102135,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 10683
+    },
+    {
+      "epoch": 0.29479507617172007,
+      "grad_norm": 0.0176579337567091,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 10684
+    },
+    {
+      "epoch": 0.2948226683727844,
+      "grad_norm": 0.011839848011732101,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 10685
+    },
+    {
+      "epoch": 0.2948502605738488,
+      "grad_norm": 0.01756063662469387,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 10686
+    },
+    {
+      "epoch": 0.2948778527749132,
+      "grad_norm": 0.009184034541249275,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 10687
+    },
+    {
+      "epoch": 0.29490544497597754,
+      "grad_norm": 0.005395642481744289,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 10688
+    },
+    {
+      "epoch": 0.2949330371770419,
+      "grad_norm": 0.016659358516335487,
+      "learning_rate": 0.001,
+      "loss": 0.3581,
+      "step": 10689
+    },
+    {
+      "epoch": 0.2949606293781063,
+      "grad_norm": 0.01040132250636816,
+      "learning_rate": 0.001,
+      "loss": 0.3883,
+      "step": 10690
+    },
+    {
+      "epoch": 0.29498822157917065,
+      "grad_norm": 0.003244626335799694,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 10691
+    },
+    {
+      "epoch": 0.295015813780235,
+      "grad_norm": 0.004035422578454018,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 10692
+    },
+    {
+      "epoch": 0.2950434059812994,
+      "grad_norm": 0.00780785595998168,
+      "learning_rate": 0.001,
+      "loss": 0.4043,
+      "step": 10693
+    },
+    {
+      "epoch": 0.29507099818236376,
+      "grad_norm": 0.010562491603195667,
+      "learning_rate": 0.001,
+      "loss": 0.4504,
+      "step": 10694
+    },
+    {
+      "epoch": 0.2950985903834281,
+      "grad_norm": 0.003506020875647664,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 10695
+    },
+    {
+      "epoch": 0.29512618258449247,
+      "grad_norm": 0.0033239612821489573,
+      "learning_rate": 0.001,
+      "loss": 0.4266,
+      "step": 10696
+    },
+    {
+      "epoch": 0.2951537747855569,
+      "grad_norm": 0.0031016524881124496,
+      "learning_rate": 0.001,
+      "loss": 0.445,
+      "step": 10697
+    },
+    {
+      "epoch": 0.29518136698662123,
+      "grad_norm": 0.0037100305780768394,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 10698
+    },
+    {
+      "epoch": 0.2952089591876856,
+      "grad_norm": 0.003935493528842926,
+      "learning_rate": 0.001,
+      "loss": 0.3695,
+      "step": 10699
+    },
+    {
+      "epoch": 0.29523655138875,
+      "grad_norm": 0.0022481977939605713,
+      "learning_rate": 0.001,
+      "loss": 0.4624,
+      "step": 10700
+    },
+    {
+      "epoch": 0.29526414358981434,
+      "grad_norm": 0.003701514797285199,
+      "learning_rate": 0.001,
+      "loss": 0.4009,
+      "step": 10701
+    },
+    {
+      "epoch": 0.2952917357908787,
+      "grad_norm": 0.004181206226348877,
+      "learning_rate": 0.001,
+      "loss": 0.3765,
+      "step": 10702
+    },
+    {
+      "epoch": 0.2953193279919431,
+      "grad_norm": 0.0034305029548704624,
+      "learning_rate": 0.001,
+      "loss": 0.391,
+      "step": 10703
+    },
+    {
+      "epoch": 0.29534692019300746,
+      "grad_norm": 0.013461762107908726,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 10704
+    },
+    {
+      "epoch": 0.2953745123940718,
+      "grad_norm": 0.006567132659256458,
+      "learning_rate": 0.001,
+      "loss": 0.4033,
+      "step": 10705
+    },
+    {
+      "epoch": 0.29540210459513616,
+      "grad_norm": 0.006370114628225565,
+      "learning_rate": 0.001,
+      "loss": 0.3926,
+      "step": 10706
+    },
+    {
+      "epoch": 0.29542969679620057,
+      "grad_norm": 0.0061858720146119595,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 10707
+    },
+    {
+      "epoch": 0.2954572889972649,
+      "grad_norm": 0.0025320991408079863,
+      "learning_rate": 0.001,
+      "loss": 0.4031,
+      "step": 10708
+    },
+    {
+      "epoch": 0.2954848811983293,
+      "grad_norm": 0.0028619118966162205,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 10709
+    },
+    {
+      "epoch": 0.2955124733993937,
+      "grad_norm": 0.0026225673500448465,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 10710
+    },
+    {
+      "epoch": 0.29554006560045804,
+      "grad_norm": 0.0035380006302148104,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 10711
+    },
+    {
+      "epoch": 0.2955676578015224,
+      "grad_norm": 0.0031377810519188643,
+      "learning_rate": 0.001,
+      "loss": 0.4318,
+      "step": 10712
+    },
+    {
+      "epoch": 0.2955952500025868,
+      "grad_norm": 0.0046783569268882275,
+      "learning_rate": 0.001,
+      "loss": 0.3563,
+      "step": 10713
+    },
+    {
+      "epoch": 0.29562284220365115,
+      "grad_norm": 0.0029727064538747072,
+      "learning_rate": 0.001,
+      "loss": 0.4166,
+      "step": 10714
+    },
+    {
+      "epoch": 0.2956504344047155,
+      "grad_norm": 0.0036277880426496267,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 10715
+    },
+    {
+      "epoch": 0.29567802660577985,
+      "grad_norm": 0.002516635926440358,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 10716
+    },
+    {
+      "epoch": 0.29570561880684426,
+      "grad_norm": 0.0029823719523847103,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 10717
+    },
+    {
+      "epoch": 0.2957332110079086,
+      "grad_norm": 0.008971653878688812,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 10718
+    },
+    {
+      "epoch": 0.29576080320897297,
+      "grad_norm": 0.0029265042394399643,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 10719
+    },
+    {
+      "epoch": 0.2957883954100374,
+      "grad_norm": 0.003302923170849681,
+      "learning_rate": 0.001,
+      "loss": 0.4356,
+      "step": 10720
+    },
+    {
+      "epoch": 0.2958159876111017,
+      "grad_norm": 0.0031180151272565126,
+      "learning_rate": 0.001,
+      "loss": 0.4354,
+      "step": 10721
+    },
+    {
+      "epoch": 0.2958435798121661,
+      "grad_norm": 0.009599697776138783,
+      "learning_rate": 0.001,
+      "loss": 0.3762,
+      "step": 10722
+    },
+    {
+      "epoch": 0.2958711720132305,
+      "grad_norm": 0.0027971549425274134,
+      "learning_rate": 0.001,
+      "loss": 0.3832,
+      "step": 10723
+    },
+    {
+      "epoch": 0.29589876421429484,
+      "grad_norm": 0.0056414068676531315,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 10724
+    },
+    {
+      "epoch": 0.2959263564153592,
+      "grad_norm": 0.004426025785505772,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 10725
+    },
+    {
+      "epoch": 0.29595394861642355,
+      "grad_norm": 0.0021196724846959114,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 10726
+    },
+    {
+      "epoch": 0.29598154081748795,
+      "grad_norm": 0.002627891954034567,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 10727
+    },
+    {
+      "epoch": 0.2960091330185523,
+      "grad_norm": 0.006108472123742104,
+      "learning_rate": 0.001,
+      "loss": 0.3671,
+      "step": 10728
+    },
+    {
+      "epoch": 0.29603672521961666,
+      "grad_norm": 0.0034290605690330267,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 10729
+    },
+    {
+      "epoch": 0.29606431742068107,
+      "grad_norm": 0.0039821164682507515,
+      "learning_rate": 0.001,
+      "loss": 0.3657,
+      "step": 10730
+    },
+    {
+      "epoch": 0.2960919096217454,
+      "grad_norm": 0.0026045297272503376,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 10731
+    },
+    {
+      "epoch": 0.29611950182280977,
+      "grad_norm": 0.0025215772911906242,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 10732
+    },
+    {
+      "epoch": 0.2961470940238741,
+      "grad_norm": 0.0033087488263845444,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 10733
+    },
+    {
+      "epoch": 0.29617468622493853,
+      "grad_norm": 0.002935874741524458,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 10734
+    },
+    {
+      "epoch": 0.2962022784260029,
+      "grad_norm": 0.00557227386161685,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 10735
+    },
+    {
+      "epoch": 0.29622987062706724,
+      "grad_norm": 0.0036772515159100294,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 10736
+    },
+    {
+      "epoch": 0.29625746282813165,
+      "grad_norm": 0.004311760421842337,
+      "learning_rate": 0.001,
+      "loss": 0.3504,
+      "step": 10737
+    },
+    {
+      "epoch": 0.296285055029196,
+      "grad_norm": 0.0028161678928881884,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 10738
+    },
+    {
+      "epoch": 0.29631264723026035,
+      "grad_norm": 0.00484466552734375,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 10739
+    },
+    {
+      "epoch": 0.29634023943132476,
+      "grad_norm": 0.0023396711330860853,
+      "learning_rate": 0.001,
+      "loss": 0.4433,
+      "step": 10740
+    },
+    {
+      "epoch": 0.2963678316323891,
+      "grad_norm": 0.0026779670733958483,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 10741
+    },
+    {
+      "epoch": 0.29639542383345346,
+      "grad_norm": 0.002949584275484085,
+      "learning_rate": 0.001,
+      "loss": 0.34,
+      "step": 10742
+    },
+    {
+      "epoch": 0.2964230160345178,
+      "grad_norm": 0.00291722291149199,
+      "learning_rate": 0.001,
+      "loss": 0.3756,
+      "step": 10743
+    },
+    {
+      "epoch": 0.2964506082355822,
+      "grad_norm": 0.00298800109885633,
+      "learning_rate": 0.001,
+      "loss": 0.3669,
+      "step": 10744
+    },
+    {
+      "epoch": 0.2964782004366466,
+      "grad_norm": 0.003106021322309971,
+      "learning_rate": 0.001,
+      "loss": 0.4179,
+      "step": 10745
+    },
+    {
+      "epoch": 0.29650579263771093,
+      "grad_norm": 0.002655046060681343,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 10746
+    },
+    {
+      "epoch": 0.29653338483877534,
+      "grad_norm": 0.003087179269641638,
+      "learning_rate": 0.001,
+      "loss": 0.418,
+      "step": 10747
+    },
+    {
+      "epoch": 0.2965609770398397,
+      "grad_norm": 0.004109977278858423,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 10748
+    },
+    {
+      "epoch": 0.29658856924090404,
+      "grad_norm": 0.0026674012187868357,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 10749
+    },
+    {
+      "epoch": 0.29661616144196845,
+      "grad_norm": 0.004147304221987724,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 10750
+    },
+    {
+      "epoch": 0.2966437536430328,
+      "grad_norm": 0.0035869008861482143,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 10751
+    },
+    {
+      "epoch": 0.29667134584409716,
+      "grad_norm": 0.0047487919218838215,
+      "learning_rate": 0.001,
+      "loss": 0.4298,
+      "step": 10752
+    },
+    {
+      "epoch": 0.2966989380451615,
+      "grad_norm": 0.004151053261011839,
+      "learning_rate": 0.001,
+      "loss": 0.4299,
+      "step": 10753
+    },
+    {
+      "epoch": 0.2967265302462259,
+      "grad_norm": 0.007814058102667332,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 10754
+    },
+    {
+      "epoch": 0.29675412244729027,
+      "grad_norm": 0.011301838792860508,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 10755
+    },
+    {
+      "epoch": 0.2967817146483546,
+      "grad_norm": 0.00466828653588891,
+      "learning_rate": 0.001,
+      "loss": 0.411,
+      "step": 10756
+    },
+    {
+      "epoch": 0.29680930684941903,
+      "grad_norm": 0.002339472994208336,
+      "learning_rate": 0.001,
+      "loss": 0.4028,
+      "step": 10757
+    },
+    {
+      "epoch": 0.2968368990504834,
+      "grad_norm": 0.0024191855918616056,
+      "learning_rate": 0.001,
+      "loss": 0.411,
+      "step": 10758
+    },
+    {
+      "epoch": 0.29686449125154774,
+      "grad_norm": 0.003175315447151661,
+      "learning_rate": 0.001,
+      "loss": 0.4209,
+      "step": 10759
+    },
+    {
+      "epoch": 0.29689208345261214,
+      "grad_norm": 0.010462275706231594,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 10760
+    },
+    {
+      "epoch": 0.2969196756536765,
+      "grad_norm": 0.002783535746857524,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 10761
+    },
+    {
+      "epoch": 0.29694726785474085,
+      "grad_norm": 0.0035620226990431547,
+      "learning_rate": 0.001,
+      "loss": 0.4312,
+      "step": 10762
+    },
+    {
+      "epoch": 0.2969748600558052,
+      "grad_norm": 0.003987728152424097,
+      "learning_rate": 0.001,
+      "loss": 0.385,
+      "step": 10763
+    },
+    {
+      "epoch": 0.2970024522568696,
+      "grad_norm": 0.0033255494199693203,
+      "learning_rate": 0.001,
+      "loss": 0.448,
+      "step": 10764
+    },
+    {
+      "epoch": 0.29703004445793396,
+      "grad_norm": 0.0032936478964984417,
+      "learning_rate": 0.001,
+      "loss": 0.3832,
+      "step": 10765
+    },
+    {
+      "epoch": 0.2970576366589983,
+      "grad_norm": 0.0029676929116249084,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 10766
+    },
+    {
+      "epoch": 0.2970852288600627,
+      "grad_norm": 0.0031973712611943483,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 10767
+    },
+    {
+      "epoch": 0.2971128210611271,
+      "grad_norm": 0.00365530326962471,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 10768
+    },
+    {
+      "epoch": 0.2971404132621914,
+      "grad_norm": 0.0027721880469471216,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 10769
+    },
+    {
+      "epoch": 0.29716800546325584,
+      "grad_norm": 0.003800489939749241,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 10770
+    },
+    {
+      "epoch": 0.2971955976643202,
+      "grad_norm": 0.0025981413200497627,
+      "learning_rate": 0.001,
+      "loss": 0.3701,
+      "step": 10771
+    },
+    {
+      "epoch": 0.29722318986538454,
+      "grad_norm": 0.002971008885651827,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 10772
+    },
+    {
+      "epoch": 0.2972507820664489,
+      "grad_norm": 0.0038061172235757113,
+      "learning_rate": 0.001,
+      "loss": 0.3942,
+      "step": 10773
+    },
+    {
+      "epoch": 0.2972783742675133,
+      "grad_norm": 0.006774569861590862,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 10774
+    },
+    {
+      "epoch": 0.29730596646857765,
+      "grad_norm": 0.002431475091725588,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 10775
+    },
+    {
+      "epoch": 0.297333558669642,
+      "grad_norm": 0.0034196816850453615,
+      "learning_rate": 0.001,
+      "loss": 0.4186,
+      "step": 10776
+    },
+    {
+      "epoch": 0.2973611508707064,
+      "grad_norm": 0.0041458397172391415,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 10777
+    },
+    {
+      "epoch": 0.29738874307177077,
+      "grad_norm": 0.0031936741434037685,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 10778
+    },
+    {
+      "epoch": 0.2974163352728351,
+      "grad_norm": 0.0030546991620212793,
+      "learning_rate": 0.001,
+      "loss": 0.3683,
+      "step": 10779
+    },
+    {
+      "epoch": 0.2974439274738995,
+      "grad_norm": 0.0025455260183662176,
+      "learning_rate": 0.001,
+      "loss": 0.4408,
+      "step": 10780
+    },
+    {
+      "epoch": 0.2974715196749639,
+      "grad_norm": 0.002307919319719076,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 10781
+    },
+    {
+      "epoch": 0.29749911187602823,
+      "grad_norm": 0.0028700283728539944,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 10782
+    },
+    {
+      "epoch": 0.2975267040770926,
+      "grad_norm": 0.0026034286711364985,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 10783
+    },
+    {
+      "epoch": 0.297554296278157,
+      "grad_norm": 0.005439944099634886,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 10784
+    },
+    {
+      "epoch": 0.29758188847922135,
+      "grad_norm": 0.004825190175324678,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 10785
+    },
+    {
+      "epoch": 0.2976094806802857,
+      "grad_norm": 0.0020885509438812733,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 10786
+    },
+    {
+      "epoch": 0.2976370728813501,
+      "grad_norm": 0.0027620510663837194,
+      "learning_rate": 0.001,
+      "loss": 0.436,
+      "step": 10787
+    },
+    {
+      "epoch": 0.29766466508241446,
+      "grad_norm": 0.002639887621626258,
+      "learning_rate": 0.001,
+      "loss": 0.3654,
+      "step": 10788
+    },
+    {
+      "epoch": 0.2976922572834788,
+      "grad_norm": 0.0022529284469783306,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 10789
+    },
+    {
+      "epoch": 0.2977198494845432,
+      "grad_norm": 0.0026258870493620634,
+      "learning_rate": 0.001,
+      "loss": 0.4102,
+      "step": 10790
+    },
+    {
+      "epoch": 0.2977474416856076,
+      "grad_norm": 0.0029656942933797836,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 10791
+    },
+    {
+      "epoch": 0.2977750338866719,
+      "grad_norm": 0.0020525925792753696,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 10792
+    },
+    {
+      "epoch": 0.2978026260877363,
+      "grad_norm": 0.0030268540140241385,
+      "learning_rate": 0.001,
+      "loss": 0.4068,
+      "step": 10793
+    },
+    {
+      "epoch": 0.2978302182888007,
+      "grad_norm": 0.003296051872894168,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 10794
+    },
+    {
+      "epoch": 0.29785781048986504,
+      "grad_norm": 0.002267766511067748,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 10795
+    },
+    {
+      "epoch": 0.2978854026909294,
+      "grad_norm": 0.0039958711713552475,
+      "learning_rate": 0.001,
+      "loss": 0.3643,
+      "step": 10796
+    },
+    {
+      "epoch": 0.2979129948919938,
+      "grad_norm": 0.003096002619713545,
+      "learning_rate": 0.001,
+      "loss": 0.356,
+      "step": 10797
+    },
+    {
+      "epoch": 0.29794058709305815,
+      "grad_norm": 0.004919158294796944,
+      "learning_rate": 0.001,
+      "loss": 0.3677,
+      "step": 10798
+    },
+    {
+      "epoch": 0.2979681792941225,
+      "grad_norm": 0.002651217393577099,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 10799
+    },
+    {
+      "epoch": 0.2979957714951869,
+      "grad_norm": 0.0031349128112196922,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 10800
+    },
+    {
+      "epoch": 0.29802336369625126,
+      "grad_norm": 0.0057730907574296,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 10801
+    },
+    {
+      "epoch": 0.2980509558973156,
+      "grad_norm": 0.004934222903102636,
+      "learning_rate": 0.001,
+      "loss": 0.3653,
+      "step": 10802
+    },
+    {
+      "epoch": 0.29807854809837997,
+      "grad_norm": 0.003473837161436677,
+      "learning_rate": 0.001,
+      "loss": 0.3967,
+      "step": 10803
+    },
+    {
+      "epoch": 0.2981061402994444,
+      "grad_norm": 0.0036314206663519144,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 10804
+    },
+    {
+      "epoch": 0.29813373250050873,
+      "grad_norm": 0.004540051333606243,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 10805
+    },
+    {
+      "epoch": 0.2981613247015731,
+      "grad_norm": 0.002627008128911257,
+      "learning_rate": 0.001,
+      "loss": 0.4118,
+      "step": 10806
+    },
+    {
+      "epoch": 0.2981889169026375,
+      "grad_norm": 0.0027296175248920918,
+      "learning_rate": 0.001,
+      "loss": 0.3905,
+      "step": 10807
+    },
+    {
+      "epoch": 0.29821650910370184,
+      "grad_norm": 0.0026647706981748343,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 10808
+    },
+    {
+      "epoch": 0.2982441013047662,
+      "grad_norm": 0.003241637721657753,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 10809
+    },
+    {
+      "epoch": 0.2982716935058306,
+      "grad_norm": 0.0028002928011119366,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 10810
+    },
+    {
+      "epoch": 0.29829928570689496,
+      "grad_norm": 0.0028025200590491295,
+      "learning_rate": 0.001,
+      "loss": 0.4006,
+      "step": 10811
+    },
+    {
+      "epoch": 0.2983268779079593,
+      "grad_norm": 0.002906485227867961,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 10812
+    },
+    {
+      "epoch": 0.29835447010902366,
+      "grad_norm": 0.0039535220712423325,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 10813
+    },
+    {
+      "epoch": 0.29838206231008807,
+      "grad_norm": 0.0033603734336793423,
+      "learning_rate": 0.001,
+      "loss": 0.3703,
+      "step": 10814
+    },
+    {
+      "epoch": 0.2984096545111524,
+      "grad_norm": 0.0031661302782595158,
+      "learning_rate": 0.001,
+      "loss": 0.3992,
+      "step": 10815
+    },
+    {
+      "epoch": 0.2984372467122168,
+      "grad_norm": 0.002344726352021098,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 10816
+    },
+    {
+      "epoch": 0.2984648389132812,
+      "grad_norm": 0.0025039922911673784,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 10817
+    },
+    {
+      "epoch": 0.29849243111434554,
+      "grad_norm": 0.002225354313850403,
+      "learning_rate": 0.001,
+      "loss": 0.425,
+      "step": 10818
+    },
+    {
+      "epoch": 0.2985200233154099,
+      "grad_norm": 0.0027313402388244867,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 10819
+    },
+    {
+      "epoch": 0.2985476155164743,
+      "grad_norm": 0.017960865050554276,
+      "learning_rate": 0.001,
+      "loss": 0.4091,
+      "step": 10820
+    },
+    {
+      "epoch": 0.29857520771753865,
+      "grad_norm": 0.011598210781812668,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 10821
+    },
+    {
+      "epoch": 0.298602799918603,
+      "grad_norm": 0.00278772902674973,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 10822
+    },
+    {
+      "epoch": 0.29863039211966735,
+      "grad_norm": 0.0026171396020799875,
+      "learning_rate": 0.001,
+      "loss": 0.4718,
+      "step": 10823
+    },
+    {
+      "epoch": 0.29865798432073176,
+      "grad_norm": 0.0038320801686495543,
+      "learning_rate": 0.001,
+      "loss": 0.3419,
+      "step": 10824
+    },
+    {
+      "epoch": 0.2986855765217961,
+      "grad_norm": 0.002889628754928708,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 10825
+    },
+    {
+      "epoch": 0.29871316872286047,
+      "grad_norm": 0.003588633146136999,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 10826
+    },
+    {
+      "epoch": 0.2987407609239249,
+      "grad_norm": 0.009326872415840626,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 10827
+    },
+    {
+      "epoch": 0.2987683531249892,
+      "grad_norm": 0.003909088671207428,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 10828
+    },
+    {
+      "epoch": 0.2987959453260536,
+      "grad_norm": 0.002797875087708235,
+      "learning_rate": 0.001,
+      "loss": 0.3514,
+      "step": 10829
+    },
+    {
+      "epoch": 0.29882353752711793,
+      "grad_norm": 0.003469844814389944,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 10830
+    },
+    {
+      "epoch": 0.29885112972818234,
+      "grad_norm": 0.0033456028904765844,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 10831
+    },
+    {
+      "epoch": 0.2988787219292467,
+      "grad_norm": 0.034299980849027634,
+      "learning_rate": 0.001,
+      "loss": 0.3644,
+      "step": 10832
+    },
+    {
+      "epoch": 0.29890631413031105,
+      "grad_norm": 0.003411337733268738,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 10833
+    },
+    {
+      "epoch": 0.29893390633137545,
+      "grad_norm": 0.0031848186627030373,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 10834
+    },
+    {
+      "epoch": 0.2989614985324398,
+      "grad_norm": 0.004304205067455769,
+      "learning_rate": 0.001,
+      "loss": 0.3718,
+      "step": 10835
+    },
+    {
+      "epoch": 0.29898909073350416,
+      "grad_norm": 0.002746276091784239,
+      "learning_rate": 0.001,
+      "loss": 0.3903,
+      "step": 10836
+    },
+    {
+      "epoch": 0.29901668293456857,
+      "grad_norm": 0.002491393592208624,
+      "learning_rate": 0.001,
+      "loss": 0.4229,
+      "step": 10837
+    },
+    {
+      "epoch": 0.2990442751356329,
+      "grad_norm": 0.0038965316489338875,
+      "learning_rate": 0.001,
+      "loss": 0.3433,
+      "step": 10838
+    },
+    {
+      "epoch": 0.2990718673366973,
+      "grad_norm": 0.0028245735447853804,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 10839
+    },
+    {
+      "epoch": 0.2990994595377616,
+      "grad_norm": 0.0021889859344810247,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 10840
+    },
+    {
+      "epoch": 0.29912705173882603,
+      "grad_norm": 0.0026167482137680054,
+      "learning_rate": 0.001,
+      "loss": 0.4401,
+      "step": 10841
+    },
+    {
+      "epoch": 0.2991546439398904,
+      "grad_norm": 0.0028761785943061113,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 10842
+    },
+    {
+      "epoch": 0.29918223614095474,
+      "grad_norm": 0.0031041146721690893,
+      "learning_rate": 0.001,
+      "loss": 0.3791,
+      "step": 10843
+    },
+    {
+      "epoch": 0.29920982834201915,
+      "grad_norm": 0.002183483447879553,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 10844
+    },
+    {
+      "epoch": 0.2992374205430835,
+      "grad_norm": 0.0021024607121944427,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 10845
+    },
+    {
+      "epoch": 0.29926501274414785,
+      "grad_norm": 0.004469338804483414,
+      "learning_rate": 0.001,
+      "loss": 0.3661,
+      "step": 10846
+    },
+    {
+      "epoch": 0.29929260494521226,
+      "grad_norm": 0.002203304087743163,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 10847
+    },
+    {
+      "epoch": 0.2993201971462766,
+      "grad_norm": 0.002491022925823927,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 10848
+    },
+    {
+      "epoch": 0.29934778934734096,
+      "grad_norm": 0.005152471829205751,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 10849
+    },
+    {
+      "epoch": 0.2993753815484053,
+      "grad_norm": 0.002667092252522707,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 10850
+    },
+    {
+      "epoch": 0.2994029737494697,
+      "grad_norm": 0.002084558829665184,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 10851
+    },
+    {
+      "epoch": 0.2994305659505341,
+      "grad_norm": 0.004005138296633959,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 10852
+    },
+    {
+      "epoch": 0.29945815815159843,
+      "grad_norm": 0.002981225959956646,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 10853
+    },
+    {
+      "epoch": 0.29948575035266284,
+      "grad_norm": 0.002931734314188361,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 10854
+    },
+    {
+      "epoch": 0.2995133425537272,
+      "grad_norm": 0.002261395798996091,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 10855
+    },
+    {
+      "epoch": 0.29954093475479154,
+      "grad_norm": 0.0047109718434512615,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 10856
+    },
+    {
+      "epoch": 0.29956852695585595,
+      "grad_norm": 0.003906469792127609,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 10857
+    },
+    {
+      "epoch": 0.2995961191569203,
+      "grad_norm": 0.003466276917606592,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 10858
+    },
+    {
+      "epoch": 0.29962371135798466,
+      "grad_norm": 0.005029084160923958,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 10859
+    },
+    {
+      "epoch": 0.299651303559049,
+      "grad_norm": 0.0030740953516215086,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 10860
+    },
+    {
+      "epoch": 0.2996788957601134,
+      "grad_norm": 0.00288767390884459,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 10861
+    },
+    {
+      "epoch": 0.29970648796117777,
+      "grad_norm": 0.004084376618266106,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 10862
+    },
+    {
+      "epoch": 0.2997340801622421,
+      "grad_norm": 0.008121359162032604,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 10863
+    },
+    {
+      "epoch": 0.29976167236330653,
+      "grad_norm": 0.0030457088723778725,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 10864
+    },
+    {
+      "epoch": 0.2997892645643709,
+      "grad_norm": 0.0024807127192616463,
+      "learning_rate": 0.001,
+      "loss": 0.4243,
+      "step": 10865
+    },
+    {
+      "epoch": 0.29981685676543524,
+      "grad_norm": 0.0041881585493683815,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 10866
+    },
+    {
+      "epoch": 0.29984444896649964,
+      "grad_norm": 0.005836515221744776,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 10867
+    },
+    {
+      "epoch": 0.299872041167564,
+      "grad_norm": 0.003004920668900013,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 10868
+    },
+    {
+      "epoch": 0.29989963336862835,
+      "grad_norm": 0.0062048486433923244,
+      "learning_rate": 0.001,
+      "loss": 0.3794,
+      "step": 10869
+    },
+    {
+      "epoch": 0.2999272255696927,
+      "grad_norm": 0.0027396834921091795,
+      "learning_rate": 0.001,
+      "loss": 0.375,
+      "step": 10870
+    },
+    {
+      "epoch": 0.2999548177707571,
+      "grad_norm": 0.0027734714094549417,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 10871
+    },
+    {
+      "epoch": 0.29998240997182146,
+      "grad_norm": 0.0023340985644608736,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 10872
+    },
+    {
+      "epoch": 0.3000100021728858,
+      "grad_norm": 0.005321372300386429,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 10873
+    },
+    {
+      "epoch": 0.3000375943739502,
+      "grad_norm": 0.002819483634084463,
+      "learning_rate": 0.001,
+      "loss": 0.3661,
+      "step": 10874
+    },
+    {
+      "epoch": 0.3000651865750146,
+      "grad_norm": 0.0026980694383382797,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 10875
+    },
+    {
+      "epoch": 0.30009277877607893,
+      "grad_norm": 0.003244214691221714,
+      "learning_rate": 0.001,
+      "loss": 0.3792,
+      "step": 10876
+    },
+    {
+      "epoch": 0.30012037097714334,
+      "grad_norm": 0.004197717644274235,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 10877
+    },
+    {
+      "epoch": 0.3001479631782077,
+      "grad_norm": 0.0026915615890175104,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 10878
+    },
+    {
+      "epoch": 0.30017555537927204,
+      "grad_norm": 0.003251213114708662,
+      "learning_rate": 0.001,
+      "loss": 0.4024,
+      "step": 10879
+    },
+    {
+      "epoch": 0.3002031475803364,
+      "grad_norm": 0.004156854934990406,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 10880
+    },
+    {
+      "epoch": 0.3002307397814008,
+      "grad_norm": 0.0028878510929644108,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 10881
+    },
+    {
+      "epoch": 0.30025833198246515,
+      "grad_norm": 0.003199911443516612,
+      "learning_rate": 0.001,
+      "loss": 0.3568,
+      "step": 10882
+    },
+    {
+      "epoch": 0.3002859241835295,
+      "grad_norm": 0.002910776762291789,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 10883
+    },
+    {
+      "epoch": 0.3003135163845939,
+      "grad_norm": 0.004030926618725061,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 10884
+    },
+    {
+      "epoch": 0.30034110858565827,
+      "grad_norm": 0.004950513131916523,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 10885
+    },
+    {
+      "epoch": 0.3003687007867226,
+      "grad_norm": 0.0032172142527997494,
+      "learning_rate": 0.001,
+      "loss": 0.4344,
+      "step": 10886
+    },
+    {
+      "epoch": 0.30039629298778703,
+      "grad_norm": 0.002721422351896763,
+      "learning_rate": 0.001,
+      "loss": 0.3552,
+      "step": 10887
+    },
+    {
+      "epoch": 0.3004238851888514,
+      "grad_norm": 0.0029736789874732494,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 10888
+    },
+    {
+      "epoch": 0.30045147738991573,
+      "grad_norm": 0.0037150525022298098,
+      "learning_rate": 0.001,
+      "loss": 0.4114,
+      "step": 10889
+    },
+    {
+      "epoch": 0.3004790695909801,
+      "grad_norm": 0.002336825244128704,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 10890
+    },
+    {
+      "epoch": 0.3005066617920445,
+      "grad_norm": 0.0033320027869194746,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 10891
+    },
+    {
+      "epoch": 0.30053425399310885,
+      "grad_norm": 0.002994644921272993,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 10892
+    },
+    {
+      "epoch": 0.3005618461941732,
+      "grad_norm": 0.002780053298920393,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 10893
+    },
+    {
+      "epoch": 0.3005894383952376,
+      "grad_norm": 0.0025280967820435762,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 10894
+    },
+    {
+      "epoch": 0.30061703059630196,
+      "grad_norm": 0.003201976651325822,
+      "learning_rate": 0.001,
+      "loss": 0.4263,
+      "step": 10895
+    },
+    {
+      "epoch": 0.3006446227973663,
+      "grad_norm": 0.0020535339135676622,
+      "learning_rate": 0.001,
+      "loss": 0.4297,
+      "step": 10896
+    },
+    {
+      "epoch": 0.3006722149984307,
+      "grad_norm": 0.0021851370111107826,
+      "learning_rate": 0.001,
+      "loss": 0.4011,
+      "step": 10897
+    },
+    {
+      "epoch": 0.3006998071994951,
+      "grad_norm": 0.003310294123366475,
+      "learning_rate": 0.001,
+      "loss": 0.3851,
+      "step": 10898
+    },
+    {
+      "epoch": 0.3007273994005594,
+      "grad_norm": 0.002345843706279993,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 10899
+    },
+    {
+      "epoch": 0.3007549916016238,
+      "grad_norm": 0.004679036792367697,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 10900
+    },
+    {
+      "epoch": 0.3007825838026882,
+      "grad_norm": 0.0026619029231369495,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 10901
+    },
+    {
+      "epoch": 0.30081017600375254,
+      "grad_norm": 0.003363067051395774,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 10902
+    },
+    {
+      "epoch": 0.3008377682048169,
+      "grad_norm": 0.0038764209020882845,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 10903
+    },
+    {
+      "epoch": 0.3008653604058813,
+      "grad_norm": 0.005055352114140987,
+      "learning_rate": 0.001,
+      "loss": 0.382,
+      "step": 10904
+    },
+    {
+      "epoch": 0.30089295260694565,
+      "grad_norm": 0.0033133430406451225,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 10905
+    },
+    {
+      "epoch": 0.30092054480801,
+      "grad_norm": 0.002643629675731063,
+      "learning_rate": 0.001,
+      "loss": 0.4331,
+      "step": 10906
+    },
+    {
+      "epoch": 0.3009481370090744,
+      "grad_norm": 0.010275435633957386,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 10907
+    },
+    {
+      "epoch": 0.30097572921013876,
+      "grad_norm": 0.0022841233294457197,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 10908
+    },
+    {
+      "epoch": 0.3010033214112031,
+      "grad_norm": 0.00588460685685277,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 10909
+    },
+    {
+      "epoch": 0.30103091361226747,
+      "grad_norm": 0.003090951358899474,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 10910
+    },
+    {
+      "epoch": 0.3010585058133319,
+      "grad_norm": 0.011329708620905876,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 10911
+    },
+    {
+      "epoch": 0.30108609801439623,
+      "grad_norm": 0.002050732960924506,
+      "learning_rate": 0.001,
+      "loss": 0.4334,
+      "step": 10912
+    },
+    {
+      "epoch": 0.3011136902154606,
+      "grad_norm": 0.005159840919077396,
+      "learning_rate": 0.001,
+      "loss": 0.3852,
+      "step": 10913
+    },
+    {
+      "epoch": 0.301141282416525,
+      "grad_norm": 0.002075582044199109,
+      "learning_rate": 0.001,
+      "loss": 0.3713,
+      "step": 10914
+    },
+    {
+      "epoch": 0.30116887461758934,
+      "grad_norm": 0.0043141464702785015,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 10915
+    },
+    {
+      "epoch": 0.3011964668186537,
+      "grad_norm": 0.0034079302567988634,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 10916
+    },
+    {
+      "epoch": 0.30122405901971805,
+      "grad_norm": 0.0023484264966100454,
+      "learning_rate": 0.001,
+      "loss": 0.3907,
+      "step": 10917
+    },
+    {
+      "epoch": 0.30125165122078246,
+      "grad_norm": 0.0037031807005405426,
+      "learning_rate": 0.001,
+      "loss": 0.4304,
+      "step": 10918
+    },
+    {
+      "epoch": 0.3012792434218468,
+      "grad_norm": 0.0023840791545808315,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 10919
+    },
+    {
+      "epoch": 0.30130683562291116,
+      "grad_norm": 0.002175979781895876,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 10920
+    },
+    {
+      "epoch": 0.30133442782397557,
+      "grad_norm": 0.0025980628561228514,
+      "learning_rate": 0.001,
+      "loss": 0.4119,
+      "step": 10921
+    },
+    {
+      "epoch": 0.3013620200250399,
+      "grad_norm": 0.012000390328466892,
+      "learning_rate": 0.001,
+      "loss": 0.3716,
+      "step": 10922
+    },
+    {
+      "epoch": 0.3013896122261043,
+      "grad_norm": 0.002536133164539933,
+      "learning_rate": 0.001,
+      "loss": 0.372,
+      "step": 10923
+    },
+    {
+      "epoch": 0.3014172044271687,
+      "grad_norm": 0.002884393557906151,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 10924
+    },
+    {
+      "epoch": 0.30144479662823304,
+      "grad_norm": 0.002150443848222494,
+      "learning_rate": 0.001,
+      "loss": 0.4188,
+      "step": 10925
+    },
+    {
+      "epoch": 0.3014723888292974,
+      "grad_norm": 0.0027890305500477552,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 10926
+    },
+    {
+      "epoch": 0.30149998103036174,
+      "grad_norm": 0.0049143158830702305,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 10927
+    },
+    {
+      "epoch": 0.30152757323142615,
+      "grad_norm": 0.003542589023709297,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 10928
+    },
+    {
+      "epoch": 0.3015551654324905,
+      "grad_norm": 0.002202375093474984,
+      "learning_rate": 0.001,
+      "loss": 0.4332,
+      "step": 10929
+    },
+    {
+      "epoch": 0.30158275763355485,
+      "grad_norm": 0.0022870609536767006,
+      "learning_rate": 0.001,
+      "loss": 0.4504,
+      "step": 10930
+    },
+    {
+      "epoch": 0.30161034983461926,
+      "grad_norm": 0.0055159348994493484,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 10931
+    },
+    {
+      "epoch": 0.3016379420356836,
+      "grad_norm": 0.03571107238531113,
+      "learning_rate": 0.001,
+      "loss": 0.3786,
+      "step": 10932
+    },
+    {
+      "epoch": 0.30166553423674797,
+      "grad_norm": 0.004229827784001827,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 10933
+    },
+    {
+      "epoch": 0.3016931264378124,
+      "grad_norm": 0.0025944889057427645,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 10934
+    },
+    {
+      "epoch": 0.30172071863887673,
+      "grad_norm": 0.007089374121278524,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 10935
+    },
+    {
+      "epoch": 0.3017483108399411,
+      "grad_norm": 0.002950213151052594,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 10936
+    },
+    {
+      "epoch": 0.30177590304100543,
+      "grad_norm": 0.006799718365073204,
+      "learning_rate": 0.001,
+      "loss": 0.4585,
+      "step": 10937
+    },
+    {
+      "epoch": 0.30180349524206984,
+      "grad_norm": 0.004761769436299801,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 10938
+    },
+    {
+      "epoch": 0.3018310874431342,
+      "grad_norm": 0.004374027717858553,
+      "learning_rate": 0.001,
+      "loss": 0.3667,
+      "step": 10939
+    },
+    {
+      "epoch": 0.30185867964419855,
+      "grad_norm": 0.007258753292262554,
+      "learning_rate": 0.001,
+      "loss": 0.3262,
+      "step": 10940
+    },
+    {
+      "epoch": 0.30188627184526295,
+      "grad_norm": 0.0028893048875033855,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 10941
+    },
+    {
+      "epoch": 0.3019138640463273,
+      "grad_norm": 0.003643547184765339,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 10942
+    },
+    {
+      "epoch": 0.30194145624739166,
+      "grad_norm": 0.0045185210183262825,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 10943
+    },
+    {
+      "epoch": 0.30196904844845607,
+      "grad_norm": 0.0024632923305034637,
+      "learning_rate": 0.001,
+      "loss": 0.4148,
+      "step": 10944
+    },
+    {
+      "epoch": 0.3019966406495204,
+      "grad_norm": 0.0027886577881872654,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 10945
+    },
+    {
+      "epoch": 0.3020242328505848,
+      "grad_norm": 0.0049835750833153725,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 10946
+    },
+    {
+      "epoch": 0.3020518250516491,
+      "grad_norm": 0.003602303098887205,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 10947
+    },
+    {
+      "epoch": 0.30207941725271353,
+      "grad_norm": 0.0027262901421636343,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 10948
+    },
+    {
+      "epoch": 0.3021070094537779,
+      "grad_norm": 0.0026782192289829254,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 10949
+    },
+    {
+      "epoch": 0.30213460165484224,
+      "grad_norm": 0.0033344633411616087,
+      "learning_rate": 0.001,
+      "loss": 0.37,
+      "step": 10950
+    },
+    {
+      "epoch": 0.30216219385590665,
+      "grad_norm": 0.007024961058050394,
+      "learning_rate": 0.001,
+      "loss": 0.4372,
+      "step": 10951
+    },
+    {
+      "epoch": 0.302189786056971,
+      "grad_norm": 0.0027252105064690113,
+      "learning_rate": 0.001,
+      "loss": 0.435,
+      "step": 10952
+    },
+    {
+      "epoch": 0.30221737825803535,
+      "grad_norm": 0.0036616462748497725,
+      "learning_rate": 0.001,
+      "loss": 0.3623,
+      "step": 10953
+    },
+    {
+      "epoch": 0.30224497045909976,
+      "grad_norm": 0.0024184328503906727,
+      "learning_rate": 0.001,
+      "loss": 0.3791,
+      "step": 10954
+    },
+    {
+      "epoch": 0.3022725626601641,
+      "grad_norm": 0.002676165895536542,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 10955
+    },
+    {
+      "epoch": 0.30230015486122847,
+      "grad_norm": 0.00226954510435462,
+      "learning_rate": 0.001,
+      "loss": 0.3708,
+      "step": 10956
+    },
+    {
+      "epoch": 0.3023277470622928,
+      "grad_norm": 0.002427699277177453,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 10957
+    },
+    {
+      "epoch": 0.3023553392633572,
+      "grad_norm": 0.002578388201072812,
+      "learning_rate": 0.001,
+      "loss": 0.3645,
+      "step": 10958
+    },
+    {
+      "epoch": 0.3023829314644216,
+      "grad_norm": 0.0028099005576223135,
+      "learning_rate": 0.001,
+      "loss": 0.3625,
+      "step": 10959
+    },
+    {
+      "epoch": 0.30241052366548593,
+      "grad_norm": 0.0038160127587616444,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 10960
+    },
+    {
+      "epoch": 0.30243811586655034,
+      "grad_norm": 0.004297412466257811,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 10961
+    },
+    {
+      "epoch": 0.3024657080676147,
+      "grad_norm": 0.004003297537565231,
+      "learning_rate": 0.001,
+      "loss": 0.3661,
+      "step": 10962
+    },
+    {
+      "epoch": 0.30249330026867904,
+      "grad_norm": 0.003333853790536523,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 10963
+    },
+    {
+      "epoch": 0.30252089246974345,
+      "grad_norm": 0.007552805822342634,
+      "learning_rate": 0.001,
+      "loss": 0.3821,
+      "step": 10964
+    },
+    {
+      "epoch": 0.3025484846708078,
+      "grad_norm": 0.002244778210297227,
+      "learning_rate": 0.001,
+      "loss": 0.4141,
+      "step": 10965
+    },
+    {
+      "epoch": 0.30257607687187216,
+      "grad_norm": 0.0028706330340355635,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 10966
+    },
+    {
+      "epoch": 0.3026036690729365,
+      "grad_norm": 0.002905101515352726,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 10967
+    },
+    {
+      "epoch": 0.3026312612740009,
+      "grad_norm": 0.0025956053286790848,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 10968
+    },
+    {
+      "epoch": 0.30265885347506527,
+      "grad_norm": 0.0038647784385830164,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 10969
+    },
+    {
+      "epoch": 0.3026864456761296,
+      "grad_norm": 0.004919635597616434,
+      "learning_rate": 0.001,
+      "loss": 0.4256,
+      "step": 10970
+    },
+    {
+      "epoch": 0.30271403787719403,
+      "grad_norm": 0.0035008483100682497,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 10971
+    },
+    {
+      "epoch": 0.3027416300782584,
+      "grad_norm": 0.0027946606278419495,
+      "learning_rate": 0.001,
+      "loss": 0.4249,
+      "step": 10972
+    },
+    {
+      "epoch": 0.30276922227932274,
+      "grad_norm": 0.0031994767487049103,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 10973
+    },
+    {
+      "epoch": 0.30279681448038714,
+      "grad_norm": 0.006230973172932863,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 10974
+    },
+    {
+      "epoch": 0.3028244066814515,
+      "grad_norm": 0.003084323601797223,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 10975
+    },
+    {
+      "epoch": 0.30285199888251585,
+      "grad_norm": 0.0022413854021579027,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 10976
+    },
+    {
+      "epoch": 0.3028795910835802,
+      "grad_norm": 0.0032844298984855413,
+      "learning_rate": 0.001,
+      "loss": 0.4187,
+      "step": 10977
+    },
+    {
+      "epoch": 0.3029071832846446,
+      "grad_norm": 0.002548948395997286,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 10978
+    },
+    {
+      "epoch": 0.30293477548570896,
+      "grad_norm": 0.0052003706805408,
+      "learning_rate": 0.001,
+      "loss": 0.4372,
+      "step": 10979
+    },
+    {
+      "epoch": 0.3029623676867733,
+      "grad_norm": 0.003729903372004628,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 10980
+    },
+    {
+      "epoch": 0.3029899598878377,
+      "grad_norm": 0.0028468905948102474,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 10981
+    },
+    {
+      "epoch": 0.3030175520889021,
+      "grad_norm": 0.0020310785621404648,
+      "learning_rate": 0.001,
+      "loss": 0.4303,
+      "step": 10982
+    },
+    {
+      "epoch": 0.30304514428996643,
+      "grad_norm": 0.002875153673812747,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 10983
+    },
+    {
+      "epoch": 0.30307273649103084,
+      "grad_norm": 0.003797962563112378,
+      "learning_rate": 0.001,
+      "loss": 0.4152,
+      "step": 10984
+    },
+    {
+      "epoch": 0.3031003286920952,
+      "grad_norm": 0.003318538423627615,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 10985
+    },
+    {
+      "epoch": 0.30312792089315954,
+      "grad_norm": 0.0027376478537917137,
+      "learning_rate": 0.001,
+      "loss": 0.3777,
+      "step": 10986
+    },
+    {
+      "epoch": 0.3031555130942239,
+      "grad_norm": 0.0021703215315937996,
+      "learning_rate": 0.001,
+      "loss": 0.4027,
+      "step": 10987
+    },
+    {
+      "epoch": 0.3031831052952883,
+      "grad_norm": 0.00254058837890625,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 10988
+    },
+    {
+      "epoch": 0.30321069749635265,
+      "grad_norm": 0.0031664727721363306,
+      "learning_rate": 0.001,
+      "loss": 0.3666,
+      "step": 10989
+    },
+    {
+      "epoch": 0.303238289697417,
+      "grad_norm": 0.003056199988350272,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 10990
+    },
+    {
+      "epoch": 0.3032658818984814,
+      "grad_norm": 0.002997304080054164,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 10991
+    },
+    {
+      "epoch": 0.30329347409954577,
+      "grad_norm": 0.0028167872224003077,
+      "learning_rate": 0.001,
+      "loss": 0.4345,
+      "step": 10992
+    },
+    {
+      "epoch": 0.3033210663006101,
+      "grad_norm": 0.0038003227673470974,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 10993
+    },
+    {
+      "epoch": 0.30334865850167453,
+      "grad_norm": 0.010225886479020119,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 10994
+    },
+    {
+      "epoch": 0.3033762507027389,
+      "grad_norm": 0.004211194813251495,
+      "learning_rate": 0.001,
+      "loss": 0.3673,
+      "step": 10995
+    },
+    {
+      "epoch": 0.30340384290380323,
+      "grad_norm": 0.0034563657827675343,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 10996
+    },
+    {
+      "epoch": 0.3034314351048676,
+      "grad_norm": 0.0032761215697973967,
+      "learning_rate": 0.001,
+      "loss": 0.3444,
+      "step": 10997
+    },
+    {
+      "epoch": 0.303459027305932,
+      "grad_norm": 0.0042040105909109116,
+      "learning_rate": 0.001,
+      "loss": 0.3889,
+      "step": 10998
+    },
+    {
+      "epoch": 0.30348661950699635,
+      "grad_norm": 0.005281214602291584,
+      "learning_rate": 0.001,
+      "loss": 0.4308,
+      "step": 10999
+    },
+    {
+      "epoch": 0.3035142117080607,
+      "grad_norm": 0.0027140434831380844,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 11000
+    },
+    {
+      "epoch": 0.3035142117080607,
+      "eval_runtime": 23.9221,
+      "eval_samples_per_second": 1.338,
+      "eval_steps_per_second": 0.167,
+      "step": 11000
+    },
+    {
+      "epoch": 0.3035418039091251,
+      "grad_norm": 0.0059446897357702255,
+      "learning_rate": 0.001,
+      "loss": 0.3585,
+      "step": 11001
+    },
+    {
+      "epoch": 0.30356939611018946,
+      "grad_norm": 0.002668326487764716,
+      "learning_rate": 0.001,
+      "loss": 0.3736,
+      "step": 11002
+    },
+    {
+      "epoch": 0.3035969883112538,
+      "grad_norm": 0.002568204887211323,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 11003
+    },
+    {
+      "epoch": 0.3036245805123182,
+      "grad_norm": 0.002712225541472435,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 11004
+    },
+    {
+      "epoch": 0.3036521727133826,
+      "grad_norm": 0.004501170478761196,
+      "learning_rate": 0.001,
+      "loss": 0.3691,
+      "step": 11005
+    },
+    {
+      "epoch": 0.3036797649144469,
+      "grad_norm": 0.005471101030707359,
+      "learning_rate": 0.001,
+      "loss": 0.3584,
+      "step": 11006
+    },
+    {
+      "epoch": 0.3037073571155113,
+      "grad_norm": 0.0026964505668729544,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 11007
+    },
+    {
+      "epoch": 0.3037349493165757,
+      "grad_norm": 0.003707374446094036,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 11008
+    },
+    {
+      "epoch": 0.30376254151764004,
+      "grad_norm": 0.012931153178215027,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 11009
+    },
+    {
+      "epoch": 0.3037901337187044,
+      "grad_norm": 0.009963351301848888,
+      "learning_rate": 0.001,
+      "loss": 0.3633,
+      "step": 11010
+    },
+    {
+      "epoch": 0.3038177259197688,
+      "grad_norm": 0.0042528132908046246,
+      "learning_rate": 0.001,
+      "loss": 0.4651,
+      "step": 11011
+    },
+    {
+      "epoch": 0.30384531812083315,
+      "grad_norm": 0.004563808441162109,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 11012
+    },
+    {
+      "epoch": 0.3038729103218975,
+      "grad_norm": 0.0027470001950860023,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 11013
+    },
+    {
+      "epoch": 0.30390050252296186,
+      "grad_norm": 0.0026821244973689318,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 11014
+    },
+    {
+      "epoch": 0.30392809472402627,
+      "grad_norm": 0.002528084209188819,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 11015
+    },
+    {
+      "epoch": 0.3039556869250906,
+      "grad_norm": 0.005667414516210556,
+      "learning_rate": 0.001,
+      "loss": 0.3563,
+      "step": 11016
+    },
+    {
+      "epoch": 0.30398327912615497,
+      "grad_norm": 0.0031073635909706354,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 11017
+    },
+    {
+      "epoch": 0.3040108713272194,
+      "grad_norm": 0.0026901857927441597,
+      "learning_rate": 0.001,
+      "loss": 0.3659,
+      "step": 11018
+    },
+    {
+      "epoch": 0.30403846352828373,
+      "grad_norm": 0.0030291874427348375,
+      "learning_rate": 0.001,
+      "loss": 0.373,
+      "step": 11019
+    },
+    {
+      "epoch": 0.3040660557293481,
+      "grad_norm": 0.0024688898120075464,
+      "learning_rate": 0.001,
+      "loss": 0.3581,
+      "step": 11020
+    },
+    {
+      "epoch": 0.3040936479304125,
+      "grad_norm": 0.0029477437492460012,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 11021
+    },
+    {
+      "epoch": 0.30412124013147684,
+      "grad_norm": 0.0034905215725302696,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 11022
+    },
+    {
+      "epoch": 0.3041488323325412,
+      "grad_norm": 0.0028976472094655037,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 11023
+    },
+    {
+      "epoch": 0.30417642453360555,
+      "grad_norm": 0.002226645825430751,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 11024
+    },
+    {
+      "epoch": 0.30420401673466996,
+      "grad_norm": 0.00522614223882556,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 11025
+    },
+    {
+      "epoch": 0.3042316089357343,
+      "grad_norm": 0.004471330437809229,
+      "learning_rate": 0.001,
+      "loss": 0.4359,
+      "step": 11026
+    },
+    {
+      "epoch": 0.30425920113679866,
+      "grad_norm": 0.0025410952512174845,
+      "learning_rate": 0.001,
+      "loss": 0.4367,
+      "step": 11027
+    },
+    {
+      "epoch": 0.30428679333786307,
+      "grad_norm": 0.0039034727960824966,
+      "learning_rate": 0.001,
+      "loss": 0.3898,
+      "step": 11028
+    },
+    {
+      "epoch": 0.3043143855389274,
+      "grad_norm": 0.0032912297174334526,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 11029
+    },
+    {
+      "epoch": 0.3043419777399918,
+      "grad_norm": 0.009197532199323177,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 11030
+    },
+    {
+      "epoch": 0.3043695699410562,
+      "grad_norm": 0.003327523358166218,
+      "learning_rate": 0.001,
+      "loss": 0.3578,
+      "step": 11031
+    },
+    {
+      "epoch": 0.30439716214212054,
+      "grad_norm": 0.0036153208930045366,
+      "learning_rate": 0.001,
+      "loss": 0.3604,
+      "step": 11032
+    },
+    {
+      "epoch": 0.3044247543431849,
+      "grad_norm": 0.0028802400920540094,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 11033
+    },
+    {
+      "epoch": 0.30445234654424924,
+      "grad_norm": 0.0023775347508490086,
+      "learning_rate": 0.001,
+      "loss": 0.4018,
+      "step": 11034
+    },
+    {
+      "epoch": 0.30447993874531365,
+      "grad_norm": 0.007655430119484663,
+      "learning_rate": 0.001,
+      "loss": 0.3631,
+      "step": 11035
+    },
+    {
+      "epoch": 0.304507530946378,
+      "grad_norm": 0.0038870847783982754,
+      "learning_rate": 0.001,
+      "loss": 0.4255,
+      "step": 11036
+    },
+    {
+      "epoch": 0.30453512314744235,
+      "grad_norm": 0.010876862332224846,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 11037
+    },
+    {
+      "epoch": 0.30456271534850676,
+      "grad_norm": 0.01851513236761093,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 11038
+    },
+    {
+      "epoch": 0.3045903075495711,
+      "grad_norm": 0.0071109263226389885,
+      "learning_rate": 0.001,
+      "loss": 0.3756,
+      "step": 11039
+    },
+    {
+      "epoch": 0.30461789975063547,
+      "grad_norm": 0.002972143003717065,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 11040
+    },
+    {
+      "epoch": 0.3046454919516999,
+      "grad_norm": 0.0042697228491306305,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 11041
+    },
+    {
+      "epoch": 0.30467308415276423,
+      "grad_norm": 0.0034229194279760122,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 11042
+    },
+    {
+      "epoch": 0.3047006763538286,
+      "grad_norm": 0.0030748506542295218,
+      "learning_rate": 0.001,
+      "loss": 0.371,
+      "step": 11043
+    },
+    {
+      "epoch": 0.30472826855489293,
+      "grad_norm": 0.003974152263253927,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 11044
+    },
+    {
+      "epoch": 0.30475586075595734,
+      "grad_norm": 0.0025529295671731234,
+      "learning_rate": 0.001,
+      "loss": 0.3786,
+      "step": 11045
+    },
+    {
+      "epoch": 0.3047834529570217,
+      "grad_norm": 0.002749155042693019,
+      "learning_rate": 0.001,
+      "loss": 0.4034,
+      "step": 11046
+    },
+    {
+      "epoch": 0.30481104515808605,
+      "grad_norm": 0.004938180558383465,
+      "learning_rate": 0.001,
+      "loss": 0.3687,
+      "step": 11047
+    },
+    {
+      "epoch": 0.30483863735915046,
+      "grad_norm": 0.0023326098453253508,
+      "learning_rate": 0.001,
+      "loss": 0.4322,
+      "step": 11048
+    },
+    {
+      "epoch": 0.3048662295602148,
+      "grad_norm": 0.0040608481504023075,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 11049
+    },
+    {
+      "epoch": 0.30489382176127916,
+      "grad_norm": 0.004212150815874338,
+      "learning_rate": 0.001,
+      "loss": 0.3593,
+      "step": 11050
+    },
+    {
+      "epoch": 0.30492141396234357,
+      "grad_norm": 0.0022898244205862284,
+      "learning_rate": 0.001,
+      "loss": 0.4135,
+      "step": 11051
+    },
+    {
+      "epoch": 0.3049490061634079,
+      "grad_norm": 0.009196506813168526,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 11052
+    },
+    {
+      "epoch": 0.3049765983644723,
+      "grad_norm": 0.0035148763563483953,
+      "learning_rate": 0.001,
+      "loss": 0.4245,
+      "step": 11053
+    },
+    {
+      "epoch": 0.3050041905655366,
+      "grad_norm": 0.002594373654574156,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 11054
+    },
+    {
+      "epoch": 0.30503178276660103,
+      "grad_norm": 0.0025384812615811825,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 11055
+    },
+    {
+      "epoch": 0.3050593749676654,
+      "grad_norm": 0.0027979982551187277,
+      "learning_rate": 0.001,
+      "loss": 0.3702,
+      "step": 11056
+    },
+    {
+      "epoch": 0.30508696716872974,
+      "grad_norm": 0.0027982275933027267,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 11057
+    },
+    {
+      "epoch": 0.30511455936979415,
+      "grad_norm": 0.002686277497559786,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 11058
+    },
+    {
+      "epoch": 0.3051421515708585,
+      "grad_norm": 0.0058397226966917515,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 11059
+    },
+    {
+      "epoch": 0.30516974377192285,
+      "grad_norm": 0.0036498201079666615,
+      "learning_rate": 0.001,
+      "loss": 0.4319,
+      "step": 11060
+    },
+    {
+      "epoch": 0.30519733597298726,
+      "grad_norm": 0.0023296461440622807,
+      "learning_rate": 0.001,
+      "loss": 0.4174,
+      "step": 11061
+    },
+    {
+      "epoch": 0.3052249281740516,
+      "grad_norm": 0.024197006598114967,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 11062
+    },
+    {
+      "epoch": 0.30525252037511597,
+      "grad_norm": 0.005508562549948692,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 11063
+    },
+    {
+      "epoch": 0.3052801125761803,
+      "grad_norm": 0.009980360977351665,
+      "learning_rate": 0.001,
+      "loss": 0.4399,
+      "step": 11064
+    },
+    {
+      "epoch": 0.3053077047772447,
+      "grad_norm": 0.006406312808394432,
+      "learning_rate": 0.001,
+      "loss": 0.4074,
+      "step": 11065
+    },
+    {
+      "epoch": 0.3053352969783091,
+      "grad_norm": 0.00675487145781517,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 11066
+    },
+    {
+      "epoch": 0.30536288917937343,
+      "grad_norm": 0.00432139215990901,
+      "learning_rate": 0.001,
+      "loss": 0.4053,
+      "step": 11067
+    },
+    {
+      "epoch": 0.30539048138043784,
+      "grad_norm": 0.004939731676131487,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 11068
+    },
+    {
+      "epoch": 0.3054180735815022,
+      "grad_norm": 0.048318974673748016,
+      "learning_rate": 0.001,
+      "loss": 0.3609,
+      "step": 11069
+    },
+    {
+      "epoch": 0.30544566578256654,
+      "grad_norm": 0.002895533572882414,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 11070
+    },
+    {
+      "epoch": 0.30547325798363095,
+      "grad_norm": 0.0036742573138326406,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 11071
+    },
+    {
+      "epoch": 0.3055008501846953,
+      "grad_norm": 0.0037072747945785522,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 11072
+    },
+    {
+      "epoch": 0.30552844238575966,
+      "grad_norm": 0.003319724928587675,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 11073
+    },
+    {
+      "epoch": 0.305556034586824,
+      "grad_norm": 0.002916824072599411,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 11074
+    },
+    {
+      "epoch": 0.3055836267878884,
+      "grad_norm": 0.00400786055251956,
+      "learning_rate": 0.001,
+      "loss": 0.4225,
+      "step": 11075
+    },
+    {
+      "epoch": 0.30561121898895277,
+      "grad_norm": 0.003279311815276742,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 11076
+    },
+    {
+      "epoch": 0.3056388111900171,
+      "grad_norm": 0.003970045130699873,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 11077
+    },
+    {
+      "epoch": 0.30566640339108153,
+      "grad_norm": 0.0034616603516042233,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 11078
+    },
+    {
+      "epoch": 0.3056939955921459,
+      "grad_norm": 0.003935270942747593,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 11079
+    },
+    {
+      "epoch": 0.30572158779321024,
+      "grad_norm": 0.005806989502161741,
+      "learning_rate": 0.001,
+      "loss": 0.3649,
+      "step": 11080
+    },
+    {
+      "epoch": 0.30574917999427464,
+      "grad_norm": 0.004018415231257677,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 11081
+    },
+    {
+      "epoch": 0.305776772195339,
+      "grad_norm": 0.0026464585680514574,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 11082
+    },
+    {
+      "epoch": 0.30580436439640335,
+      "grad_norm": 0.006566127296537161,
+      "learning_rate": 0.001,
+      "loss": 0.378,
+      "step": 11083
+    },
+    {
+      "epoch": 0.3058319565974677,
+      "grad_norm": 0.0035378343891352415,
+      "learning_rate": 0.001,
+      "loss": 0.3585,
+      "step": 11084
+    },
+    {
+      "epoch": 0.3058595487985321,
+      "grad_norm": 0.005152801051735878,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 11085
+    },
+    {
+      "epoch": 0.30588714099959646,
+      "grad_norm": 0.0025191842578351498,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 11086
+    },
+    {
+      "epoch": 0.3059147332006608,
+      "grad_norm": 0.002810958307236433,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 11087
+    },
+    {
+      "epoch": 0.3059423254017252,
+      "grad_norm": 0.003942539449781179,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 11088
+    },
+    {
+      "epoch": 0.3059699176027896,
+      "grad_norm": 0.0032198168337345123,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 11089
+    },
+    {
+      "epoch": 0.30599750980385393,
+      "grad_norm": 0.004471073392778635,
+      "learning_rate": 0.001,
+      "loss": 0.3834,
+      "step": 11090
+    },
+    {
+      "epoch": 0.30602510200491834,
+      "grad_norm": 0.0025380223523825407,
+      "learning_rate": 0.001,
+      "loss": 0.4236,
+      "step": 11091
+    },
+    {
+      "epoch": 0.3060526942059827,
+      "grad_norm": 0.003936004359275103,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 11092
+    },
+    {
+      "epoch": 0.30608028640704704,
+      "grad_norm": 0.0035589123144745827,
+      "learning_rate": 0.001,
+      "loss": 0.3807,
+      "step": 11093
+    },
+    {
+      "epoch": 0.3061078786081114,
+      "grad_norm": 0.003829971654340625,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 11094
+    },
+    {
+      "epoch": 0.3061354708091758,
+      "grad_norm": 0.0033564860932528973,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 11095
+    },
+    {
+      "epoch": 0.30616306301024016,
+      "grad_norm": 0.006199992261826992,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 11096
+    },
+    {
+      "epoch": 0.3061906552113045,
+      "grad_norm": 0.0038411279674619436,
+      "learning_rate": 0.001,
+      "loss": 0.4201,
+      "step": 11097
+    },
+    {
+      "epoch": 0.3062182474123689,
+      "grad_norm": 0.0028849910013377666,
+      "learning_rate": 0.001,
+      "loss": 0.4105,
+      "step": 11098
+    },
+    {
+      "epoch": 0.30624583961343327,
+      "grad_norm": 0.005419904366135597,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 11099
+    },
+    {
+      "epoch": 0.3062734318144976,
+      "grad_norm": 0.004621229134500027,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 11100
+    },
+    {
+      "epoch": 0.30630102401556203,
+      "grad_norm": 0.004461618140339851,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 11101
+    },
+    {
+      "epoch": 0.3063286162166264,
+      "grad_norm": 0.0031210975721478462,
+      "learning_rate": 0.001,
+      "loss": 0.402,
+      "step": 11102
+    },
+    {
+      "epoch": 0.30635620841769073,
+      "grad_norm": 0.0036159909795969725,
+      "learning_rate": 0.001,
+      "loss": 0.3989,
+      "step": 11103
+    },
+    {
+      "epoch": 0.3063838006187551,
+      "grad_norm": 0.0030616209842264652,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 11104
+    },
+    {
+      "epoch": 0.3064113928198195,
+      "grad_norm": 0.004985187668353319,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 11105
+    },
+    {
+      "epoch": 0.30643898502088385,
+      "grad_norm": 0.0028018758166581392,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 11106
+    },
+    {
+      "epoch": 0.3064665772219482,
+      "grad_norm": 0.004196144640445709,
+      "learning_rate": 0.001,
+      "loss": 0.3706,
+      "step": 11107
+    },
+    {
+      "epoch": 0.3064941694230126,
+      "grad_norm": 0.0038290584925562143,
+      "learning_rate": 0.001,
+      "loss": 0.3843,
+      "step": 11108
+    },
+    {
+      "epoch": 0.30652176162407696,
+      "grad_norm": 0.0028051917906850576,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 11109
+    },
+    {
+      "epoch": 0.3065493538251413,
+      "grad_norm": 0.003199791768565774,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 11110
+    },
+    {
+      "epoch": 0.30657694602620567,
+      "grad_norm": 0.0040592108853161335,
+      "learning_rate": 0.001,
+      "loss": 0.4014,
+      "step": 11111
+    },
+    {
+      "epoch": 0.3066045382272701,
+      "grad_norm": 0.005883520934730768,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 11112
+    },
+    {
+      "epoch": 0.3066321304283344,
+      "grad_norm": 0.0024851111229509115,
+      "learning_rate": 0.001,
+      "loss": 0.4243,
+      "step": 11113
+    },
+    {
+      "epoch": 0.3066597226293988,
+      "grad_norm": 0.003925004508346319,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 11114
+    },
+    {
+      "epoch": 0.3066873148304632,
+      "grad_norm": 0.0040878294967114925,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 11115
+    },
+    {
+      "epoch": 0.30671490703152754,
+      "grad_norm": 0.0032062202226370573,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 11116
+    },
+    {
+      "epoch": 0.3067424992325919,
+      "grad_norm": 0.004021904431283474,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 11117
+    },
+    {
+      "epoch": 0.3067700914336563,
+      "grad_norm": 0.002459887880831957,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 11118
+    },
+    {
+      "epoch": 0.30679768363472065,
+      "grad_norm": 0.0019991539884358644,
+      "learning_rate": 0.001,
+      "loss": 0.4321,
+      "step": 11119
+    },
+    {
+      "epoch": 0.306825275835785,
+      "grad_norm": 0.002835212042555213,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 11120
+    },
+    {
+      "epoch": 0.30685286803684936,
+      "grad_norm": 0.003514475654810667,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 11121
+    },
+    {
+      "epoch": 0.30688046023791377,
+      "grad_norm": 0.0025088752154260874,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 11122
+    },
+    {
+      "epoch": 0.3069080524389781,
+      "grad_norm": 0.0025806506164371967,
+      "learning_rate": 0.001,
+      "loss": 0.425,
+      "step": 11123
+    },
+    {
+      "epoch": 0.30693564464004247,
+      "grad_norm": 0.0025829877704381943,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 11124
+    },
+    {
+      "epoch": 0.3069632368411069,
+      "grad_norm": 0.0033166445791721344,
+      "learning_rate": 0.001,
+      "loss": 0.3884,
+      "step": 11125
+    },
+    {
+      "epoch": 0.30699082904217123,
+      "grad_norm": 0.0023280063178390265,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 11126
+    },
+    {
+      "epoch": 0.3070184212432356,
+      "grad_norm": 0.004174636676907539,
+      "learning_rate": 0.001,
+      "loss": 0.3766,
+      "step": 11127
+    },
+    {
+      "epoch": 0.3070460134443,
+      "grad_norm": 0.002358420053496957,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 11128
+    },
+    {
+      "epoch": 0.30707360564536434,
+      "grad_norm": 0.002632340881973505,
+      "learning_rate": 0.001,
+      "loss": 0.4251,
+      "step": 11129
+    },
+    {
+      "epoch": 0.3071011978464287,
+      "grad_norm": 0.004848247393965721,
+      "learning_rate": 0.001,
+      "loss": 0.4432,
+      "step": 11130
+    },
+    {
+      "epoch": 0.30712879004749305,
+      "grad_norm": 0.002614147262647748,
+      "learning_rate": 0.001,
+      "loss": 0.3682,
+      "step": 11131
+    },
+    {
+      "epoch": 0.30715638224855746,
+      "grad_norm": 0.0026096473447978497,
+      "learning_rate": 0.001,
+      "loss": 0.4032,
+      "step": 11132
+    },
+    {
+      "epoch": 0.3071839744496218,
+      "grad_norm": 0.002825048053637147,
+      "learning_rate": 0.001,
+      "loss": 0.3376,
+      "step": 11133
+    },
+    {
+      "epoch": 0.30721156665068616,
+      "grad_norm": 0.002993338042870164,
+      "learning_rate": 0.001,
+      "loss": 0.3784,
+      "step": 11134
+    },
+    {
+      "epoch": 0.30723915885175057,
+      "grad_norm": 0.0025206785649061203,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 11135
+    },
+    {
+      "epoch": 0.3072667510528149,
+      "grad_norm": 0.0064323414117097855,
+      "learning_rate": 0.001,
+      "loss": 0.3892,
+      "step": 11136
+    },
+    {
+      "epoch": 0.3072943432538793,
+      "grad_norm": 0.006029163021594286,
+      "learning_rate": 0.001,
+      "loss": 0.4013,
+      "step": 11137
+    },
+    {
+      "epoch": 0.3073219354549437,
+      "grad_norm": 0.003802549559623003,
+      "learning_rate": 0.001,
+      "loss": 0.4149,
+      "step": 11138
+    },
+    {
+      "epoch": 0.30734952765600804,
+      "grad_norm": 0.0036848250310868025,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 11139
+    },
+    {
+      "epoch": 0.3073771198570724,
+      "grad_norm": 0.002685883082449436,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 11140
+    },
+    {
+      "epoch": 0.30740471205813674,
+      "grad_norm": 0.004660370759665966,
+      "learning_rate": 0.001,
+      "loss": 0.4285,
+      "step": 11141
+    },
+    {
+      "epoch": 0.30743230425920115,
+      "grad_norm": 0.004408833105117083,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 11142
+    },
+    {
+      "epoch": 0.3074598964602655,
+      "grad_norm": 0.003984685987234116,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 11143
+    },
+    {
+      "epoch": 0.30748748866132986,
+      "grad_norm": 0.0034449570812284946,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 11144
+    },
+    {
+      "epoch": 0.30751508086239426,
+      "grad_norm": 0.0050663729198277,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 11145
+    },
+    {
+      "epoch": 0.3075426730634586,
+      "grad_norm": 0.0025960092898458242,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 11146
+    },
+    {
+      "epoch": 0.30757026526452297,
+      "grad_norm": 0.003925015218555927,
+      "learning_rate": 0.001,
+      "loss": 0.4243,
+      "step": 11147
+    },
+    {
+      "epoch": 0.3075978574655874,
+      "grad_norm": 0.003075996646657586,
+      "learning_rate": 0.001,
+      "loss": 0.3816,
+      "step": 11148
+    },
+    {
+      "epoch": 0.30762544966665173,
+      "grad_norm": 0.0028860154561698437,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 11149
+    },
+    {
+      "epoch": 0.3076530418677161,
+      "grad_norm": 0.0029483395628631115,
+      "learning_rate": 0.001,
+      "loss": 0.3814,
+      "step": 11150
+    },
+    {
+      "epoch": 0.30768063406878043,
+      "grad_norm": 0.006685647647827864,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 11151
+    },
+    {
+      "epoch": 0.30770822626984484,
+      "grad_norm": 0.0028818738646805286,
+      "learning_rate": 0.001,
+      "loss": 0.3885,
+      "step": 11152
+    },
+    {
+      "epoch": 0.3077358184709092,
+      "grad_norm": 0.003373377723619342,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 11153
+    },
+    {
+      "epoch": 0.30776341067197355,
+      "grad_norm": 0.005747408606112003,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 11154
+    },
+    {
+      "epoch": 0.30779100287303796,
+      "grad_norm": 0.003609336679801345,
+      "learning_rate": 0.001,
+      "loss": 0.4214,
+      "step": 11155
+    },
+    {
+      "epoch": 0.3078185950741023,
+      "grad_norm": 0.005563123617321253,
+      "learning_rate": 0.001,
+      "loss": 0.4268,
+      "step": 11156
+    },
+    {
+      "epoch": 0.30784618727516666,
+      "grad_norm": 0.003597275586798787,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 11157
+    },
+    {
+      "epoch": 0.30787377947623107,
+      "grad_norm": 0.0027966846246272326,
+      "learning_rate": 0.001,
+      "loss": 0.4379,
+      "step": 11158
+    },
+    {
+      "epoch": 0.3079013716772954,
+      "grad_norm": 0.002950585214421153,
+      "learning_rate": 0.001,
+      "loss": 0.3762,
+      "step": 11159
+    },
+    {
+      "epoch": 0.3079289638783598,
+      "grad_norm": 0.0030036913231015205,
+      "learning_rate": 0.001,
+      "loss": 0.4352,
+      "step": 11160
+    },
+    {
+      "epoch": 0.3079565560794241,
+      "grad_norm": 0.004463705699890852,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 11161
+    },
+    {
+      "epoch": 0.30798414828048853,
+      "grad_norm": 0.0020041607785969973,
+      "learning_rate": 0.001,
+      "loss": 0.4238,
+      "step": 11162
+    },
+    {
+      "epoch": 0.3080117404815529,
+      "grad_norm": 0.003834531642496586,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 11163
+    },
+    {
+      "epoch": 0.30803933268261724,
+      "grad_norm": 0.002345136133953929,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 11164
+    },
+    {
+      "epoch": 0.30806692488368165,
+      "grad_norm": 0.0028496733866631985,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 11165
+    },
+    {
+      "epoch": 0.308094517084746,
+      "grad_norm": 0.003474792465567589,
+      "learning_rate": 0.001,
+      "loss": 0.4264,
+      "step": 11166
+    },
+    {
+      "epoch": 0.30812210928581035,
+      "grad_norm": 0.0036203833296895027,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 11167
+    },
+    {
+      "epoch": 0.30814970148687476,
+      "grad_norm": 0.004971282556653023,
+      "learning_rate": 0.001,
+      "loss": 0.3704,
+      "step": 11168
+    },
+    {
+      "epoch": 0.3081772936879391,
+      "grad_norm": 0.007116068620234728,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 11169
+    },
+    {
+      "epoch": 0.30820488588900347,
+      "grad_norm": 0.005289649125188589,
+      "learning_rate": 0.001,
+      "loss": 0.3739,
+      "step": 11170
+    },
+    {
+      "epoch": 0.3082324780900678,
+      "grad_norm": 0.0034187568817287683,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 11171
+    },
+    {
+      "epoch": 0.3082600702911322,
+      "grad_norm": 0.002938291057944298,
+      "learning_rate": 0.001,
+      "loss": 0.432,
+      "step": 11172
+    },
+    {
+      "epoch": 0.3082876624921966,
+      "grad_norm": 0.004787916783243418,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 11173
+    },
+    {
+      "epoch": 0.30831525469326093,
+      "grad_norm": 0.0029457362834364176,
+      "learning_rate": 0.001,
+      "loss": 0.4207,
+      "step": 11174
+    },
+    {
+      "epoch": 0.30834284689432534,
+      "grad_norm": 0.006731043569743633,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 11175
+    },
+    {
+      "epoch": 0.3083704390953897,
+      "grad_norm": 0.005668407306075096,
+      "learning_rate": 0.001,
+      "loss": 0.4199,
+      "step": 11176
+    },
+    {
+      "epoch": 0.30839803129645404,
+      "grad_norm": 0.0028704048600047827,
+      "learning_rate": 0.001,
+      "loss": 0.4402,
+      "step": 11177
+    },
+    {
+      "epoch": 0.30842562349751845,
+      "grad_norm": 0.00468471460044384,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 11178
+    },
+    {
+      "epoch": 0.3084532156985828,
+      "grad_norm": 0.003784140106290579,
+      "learning_rate": 0.001,
+      "loss": 0.42,
+      "step": 11179
+    },
+    {
+      "epoch": 0.30848080789964716,
+      "grad_norm": 0.0028519639745354652,
+      "learning_rate": 0.001,
+      "loss": 0.4257,
+      "step": 11180
+    },
+    {
+      "epoch": 0.3085084001007115,
+      "grad_norm": 0.004001649562269449,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 11181
+    },
+    {
+      "epoch": 0.3085359923017759,
+      "grad_norm": 0.0034179389476776123,
+      "learning_rate": 0.001,
+      "loss": 0.3608,
+      "step": 11182
+    },
+    {
+      "epoch": 0.30856358450284027,
+      "grad_norm": 0.004161709453910589,
+      "learning_rate": 0.001,
+      "loss": 0.3561,
+      "step": 11183
+    },
+    {
+      "epoch": 0.3085911767039046,
+      "grad_norm": 0.0021271705627441406,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 11184
+    },
+    {
+      "epoch": 0.30861876890496903,
+      "grad_norm": 0.004352390766143799,
+      "learning_rate": 0.001,
+      "loss": 0.4272,
+      "step": 11185
+    },
+    {
+      "epoch": 0.3086463611060334,
+      "grad_norm": 0.0035197525285184383,
+      "learning_rate": 0.001,
+      "loss": 0.3793,
+      "step": 11186
+    },
+    {
+      "epoch": 0.30867395330709774,
+      "grad_norm": 0.003885299200192094,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 11187
+    },
+    {
+      "epoch": 0.30870154550816215,
+      "grad_norm": 0.004519937559962273,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 11188
+    },
+    {
+      "epoch": 0.3087291377092265,
+      "grad_norm": 0.001987120136618614,
+      "learning_rate": 0.001,
+      "loss": 0.4137,
+      "step": 11189
+    },
+    {
+      "epoch": 0.30875672991029085,
+      "grad_norm": 0.003369292477145791,
+      "learning_rate": 0.001,
+      "loss": 0.3709,
+      "step": 11190
+    },
+    {
+      "epoch": 0.3087843221113552,
+      "grad_norm": 0.0031297069508582354,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 11191
+    },
+    {
+      "epoch": 0.3088119143124196,
+      "grad_norm": 0.0024408060126006603,
+      "learning_rate": 0.001,
+      "loss": 0.4602,
+      "step": 11192
+    },
+    {
+      "epoch": 0.30883950651348396,
+      "grad_norm": 0.0031313635408878326,
+      "learning_rate": 0.001,
+      "loss": 0.4325,
+      "step": 11193
+    },
+    {
+      "epoch": 0.3088670987145483,
+      "grad_norm": 0.0029580635018646717,
+      "learning_rate": 0.001,
+      "loss": 0.3781,
+      "step": 11194
+    },
+    {
+      "epoch": 0.3088946909156127,
+      "grad_norm": 0.0027078099083155394,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 11195
+    },
+    {
+      "epoch": 0.3089222831166771,
+      "grad_norm": 0.0038466129917651415,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 11196
+    },
+    {
+      "epoch": 0.30894987531774143,
+      "grad_norm": 0.003398042405024171,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 11197
+    },
+    {
+      "epoch": 0.30897746751880584,
+      "grad_norm": 0.007685355842113495,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 11198
+    },
+    {
+      "epoch": 0.3090050597198702,
+      "grad_norm": 0.005074410233646631,
+      "learning_rate": 0.001,
+      "loss": 0.3998,
+      "step": 11199
+    },
+    {
+      "epoch": 0.30903265192093454,
+      "grad_norm": 0.003894890658557415,
+      "learning_rate": 0.001,
+      "loss": 0.4394,
+      "step": 11200
+    },
+    {
+      "epoch": 0.3090602441219989,
+      "grad_norm": 0.0026795901358127594,
+      "learning_rate": 0.001,
+      "loss": 0.4315,
+      "step": 11201
+    },
+    {
+      "epoch": 0.3090878363230633,
+      "grad_norm": 0.003955294843763113,
+      "learning_rate": 0.001,
+      "loss": 0.3723,
+      "step": 11202
+    },
+    {
+      "epoch": 0.30911542852412766,
+      "grad_norm": 0.0025795320980250835,
+      "learning_rate": 0.001,
+      "loss": 0.4317,
+      "step": 11203
+    },
+    {
+      "epoch": 0.309143020725192,
+      "grad_norm": 0.004697353113442659,
+      "learning_rate": 0.001,
+      "loss": 0.368,
+      "step": 11204
+    },
+    {
+      "epoch": 0.3091706129262564,
+      "grad_norm": 0.0036036588717252016,
+      "learning_rate": 0.001,
+      "loss": 0.376,
+      "step": 11205
+    },
+    {
+      "epoch": 0.30919820512732077,
+      "grad_norm": 0.002920630620792508,
+      "learning_rate": 0.001,
+      "loss": 0.4238,
+      "step": 11206
+    },
+    {
+      "epoch": 0.3092257973283851,
+      "grad_norm": 0.00252629560418427,
+      "learning_rate": 0.001,
+      "loss": 0.4474,
+      "step": 11207
+    },
+    {
+      "epoch": 0.3092533895294495,
+      "grad_norm": 0.002593503100797534,
+      "learning_rate": 0.001,
+      "loss": 0.4349,
+      "step": 11208
+    },
+    {
+      "epoch": 0.3092809817305139,
+      "grad_norm": 0.002563952933996916,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 11209
+    },
+    {
+      "epoch": 0.30930857393157823,
+      "grad_norm": 0.00308457319624722,
+      "learning_rate": 0.001,
+      "loss": 0.3698,
+      "step": 11210
+    },
+    {
+      "epoch": 0.3093361661326426,
+      "grad_norm": 0.004335676319897175,
+      "learning_rate": 0.001,
+      "loss": 0.3702,
+      "step": 11211
+    },
+    {
+      "epoch": 0.309363758333707,
+      "grad_norm": 0.004092890303581953,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 11212
+    },
+    {
+      "epoch": 0.30939135053477135,
+      "grad_norm": 0.003627408528700471,
+      "learning_rate": 0.001,
+      "loss": 0.4175,
+      "step": 11213
+    },
+    {
+      "epoch": 0.3094189427358357,
+      "grad_norm": 0.0031013472471386194,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 11214
+    },
+    {
+      "epoch": 0.3094465349369001,
+      "grad_norm": 0.002774468855932355,
+      "learning_rate": 0.001,
+      "loss": 0.434,
+      "step": 11215
+    },
+    {
+      "epoch": 0.30947412713796446,
+      "grad_norm": 0.0022229300811886787,
+      "learning_rate": 0.001,
+      "loss": 0.4303,
+      "step": 11216
+    },
+    {
+      "epoch": 0.3095017193390288,
+      "grad_norm": 0.004068805370479822,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 11217
+    },
+    {
+      "epoch": 0.30952931154009317,
+      "grad_norm": 0.0022410049568861723,
+      "learning_rate": 0.001,
+      "loss": 0.4067,
+      "step": 11218
+    },
+    {
+      "epoch": 0.3095569037411576,
+      "grad_norm": 0.002807425567880273,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 11219
+    },
+    {
+      "epoch": 0.3095844959422219,
+      "grad_norm": 0.008168449625372887,
+      "learning_rate": 0.001,
+      "loss": 0.4257,
+      "step": 11220
+    },
+    {
+      "epoch": 0.3096120881432863,
+      "grad_norm": 0.007224308326840401,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 11221
+    },
+    {
+      "epoch": 0.3096396803443507,
+      "grad_norm": 0.004867930430918932,
+      "learning_rate": 0.001,
+      "loss": 0.4163,
+      "step": 11222
+    },
+    {
+      "epoch": 0.30966727254541504,
+      "grad_norm": 0.004449460655450821,
+      "learning_rate": 0.001,
+      "loss": 0.4065,
+      "step": 11223
+    },
+    {
+      "epoch": 0.3096948647464794,
+      "grad_norm": 0.003997356165200472,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 11224
+    },
+    {
+      "epoch": 0.3097224569475438,
+      "grad_norm": 0.0036069692578166723,
+      "learning_rate": 0.001,
+      "loss": 0.3726,
+      "step": 11225
+    },
+    {
+      "epoch": 0.30975004914860815,
+      "grad_norm": 0.003934454172849655,
+      "learning_rate": 0.001,
+      "loss": 0.4149,
+      "step": 11226
+    },
+    {
+      "epoch": 0.3097776413496725,
+      "grad_norm": 0.01457425020635128,
+      "learning_rate": 0.001,
+      "loss": 0.3672,
+      "step": 11227
+    },
+    {
+      "epoch": 0.30980523355073686,
+      "grad_norm": 0.004006854724138975,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 11228
+    },
+    {
+      "epoch": 0.30983282575180127,
+      "grad_norm": 0.0053675188682973385,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 11229
+    },
+    {
+      "epoch": 0.3098604179528656,
+      "grad_norm": 0.0030079265125095844,
+      "learning_rate": 0.001,
+      "loss": 0.3465,
+      "step": 11230
+    },
+    {
+      "epoch": 0.30988801015392997,
+      "grad_norm": 0.004947451408952475,
+      "learning_rate": 0.001,
+      "loss": 0.4309,
+      "step": 11231
+    },
+    {
+      "epoch": 0.3099156023549944,
+      "grad_norm": 0.0025891209952533245,
+      "learning_rate": 0.001,
+      "loss": 0.3676,
+      "step": 11232
+    },
+    {
+      "epoch": 0.30994319455605873,
+      "grad_norm": 0.0022305548191070557,
+      "learning_rate": 0.001,
+      "loss": 0.4611,
+      "step": 11233
+    },
+    {
+      "epoch": 0.3099707867571231,
+      "grad_norm": 0.0024289439897984266,
+      "learning_rate": 0.001,
+      "loss": 0.416,
+      "step": 11234
+    },
+    {
+      "epoch": 0.3099983789581875,
+      "grad_norm": 0.005738126579672098,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 11235
+    },
+    {
+      "epoch": 0.31002597115925185,
+      "grad_norm": 0.005195004399865866,
+      "learning_rate": 0.001,
+      "loss": 0.3711,
+      "step": 11236
+    },
+    {
+      "epoch": 0.3100535633603162,
+      "grad_norm": 0.003794547636061907,
+      "learning_rate": 0.001,
+      "loss": 0.4293,
+      "step": 11237
+    },
+    {
+      "epoch": 0.31008115556138055,
+      "grad_norm": 0.0029551167972385883,
+      "learning_rate": 0.001,
+      "loss": 0.3751,
+      "step": 11238
+    },
+    {
+      "epoch": 0.31010874776244496,
+      "grad_norm": 0.002617292571812868,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 11239
+    },
+    {
+      "epoch": 0.3101363399635093,
+      "grad_norm": 0.013282334432005882,
+      "learning_rate": 0.001,
+      "loss": 0.3738,
+      "step": 11240
+    },
+    {
+      "epoch": 0.31016393216457366,
+      "grad_norm": 0.0025673380587249994,
+      "learning_rate": 0.001,
+      "loss": 0.4433,
+      "step": 11241
+    },
+    {
+      "epoch": 0.31019152436563807,
+      "grad_norm": 0.0032431413419544697,
+      "learning_rate": 0.001,
+      "loss": 0.3933,
+      "step": 11242
+    },
+    {
+      "epoch": 0.3102191165667024,
+      "grad_norm": 0.006165751256048679,
+      "learning_rate": 0.001,
+      "loss": 0.4224,
+      "step": 11243
+    },
+    {
+      "epoch": 0.3102467087677668,
+      "grad_norm": 0.0051593780517578125,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 11244
+    },
+    {
+      "epoch": 0.3102743009688312,
+      "grad_norm": 0.004621399566531181,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 11245
+    },
+    {
+      "epoch": 0.31030189316989554,
+      "grad_norm": 0.007019545882940292,
+      "learning_rate": 0.001,
+      "loss": 0.3769,
+      "step": 11246
+    },
+    {
+      "epoch": 0.3103294853709599,
+      "grad_norm": 0.0030706804245710373,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 11247
+    },
+    {
+      "epoch": 0.31035707757202424,
+      "grad_norm": 0.009759355336427689,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 11248
+    },
+    {
+      "epoch": 0.31038466977308865,
+      "grad_norm": 0.004515976645052433,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 11249
+    },
+    {
+      "epoch": 0.310412261974153,
+      "grad_norm": 0.005143680609762669,
+      "learning_rate": 0.001,
+      "loss": 0.3672,
+      "step": 11250
+    },
+    {
+      "epoch": 0.31043985417521736,
+      "grad_norm": 0.004736714996397495,
+      "learning_rate": 0.001,
+      "loss": 0.3894,
+      "step": 11251
+    },
+    {
+      "epoch": 0.31046744637628176,
+      "grad_norm": 0.004402066580951214,
+      "learning_rate": 0.001,
+      "loss": 0.3817,
+      "step": 11252
+    },
+    {
+      "epoch": 0.3104950385773461,
+      "grad_norm": 0.004078419879078865,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 11253
+    },
+    {
+      "epoch": 0.31052263077841047,
+      "grad_norm": 0.005986323114484549,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 11254
+    },
+    {
+      "epoch": 0.3105502229794749,
+      "grad_norm": 0.0041157579980790615,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 11255
+    },
+    {
+      "epoch": 0.31057781518053923,
+      "grad_norm": 0.005283708218485117,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 11256
+    },
+    {
+      "epoch": 0.3106054073816036,
+      "grad_norm": 0.0037576963659375906,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 11257
+    },
+    {
+      "epoch": 0.31063299958266793,
+      "grad_norm": 0.004866046831011772,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 11258
+    },
+    {
+      "epoch": 0.31066059178373234,
+      "grad_norm": 0.007358568720519543,
+      "learning_rate": 0.001,
+      "loss": 0.4208,
+      "step": 11259
+    },
+    {
+      "epoch": 0.3106881839847967,
+      "grad_norm": 0.004052911419421434,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 11260
+    },
+    {
+      "epoch": 0.31071577618586105,
+      "grad_norm": 0.004355463664978743,
+      "learning_rate": 0.001,
+      "loss": 0.3823,
+      "step": 11261
+    },
+    {
+      "epoch": 0.31074336838692546,
+      "grad_norm": 0.006888225674629211,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 11262
+    },
+    {
+      "epoch": 0.3107709605879898,
+      "grad_norm": 0.004396567586809397,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 11263
+    },
+    {
+      "epoch": 0.31079855278905416,
+      "grad_norm": 0.003929099999368191,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 11264
+    },
+    {
+      "epoch": 0.31082614499011857,
+      "grad_norm": 0.0028630662709474564,
+      "learning_rate": 0.001,
+      "loss": 0.4243,
+      "step": 11265
+    },
+    {
+      "epoch": 0.3108537371911829,
+      "grad_norm": 0.003565790131688118,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 11266
+    },
+    {
+      "epoch": 0.3108813293922473,
+      "grad_norm": 0.004160342272371054,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 11267
+    },
+    {
+      "epoch": 0.3109089215933116,
+      "grad_norm": 0.004211884923279285,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 11268
+    },
+    {
+      "epoch": 0.31093651379437603,
+      "grad_norm": 0.004373464733362198,
+      "learning_rate": 0.001,
+      "loss": 0.4307,
+      "step": 11269
+    },
+    {
+      "epoch": 0.3109641059954404,
+      "grad_norm": 0.0034809240605682135,
+      "learning_rate": 0.001,
+      "loss": 0.3354,
+      "step": 11270
+    },
+    {
+      "epoch": 0.31099169819650474,
+      "grad_norm": 0.004237370565533638,
+      "learning_rate": 0.001,
+      "loss": 0.3768,
+      "step": 11271
+    },
+    {
+      "epoch": 0.31101929039756915,
+      "grad_norm": 0.003246934851631522,
+      "learning_rate": 0.001,
+      "loss": 0.4406,
+      "step": 11272
+    },
+    {
+      "epoch": 0.3110468825986335,
+      "grad_norm": 0.0036773579195141792,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 11273
+    },
+    {
+      "epoch": 0.31107447479969785,
+      "grad_norm": 0.005854573100805283,
+      "learning_rate": 0.001,
+      "loss": 0.3478,
+      "step": 11274
+    },
+    {
+      "epoch": 0.31110206700076226,
+      "grad_norm": 0.004920937120914459,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 11275
+    },
+    {
+      "epoch": 0.3111296592018266,
+      "grad_norm": 0.0030272614676505327,
+      "learning_rate": 0.001,
+      "loss": 0.4373,
+      "step": 11276
+    },
+    {
+      "epoch": 0.31115725140289097,
+      "grad_norm": 0.002614455996081233,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 11277
+    },
+    {
+      "epoch": 0.3111848436039553,
+      "grad_norm": 0.006632671691477299,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 11278
+    },
+    {
+      "epoch": 0.3112124358050197,
+      "grad_norm": 0.005778672639280558,
+      "learning_rate": 0.001,
+      "loss": 0.3869,
+      "step": 11279
+    },
+    {
+      "epoch": 0.3112400280060841,
+      "grad_norm": 0.0035203301813453436,
+      "learning_rate": 0.001,
+      "loss": 0.3743,
+      "step": 11280
+    },
+    {
+      "epoch": 0.31126762020714843,
+      "grad_norm": 0.002483597956597805,
+      "learning_rate": 0.001,
+      "loss": 0.3692,
+      "step": 11281
+    },
+    {
+      "epoch": 0.31129521240821284,
+      "grad_norm": 0.004106397274881601,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 11282
+    },
+    {
+      "epoch": 0.3113228046092772,
+      "grad_norm": 0.002346999244764447,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 11283
+    },
+    {
+      "epoch": 0.31135039681034155,
+      "grad_norm": 0.005101846065372229,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 11284
+    },
+    {
+      "epoch": 0.31137798901140595,
+      "grad_norm": 0.0027893667574971914,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 11285
+    },
+    {
+      "epoch": 0.3114055812124703,
+      "grad_norm": 0.0029769500251859426,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 11286
+    },
+    {
+      "epoch": 0.31143317341353466,
+      "grad_norm": 0.0033861526753753424,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 11287
+    },
+    {
+      "epoch": 0.311460765614599,
+      "grad_norm": 0.006664988584816456,
+      "learning_rate": 0.001,
+      "loss": 0.3681,
+      "step": 11288
+    },
+    {
+      "epoch": 0.3114883578156634,
+      "grad_norm": 0.004560802131891251,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 11289
+    },
+    {
+      "epoch": 0.31151595001672777,
+      "grad_norm": 0.005946944933384657,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 11290
+    },
+    {
+      "epoch": 0.3115435422177921,
+      "grad_norm": 0.0032140284311026335,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 11291
+    },
+    {
+      "epoch": 0.31157113441885653,
+      "grad_norm": 0.011185484007000923,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 11292
+    },
+    {
+      "epoch": 0.3115987266199209,
+      "grad_norm": 0.0024210652336478233,
+      "learning_rate": 0.001,
+      "loss": 0.3568,
+      "step": 11293
+    },
+    {
+      "epoch": 0.31162631882098524,
+      "grad_norm": 0.00252347718924284,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 11294
+    },
+    {
+      "epoch": 0.3116539110220496,
+      "grad_norm": 0.0039216154254972935,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 11295
+    },
+    {
+      "epoch": 0.311681503223114,
+      "grad_norm": 0.0024138404987752438,
+      "learning_rate": 0.001,
+      "loss": 0.3934,
+      "step": 11296
+    },
+    {
+      "epoch": 0.31170909542417835,
+      "grad_norm": 0.0026965364813804626,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 11297
+    },
+    {
+      "epoch": 0.3117366876252427,
+      "grad_norm": 0.0034357013646513224,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 11298
+    },
+    {
+      "epoch": 0.3117642798263071,
+      "grad_norm": 0.0027775561902672052,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 11299
+    },
+    {
+      "epoch": 0.31179187202737146,
+      "grad_norm": 0.002832148689776659,
+      "learning_rate": 0.001,
+      "loss": 0.3689,
+      "step": 11300
+    },
+    {
+      "epoch": 0.3118194642284358,
+      "grad_norm": 0.002956314478069544,
+      "learning_rate": 0.001,
+      "loss": 0.4185,
+      "step": 11301
+    },
+    {
+      "epoch": 0.3118470564295002,
+      "grad_norm": 0.006010833196341991,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 11302
+    },
+    {
+      "epoch": 0.3118746486305646,
+      "grad_norm": 0.002660473110154271,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 11303
+    },
+    {
+      "epoch": 0.31190224083162893,
+      "grad_norm": 0.002595923375338316,
+      "learning_rate": 0.001,
+      "loss": 0.3733,
+      "step": 11304
+    },
+    {
+      "epoch": 0.3119298330326933,
+      "grad_norm": 0.004651275463402271,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 11305
+    },
+    {
+      "epoch": 0.3119574252337577,
+      "grad_norm": 0.0029690570663660765,
+      "learning_rate": 0.001,
+      "loss": 0.353,
+      "step": 11306
+    },
+    {
+      "epoch": 0.31198501743482204,
+      "grad_norm": 0.0036981538869440556,
+      "learning_rate": 0.001,
+      "loss": 0.4236,
+      "step": 11307
+    },
+    {
+      "epoch": 0.3120126096358864,
+      "grad_norm": 0.0027398637030273676,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 11308
+    },
+    {
+      "epoch": 0.3120402018369508,
+      "grad_norm": 0.003324718214571476,
+      "learning_rate": 0.001,
+      "loss": 0.4538,
+      "step": 11309
+    },
+    {
+      "epoch": 0.31206779403801516,
+      "grad_norm": 0.003862992627546191,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 11310
+    },
+    {
+      "epoch": 0.3120953862390795,
+      "grad_norm": 0.0027167177759110928,
+      "learning_rate": 0.001,
+      "loss": 0.361,
+      "step": 11311
+    },
+    {
+      "epoch": 0.3121229784401439,
+      "grad_norm": 0.005694078281521797,
+      "learning_rate": 0.001,
+      "loss": 0.3341,
+      "step": 11312
+    },
+    {
+      "epoch": 0.31215057064120827,
+      "grad_norm": 0.0032801406923681498,
+      "learning_rate": 0.001,
+      "loss": 0.3755,
+      "step": 11313
+    },
+    {
+      "epoch": 0.3121781628422726,
+      "grad_norm": 0.0072447690181434155,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 11314
+    },
+    {
+      "epoch": 0.312205755043337,
+      "grad_norm": 0.00575765548273921,
+      "learning_rate": 0.001,
+      "loss": 0.3801,
+      "step": 11315
+    },
+    {
+      "epoch": 0.3122333472444014,
+      "grad_norm": 0.0030593755654990673,
+      "learning_rate": 0.001,
+      "loss": 0.3693,
+      "step": 11316
+    },
+    {
+      "epoch": 0.31226093944546573,
+      "grad_norm": 0.0034721451811492443,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 11317
+    },
+    {
+      "epoch": 0.3122885316465301,
+      "grad_norm": 0.0029739809688180685,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 11318
+    },
+    {
+      "epoch": 0.3123161238475945,
+      "grad_norm": 0.004178667441010475,
+      "learning_rate": 0.001,
+      "loss": 0.4132,
+      "step": 11319
+    },
+    {
+      "epoch": 0.31234371604865885,
+      "grad_norm": 0.00276771723292768,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 11320
+    },
+    {
+      "epoch": 0.3123713082497232,
+      "grad_norm": 0.0033821798861026764,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 11321
+    },
+    {
+      "epoch": 0.3123989004507876,
+      "grad_norm": 0.003055412322282791,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 11322
+    },
+    {
+      "epoch": 0.31242649265185196,
+      "grad_norm": 0.006147374864667654,
+      "learning_rate": 0.001,
+      "loss": 0.4117,
+      "step": 11323
+    },
+    {
+      "epoch": 0.3124540848529163,
+      "grad_norm": 0.002903368789702654,
+      "learning_rate": 0.001,
+      "loss": 0.3949,
+      "step": 11324
+    },
+    {
+      "epoch": 0.31248167705398067,
+      "grad_norm": 0.002361924620345235,
+      "learning_rate": 0.001,
+      "loss": 0.3739,
+      "step": 11325
+    },
+    {
+      "epoch": 0.3125092692550451,
+      "grad_norm": 0.002768756588920951,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 11326
+    },
+    {
+      "epoch": 0.3125368614561094,
+      "grad_norm": 0.009059234522283077,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 11327
+    },
+    {
+      "epoch": 0.3125644536571738,
+      "grad_norm": 0.0029197093099355698,
+      "learning_rate": 0.001,
+      "loss": 0.4243,
+      "step": 11328
+    },
+    {
+      "epoch": 0.3125920458582382,
+      "grad_norm": 0.003418351523578167,
+      "learning_rate": 0.001,
+      "loss": 0.3994,
+      "step": 11329
+    },
+    {
+      "epoch": 0.31261963805930254,
+      "grad_norm": 0.004701963625848293,
+      "learning_rate": 0.001,
+      "loss": 0.3557,
+      "step": 11330
+    },
+    {
+      "epoch": 0.3126472302603669,
+      "grad_norm": 0.008553891442716122,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 11331
+    },
+    {
+      "epoch": 0.3126748224614313,
+      "grad_norm": 0.0062170871533453465,
+      "learning_rate": 0.001,
+      "loss": 0.3824,
+      "step": 11332
+    },
+    {
+      "epoch": 0.31270241466249565,
+      "grad_norm": 0.0068610128946602345,
+      "learning_rate": 0.001,
+      "loss": 0.4345,
+      "step": 11333
+    },
+    {
+      "epoch": 0.31273000686356,
+      "grad_norm": 0.008722824975848198,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 11334
+    },
+    {
+      "epoch": 0.31275759906462436,
+      "grad_norm": 0.004026619717478752,
+      "learning_rate": 0.001,
+      "loss": 0.4284,
+      "step": 11335
+    },
+    {
+      "epoch": 0.31278519126568877,
+      "grad_norm": 0.006060839165002108,
+      "learning_rate": 0.001,
+      "loss": 0.3568,
+      "step": 11336
+    },
+    {
+      "epoch": 0.3128127834667531,
+      "grad_norm": 0.0033903757575899363,
+      "learning_rate": 0.001,
+      "loss": 0.4015,
+      "step": 11337
+    },
+    {
+      "epoch": 0.31284037566781747,
+      "grad_norm": 0.004284377209842205,
+      "learning_rate": 0.001,
+      "loss": 0.3676,
+      "step": 11338
+    },
+    {
+      "epoch": 0.3128679678688819,
+      "grad_norm": 0.002431008731946349,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 11339
+    },
+    {
+      "epoch": 0.31289556006994623,
+      "grad_norm": 0.0026608938351273537,
+      "learning_rate": 0.001,
+      "loss": 0.4473,
+      "step": 11340
+    },
+    {
+      "epoch": 0.3129231522710106,
+      "grad_norm": 0.0032837912440299988,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 11341
+    },
+    {
+      "epoch": 0.312950744472075,
+      "grad_norm": 0.0023711349349468946,
+      "learning_rate": 0.001,
+      "loss": 0.43,
+      "step": 11342
+    },
+    {
+      "epoch": 0.31297833667313935,
+      "grad_norm": 0.003046794096007943,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 11343
+    },
+    {
+      "epoch": 0.3130059288742037,
+      "grad_norm": 0.0024662583600729704,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 11344
+    },
+    {
+      "epoch": 0.31303352107526805,
+      "grad_norm": 0.002765703946352005,
+      "learning_rate": 0.001,
+      "loss": 0.3953,
+      "step": 11345
+    },
+    {
+      "epoch": 0.31306111327633246,
+      "grad_norm": 0.0024844948202371597,
+      "learning_rate": 0.001,
+      "loss": 0.3844,
+      "step": 11346
+    },
+    {
+      "epoch": 0.3130887054773968,
+      "grad_norm": 0.0025793337263166904,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 11347
+    },
+    {
+      "epoch": 0.31311629767846116,
+      "grad_norm": 0.003900103038176894,
+      "learning_rate": 0.001,
+      "loss": 0.3882,
+      "step": 11348
+    },
+    {
+      "epoch": 0.31314388987952557,
+      "grad_norm": 0.0021257405169308186,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 11349
+    },
+    {
+      "epoch": 0.3131714820805899,
+      "grad_norm": 0.00406573386862874,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 11350
+    },
+    {
+      "epoch": 0.3131990742816543,
+      "grad_norm": 0.003567320527508855,
+      "learning_rate": 0.001,
+      "loss": 0.4156,
+      "step": 11351
+    },
+    {
+      "epoch": 0.3132266664827187,
+      "grad_norm": 0.0028111112769693136,
+      "learning_rate": 0.001,
+      "loss": 0.3966,
+      "step": 11352
+    },
+    {
+      "epoch": 0.31325425868378304,
+      "grad_norm": 0.004948635585606098,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 11353
+    },
+    {
+      "epoch": 0.3132818508848474,
+      "grad_norm": 0.0023954228963702917,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 11354
+    },
+    {
+      "epoch": 0.31330944308591174,
+      "grad_norm": 0.0028324832674115896,
+      "learning_rate": 0.001,
+      "loss": 0.3635,
+      "step": 11355
+    },
+    {
+      "epoch": 0.31333703528697615,
+      "grad_norm": 0.0026057560462504625,
+      "learning_rate": 0.001,
+      "loss": 0.4259,
+      "step": 11356
+    },
+    {
+      "epoch": 0.3133646274880405,
+      "grad_norm": 0.006796353030949831,
+      "learning_rate": 0.001,
+      "loss": 0.4002,
+      "step": 11357
+    },
+    {
+      "epoch": 0.31339221968910486,
+      "grad_norm": 0.0028146374970674515,
+      "learning_rate": 0.001,
+      "loss": 0.4088,
+      "step": 11358
+    },
+    {
+      "epoch": 0.31341981189016926,
+      "grad_norm": 0.0030071684159338474,
+      "learning_rate": 0.001,
+      "loss": 0.4035,
+      "step": 11359
+    },
+    {
+      "epoch": 0.3134474040912336,
+      "grad_norm": 0.0034612752497196198,
+      "learning_rate": 0.001,
+      "loss": 0.423,
+      "step": 11360
+    },
+    {
+      "epoch": 0.31347499629229797,
+      "grad_norm": 0.0025981247890740633,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 11361
+    },
+    {
+      "epoch": 0.3135025884933624,
+      "grad_norm": 0.0051524159498512745,
+      "learning_rate": 0.001,
+      "loss": 0.3775,
+      "step": 11362
+    },
+    {
+      "epoch": 0.31353018069442673,
+      "grad_norm": 0.008304521441459656,
+      "learning_rate": 0.001,
+      "loss": 0.3742,
+      "step": 11363
+    },
+    {
+      "epoch": 0.3135577728954911,
+      "grad_norm": 0.0033939266577363014,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 11364
+    },
+    {
+      "epoch": 0.31358536509655544,
+      "grad_norm": 0.0036061664577573538,
+      "learning_rate": 0.001,
+      "loss": 0.4467,
+      "step": 11365
+    },
+    {
+      "epoch": 0.31361295729761984,
+      "grad_norm": 0.005402529612183571,
+      "learning_rate": 0.001,
+      "loss": 0.3777,
+      "step": 11366
+    },
+    {
+      "epoch": 0.3136405494986842,
+      "grad_norm": 0.005045123398303986,
+      "learning_rate": 0.001,
+      "loss": 0.3801,
+      "step": 11367
+    },
+    {
+      "epoch": 0.31366814169974855,
+      "grad_norm": 0.00310247833840549,
+      "learning_rate": 0.001,
+      "loss": 0.4249,
+      "step": 11368
+    },
+    {
+      "epoch": 0.31369573390081296,
+      "grad_norm": 0.002901574596762657,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 11369
+    },
+    {
+      "epoch": 0.3137233261018773,
+      "grad_norm": 0.0029007482808083296,
+      "learning_rate": 0.001,
+      "loss": 0.3855,
+      "step": 11370
+    },
+    {
+      "epoch": 0.31375091830294166,
+      "grad_norm": 0.0036781311500817537,
+      "learning_rate": 0.001,
+      "loss": 0.4041,
+      "step": 11371
+    },
+    {
+      "epoch": 0.31377851050400607,
+      "grad_norm": 0.004679156932979822,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 11372
+    },
+    {
+      "epoch": 0.3138061027050704,
+      "grad_norm": 0.002834693994373083,
+      "learning_rate": 0.001,
+      "loss": 0.4189,
+      "step": 11373
+    },
+    {
+      "epoch": 0.3138336949061348,
+      "grad_norm": 0.0029626016039401293,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 11374
+    },
+    {
+      "epoch": 0.3138612871071991,
+      "grad_norm": 0.0032038032077252865,
+      "learning_rate": 0.001,
+      "loss": 0.3963,
+      "step": 11375
+    },
+    {
+      "epoch": 0.31388887930826354,
+      "grad_norm": 0.004681065212935209,
+      "learning_rate": 0.001,
+      "loss": 0.4177,
+      "step": 11376
+    },
+    {
+      "epoch": 0.3139164715093279,
+      "grad_norm": 0.004868704825639725,
+      "learning_rate": 0.001,
+      "loss": 0.4064,
+      "step": 11377
+    },
+    {
+      "epoch": 0.31394406371039224,
+      "grad_norm": 0.005584596190601587,
+      "learning_rate": 0.001,
+      "loss": 0.3819,
+      "step": 11378
+    },
+    {
+      "epoch": 0.31397165591145665,
+      "grad_norm": 0.002676298376172781,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 11379
+    },
+    {
+      "epoch": 0.313999248112521,
+      "grad_norm": 0.003069305093958974,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 11380
+    },
+    {
+      "epoch": 0.31402684031358535,
+      "grad_norm": 0.002781213726848364,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 11381
+    },
+    {
+      "epoch": 0.31405443251464976,
+      "grad_norm": 0.0022207528818398714,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 11382
+    },
+    {
+      "epoch": 0.3140820247157141,
+      "grad_norm": 0.0025241211988031864,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 11383
+    },
+    {
+      "epoch": 0.31410961691677847,
+      "grad_norm": 0.004623900167644024,
+      "learning_rate": 0.001,
+      "loss": 0.3849,
+      "step": 11384
+    },
+    {
+      "epoch": 0.3141372091178428,
+      "grad_norm": 0.0028964534867554903,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 11385
+    },
+    {
+      "epoch": 0.3141648013189072,
+      "grad_norm": 0.01115355733782053,
+      "learning_rate": 0.001,
+      "loss": 0.4417,
+      "step": 11386
+    },
+    {
+      "epoch": 0.3141923935199716,
+      "grad_norm": 0.003189532784745097,
+      "learning_rate": 0.001,
+      "loss": 0.3955,
+      "step": 11387
+    },
+    {
+      "epoch": 0.31421998572103593,
+      "grad_norm": 0.005057721398770809,
+      "learning_rate": 0.001,
+      "loss": 0.4194,
+      "step": 11388
+    },
+    {
+      "epoch": 0.31424757792210034,
+      "grad_norm": 0.005494655575603247,
+      "learning_rate": 0.001,
+      "loss": 0.4234,
+      "step": 11389
+    },
+    {
+      "epoch": 0.3142751701231647,
+      "grad_norm": 0.0033896707464009523,
+      "learning_rate": 0.001,
+      "loss": 0.3345,
+      "step": 11390
+    },
+    {
+      "epoch": 0.31430276232422905,
+      "grad_norm": 0.0024348304141312838,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 11391
+    },
+    {
+      "epoch": 0.3143303545252934,
+      "grad_norm": 0.002852590288966894,
+      "learning_rate": 0.001,
+      "loss": 0.4514,
+      "step": 11392
+    },
+    {
+      "epoch": 0.3143579467263578,
+      "grad_norm": 0.0025470538530498743,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 11393
+    },
+    {
+      "epoch": 0.31438553892742216,
+      "grad_norm": 0.003952103201299906,
+      "learning_rate": 0.001,
+      "loss": 0.3673,
+      "step": 11394
+    },
+    {
+      "epoch": 0.3144131311284865,
+      "grad_norm": 0.0030062925070524216,
+      "learning_rate": 0.001,
+      "loss": 0.4349,
+      "step": 11395
+    },
+    {
+      "epoch": 0.3144407233295509,
+      "grad_norm": 0.008497409522533417,
+      "learning_rate": 0.001,
+      "loss": 0.4031,
+      "step": 11396
+    },
+    {
+      "epoch": 0.31446831553061527,
+      "grad_norm": 0.0024450889322906733,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 11397
+    },
+    {
+      "epoch": 0.3144959077316796,
+      "grad_norm": 0.002428788226097822,
+      "learning_rate": 0.001,
+      "loss": 0.3698,
+      "step": 11398
+    },
+    {
+      "epoch": 0.31452349993274403,
+      "grad_norm": 0.0026488315779715776,
+      "learning_rate": 0.001,
+      "loss": 0.4082,
+      "step": 11399
+    },
+    {
+      "epoch": 0.3145510921338084,
+      "grad_norm": 0.003063708543777466,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 11400
+    },
+    {
+      "epoch": 0.31457868433487274,
+      "grad_norm": 0.0036700747441500425,
+      "learning_rate": 0.001,
+      "loss": 0.3839,
+      "step": 11401
+    },
+    {
+      "epoch": 0.3146062765359371,
+      "grad_norm": 0.0024674683809280396,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 11402
+    },
+    {
+      "epoch": 0.3146338687370015,
+      "grad_norm": 0.002942080609500408,
+      "learning_rate": 0.001,
+      "loss": 0.4228,
+      "step": 11403
+    },
+    {
+      "epoch": 0.31466146093806585,
+      "grad_norm": 0.0024689743295311928,
+      "learning_rate": 0.001,
+      "loss": 0.3945,
+      "step": 11404
+    },
+    {
+      "epoch": 0.3146890531391302,
+      "grad_norm": 0.003294649999588728,
+      "learning_rate": 0.001,
+      "loss": 0.4543,
+      "step": 11405
+    },
+    {
+      "epoch": 0.3147166453401946,
+      "grad_norm": 0.0027752909809350967,
+      "learning_rate": 0.001,
+      "loss": 0.3795,
+      "step": 11406
+    },
+    {
+      "epoch": 0.31474423754125896,
+      "grad_norm": 0.0031409088987857103,
+      "learning_rate": 0.001,
+      "loss": 0.3476,
+      "step": 11407
+    },
+    {
+      "epoch": 0.3147718297423233,
+      "grad_norm": 0.004619597923010588,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 11408
+    },
+    {
+      "epoch": 0.3147994219433877,
+      "grad_norm": 0.005096206441521645,
+      "learning_rate": 0.001,
+      "loss": 0.4164,
+      "step": 11409
+    },
+    {
+      "epoch": 0.3148270141444521,
+      "grad_norm": 0.003756036050617695,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 11410
+    },
+    {
+      "epoch": 0.31485460634551643,
+      "grad_norm": 0.0021125138737261295,
+      "learning_rate": 0.001,
+      "loss": 0.4238,
+      "step": 11411
+    },
+    {
+      "epoch": 0.3148821985465808,
+      "grad_norm": 0.002177385613322258,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 11412
+    },
+    {
+      "epoch": 0.3149097907476452,
+      "grad_norm": 0.0027410865295678377,
+      "learning_rate": 0.001,
+      "loss": 0.4251,
+      "step": 11413
+    },
+    {
+      "epoch": 0.31493738294870954,
+      "grad_norm": 0.003590056672692299,
+      "learning_rate": 0.001,
+      "loss": 0.4169,
+      "step": 11414
+    },
+    {
+      "epoch": 0.3149649751497739,
+      "grad_norm": 0.002279781037941575,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 11415
+    },
+    {
+      "epoch": 0.3149925673508383,
+      "grad_norm": 0.0023333400022238493,
+      "learning_rate": 0.001,
+      "loss": 0.4085,
+      "step": 11416
+    },
+    {
+      "epoch": 0.31502015955190266,
+      "grad_norm": 0.004322835244238377,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 11417
+    },
+    {
+      "epoch": 0.315047751752967,
+      "grad_norm": 0.0034005579072982073,
+      "learning_rate": 0.001,
+      "loss": 0.4159,
+      "step": 11418
+    },
+    {
+      "epoch": 0.3150753439540314,
+      "grad_norm": 0.007161610759794712,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 11419
+    },
+    {
+      "epoch": 0.31510293615509577,
+      "grad_norm": 0.007838692516088486,
+      "learning_rate": 0.001,
+      "loss": 0.4317,
+      "step": 11420
+    },
+    {
+      "epoch": 0.3151305283561601,
+      "grad_norm": 0.004573942627757788,
+      "learning_rate": 0.001,
+      "loss": 0.3715,
+      "step": 11421
+    },
+    {
+      "epoch": 0.3151581205572245,
+      "grad_norm": 0.005383932497352362,
+      "learning_rate": 0.001,
+      "loss": 0.4437,
+      "step": 11422
+    },
+    {
+      "epoch": 0.3151857127582889,
+      "grad_norm": 0.004969790577888489,
+      "learning_rate": 0.001,
+      "loss": 0.3845,
+      "step": 11423
+    },
+    {
+      "epoch": 0.31521330495935324,
+      "grad_norm": 0.0022800557781010866,
+      "learning_rate": 0.001,
+      "loss": 0.3946,
+      "step": 11424
+    },
+    {
+      "epoch": 0.3152408971604176,
+      "grad_norm": 0.0025320611894130707,
+      "learning_rate": 0.001,
+      "loss": 0.4653,
+      "step": 11425
+    },
+    {
+      "epoch": 0.315268489361482,
+      "grad_norm": 0.002388893160969019,
+      "learning_rate": 0.001,
+      "loss": 0.4077,
+      "step": 11426
+    },
+    {
+      "epoch": 0.31529608156254635,
+      "grad_norm": 0.008318680338561535,
+      "learning_rate": 0.001,
+      "loss": 0.397,
+      "step": 11427
+    },
+    {
+      "epoch": 0.3153236737636107,
+      "grad_norm": 0.007277218624949455,
+      "learning_rate": 0.001,
+      "loss": 0.3586,
+      "step": 11428
+    },
+    {
+      "epoch": 0.3153512659646751,
+      "grad_norm": 0.0038948303554207087,
+      "learning_rate": 0.001,
+      "loss": 0.3747,
+      "step": 11429
+    },
+    {
+      "epoch": 0.31537885816573946,
+      "grad_norm": 0.003630647901445627,
+      "learning_rate": 0.001,
+      "loss": 0.4183,
+      "step": 11430
+    },
+    {
+      "epoch": 0.3154064503668038,
+      "grad_norm": 0.002882918808609247,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 11431
+    },
+    {
+      "epoch": 0.31543404256786817,
+      "grad_norm": 0.0030421330593526363,
+      "learning_rate": 0.001,
+      "loss": 0.367,
+      "step": 11432
+    },
+    {
+      "epoch": 0.3154616347689326,
+      "grad_norm": 0.00291895167902112,
+      "learning_rate": 0.001,
+      "loss": 0.427,
+      "step": 11433
+    },
+    {
+      "epoch": 0.3154892269699969,
+      "grad_norm": 0.003857139963656664,
+      "learning_rate": 0.001,
+      "loss": 0.4236,
+      "step": 11434
+    },
+    {
+      "epoch": 0.3155168191710613,
+      "grad_norm": 0.0034651218447834253,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 11435
+    },
+    {
+      "epoch": 0.3155444113721257,
+      "grad_norm": 0.0023657598067075014,
+      "learning_rate": 0.001,
+      "loss": 0.4227,
+      "step": 11436
+    },
+    {
+      "epoch": 0.31557200357319004,
+      "grad_norm": 0.002382135484367609,
+      "learning_rate": 0.001,
+      "loss": 0.3864,
+      "step": 11437
+    },
+    {
+      "epoch": 0.3155995957742544,
+      "grad_norm": 0.0034222460817545652,
+      "learning_rate": 0.001,
+      "loss": 0.418,
+      "step": 11438
+    },
+    {
+      "epoch": 0.3156271879753188,
+      "grad_norm": 0.004517734050750732,
+      "learning_rate": 0.001,
+      "loss": 0.3899,
+      "step": 11439
+    },
+    {
+      "epoch": 0.31565478017638315,
+      "grad_norm": 0.005458935163915157,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 11440
+    },
+    {
+      "epoch": 0.3156823723774475,
+      "grad_norm": 0.00377734680660069,
+      "learning_rate": 0.001,
+      "loss": 0.3623,
+      "step": 11441
+    },
+    {
+      "epoch": 0.31570996457851186,
+      "grad_norm": 0.006429413799196482,
+      "learning_rate": 0.001,
+      "loss": 0.3875,
+      "step": 11442
+    },
+    {
+      "epoch": 0.31573755677957627,
+      "grad_norm": 0.002111859619617462,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 11443
+    },
+    {
+      "epoch": 0.3157651489806406,
+      "grad_norm": 0.005765847861766815,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 11444
+    },
+    {
+      "epoch": 0.315792741181705,
+      "grad_norm": 0.0038511583115905523,
+      "learning_rate": 0.001,
+      "loss": 0.3508,
+      "step": 11445
+    },
+    {
+      "epoch": 0.3158203333827694,
+      "grad_norm": 0.006009151693433523,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 11446
+    },
+    {
+      "epoch": 0.31584792558383373,
+      "grad_norm": 0.0027272317092865705,
+      "learning_rate": 0.001,
+      "loss": 0.4205,
+      "step": 11447
+    },
+    {
+      "epoch": 0.3158755177848981,
+      "grad_norm": 0.0022494541481137276,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 11448
+    },
+    {
+      "epoch": 0.3159031099859625,
+      "grad_norm": 0.0025459870230406523,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 11449
+    },
+    {
+      "epoch": 0.31593070218702685,
+      "grad_norm": 0.008022737689316273,
+      "learning_rate": 0.001,
+      "loss": 0.3628,
+      "step": 11450
+    },
+    {
+      "epoch": 0.3159582943880912,
+      "grad_norm": 0.0043347179889678955,
+      "learning_rate": 0.001,
+      "loss": 0.3537,
+      "step": 11451
+    },
+    {
+      "epoch": 0.31598588658915555,
+      "grad_norm": 0.003317405702546239,
+      "learning_rate": 0.001,
+      "loss": 0.4351,
+      "step": 11452
+    },
+    {
+      "epoch": 0.31601347879021996,
+      "grad_norm": 0.003171857912093401,
+      "learning_rate": 0.001,
+      "loss": 0.3757,
+      "step": 11453
+    },
+    {
+      "epoch": 0.3160410709912843,
+      "grad_norm": 0.0032373506110161543,
+      "learning_rate": 0.001,
+      "loss": 0.4016,
+      "step": 11454
+    },
+    {
+      "epoch": 0.31606866319234866,
+      "grad_norm": 0.003428233554586768,
+      "learning_rate": 0.001,
+      "loss": 0.3685,
+      "step": 11455
+    },
+    {
+      "epoch": 0.3160962553934131,
+      "grad_norm": 0.0027770015876740217,
+      "learning_rate": 0.001,
+      "loss": 0.4151,
+      "step": 11456
+    },
+    {
+      "epoch": 0.3161238475944774,
+      "grad_norm": 0.002452113199979067,
+      "learning_rate": 0.001,
+      "loss": 0.4296,
+      "step": 11457
+    },
+    {
+      "epoch": 0.3161514397955418,
+      "grad_norm": 0.0024971971288323402,
+      "learning_rate": 0.001,
+      "loss": 0.413,
+      "step": 11458
+    },
+    {
+      "epoch": 0.3161790319966062,
+      "grad_norm": 0.002314744982868433,
+      "learning_rate": 0.001,
+      "loss": 0.4278,
+      "step": 11459
+    },
+    {
+      "epoch": 0.31620662419767054,
+      "grad_norm": 0.004431735258549452,
+      "learning_rate": 0.001,
+      "loss": 0.3828,
+      "step": 11460
+    },
+    {
+      "epoch": 0.3162342163987349,
+      "grad_norm": 0.007284036837518215,
+      "learning_rate": 0.001,
+      "loss": 0.3353,
+      "step": 11461
+    },
+    {
+      "epoch": 0.31626180859979924,
+      "grad_norm": 0.004431730601936579,
+      "learning_rate": 0.001,
+      "loss": 0.3701,
+      "step": 11462
+    },
+    {
+      "epoch": 0.31628940080086365,
+      "grad_norm": 0.003810502588748932,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 11463
+    },
+    {
+      "epoch": 0.316316993001928,
+      "grad_norm": 0.004132286179810762,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 11464
+    },
+    {
+      "epoch": 0.31634458520299236,
+      "grad_norm": 0.0026423779781907797,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 11465
+    },
+    {
+      "epoch": 0.31637217740405676,
+      "grad_norm": 0.0030175880528986454,
+      "learning_rate": 0.001,
+      "loss": 0.3806,
+      "step": 11466
+    },
+    {
+      "epoch": 0.3163997696051211,
+      "grad_norm": 0.0023432145826518536,
+      "learning_rate": 0.001,
+      "loss": 0.422,
+      "step": 11467
+    },
+    {
+      "epoch": 0.31642736180618547,
+      "grad_norm": 0.004060344770550728,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 11468
+    },
+    {
+      "epoch": 0.3164549540072499,
+      "grad_norm": 0.00273421430028975,
+      "learning_rate": 0.001,
+      "loss": 0.3729,
+      "step": 11469
+    },
+    {
+      "epoch": 0.31648254620831423,
+      "grad_norm": 0.0024940611328929663,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 11470
+    },
+    {
+      "epoch": 0.3165101384093786,
+      "grad_norm": 0.005460789427161217,
+      "learning_rate": 0.001,
+      "loss": 0.4153,
+      "step": 11471
+    },
+    {
+      "epoch": 0.31653773061044294,
+      "grad_norm": 0.0026238500140607357,
+      "learning_rate": 0.001,
+      "loss": 0.4299,
+      "step": 11472
+    },
+    {
+      "epoch": 0.31656532281150734,
+      "grad_norm": 0.005485900212079287,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 11473
+    },
+    {
+      "epoch": 0.3165929150125717,
+      "grad_norm": 0.0072882408276200294,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 11474
+    },
+    {
+      "epoch": 0.31662050721363605,
+      "grad_norm": 0.002355532720685005,
+      "learning_rate": 0.001,
+      "loss": 0.4526,
+      "step": 11475
+    },
+    {
+      "epoch": 0.31664809941470046,
+      "grad_norm": 0.0029103090055286884,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 11476
+    },
+    {
+      "epoch": 0.3166756916157648,
+      "grad_norm": 0.005752061028033495,
+      "learning_rate": 0.001,
+      "loss": 0.4086,
+      "step": 11477
+    },
+    {
+      "epoch": 0.31670328381682916,
+      "grad_norm": 0.0035546228755265474,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 11478
+    },
+    {
+      "epoch": 0.31673087601789357,
+      "grad_norm": 0.007723371963948011,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 11479
+    },
+    {
+      "epoch": 0.3167584682189579,
+      "grad_norm": 0.005489727947860956,
+      "learning_rate": 0.001,
+      "loss": 0.3787,
+      "step": 11480
+    },
+    {
+      "epoch": 0.3167860604200223,
+      "grad_norm": 0.0035039815120399,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 11481
+    },
+    {
+      "epoch": 0.3168136526210866,
+      "grad_norm": 0.00970650278031826,
+      "learning_rate": 0.001,
+      "loss": 0.4075,
+      "step": 11482
+    },
+    {
+      "epoch": 0.31684124482215104,
+      "grad_norm": 0.002773486776277423,
+      "learning_rate": 0.001,
+      "loss": 0.4455,
+      "step": 11483
+    },
+    {
+      "epoch": 0.3168688370232154,
+      "grad_norm": 0.002964557381346822,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 11484
+    },
+    {
+      "epoch": 0.31689642922427974,
+      "grad_norm": 0.002768107457086444,
+      "learning_rate": 0.001,
+      "loss": 0.4399,
+      "step": 11485
+    },
+    {
+      "epoch": 0.31692402142534415,
+      "grad_norm": 0.003310150234028697,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 11486
+    },
+    {
+      "epoch": 0.3169516136264085,
+      "grad_norm": 0.0038422702345997095,
+      "learning_rate": 0.001,
+      "loss": 0.3947,
+      "step": 11487
+    },
+    {
+      "epoch": 0.31697920582747285,
+      "grad_norm": 0.0059617673978209496,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 11488
+    },
+    {
+      "epoch": 0.3170067980285372,
+      "grad_norm": 0.004942035768181086,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 11489
+    },
+    {
+      "epoch": 0.3170343902296016,
+      "grad_norm": 0.005482214502990246,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 11490
+    },
+    {
+      "epoch": 0.31706198243066597,
+      "grad_norm": 0.004851524252444506,
+      "learning_rate": 0.001,
+      "loss": 0.3555,
+      "step": 11491
+    },
+    {
+      "epoch": 0.3170895746317303,
+      "grad_norm": 0.0029763453640043736,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 11492
+    },
+    {
+      "epoch": 0.31711716683279473,
+      "grad_norm": 0.0036718971095979214,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 11493
+    },
+    {
+      "epoch": 0.3171447590338591,
+      "grad_norm": 0.003948654048144817,
+      "learning_rate": 0.001,
+      "loss": 0.4239,
+      "step": 11494
+    },
+    {
+      "epoch": 0.31717235123492343,
+      "grad_norm": 0.011904267594218254,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 11495
+    },
+    {
+      "epoch": 0.31719994343598784,
+      "grad_norm": 0.003118073334917426,
+      "learning_rate": 0.001,
+      "loss": 0.383,
+      "step": 11496
+    },
+    {
+      "epoch": 0.3172275356370522,
+      "grad_norm": 0.0038996157236397266,
+      "learning_rate": 0.001,
+      "loss": 0.3929,
+      "step": 11497
+    },
+    {
+      "epoch": 0.31725512783811655,
+      "grad_norm": 0.003080842550843954,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 11498
+    },
+    {
+      "epoch": 0.3172827200391809,
+      "grad_norm": 0.0027819147799164057,
+      "learning_rate": 0.001,
+      "loss": 0.3748,
+      "step": 11499
+    },
+    {
+      "epoch": 0.3173103122402453,
+      "grad_norm": 0.0029706538189202547,
+      "learning_rate": 0.001,
+      "loss": 0.3841,
+      "step": 11500
+    },
+    {
+      "epoch": 0.3173103122402453,
+      "eval_runtime": 23.5592,
+      "eval_samples_per_second": 1.358,
+      "eval_steps_per_second": 0.17,
+      "step": 11500
+    },
+    {
+      "epoch": 0.31733790444130966,
+      "grad_norm": 0.002240256406366825,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 11501
+    },
+    {
+      "epoch": 0.317365496642374,
+      "grad_norm": 0.0034430986270308495,
+      "learning_rate": 0.001,
+      "loss": 0.3758,
+      "step": 11502
+    },
+    {
+      "epoch": 0.3173930888434384,
+      "grad_norm": 0.01547628827393055,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 11503
+    },
+    {
+      "epoch": 0.3174206810445028,
+      "grad_norm": 0.003593276022002101,
+      "learning_rate": 0.001,
+      "loss": 0.41,
+      "step": 11504
+    },
+    {
+      "epoch": 0.3174482732455671,
+      "grad_norm": 0.00244096084497869,
+      "learning_rate": 0.001,
+      "loss": 0.4129,
+      "step": 11505
+    },
+    {
+      "epoch": 0.31747586544663153,
+      "grad_norm": 0.002778639318421483,
+      "learning_rate": 0.001,
+      "loss": 0.3911,
+      "step": 11506
+    },
+    {
+      "epoch": 0.3175034576476959,
+      "grad_norm": 0.018177034333348274,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 11507
+    },
+    {
+      "epoch": 0.31753104984876024,
+      "grad_norm": 0.005340322386473417,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 11508
+    },
+    {
+      "epoch": 0.3175586420498246,
+      "grad_norm": 0.0037258395459502935,
+      "learning_rate": 0.001,
+      "loss": 0.4093,
+      "step": 11509
+    },
+    {
+      "epoch": 0.317586234250889,
+      "grad_norm": 0.004941198974847794,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 11510
+    },
+    {
+      "epoch": 0.31761382645195335,
+      "grad_norm": 0.0029092850163578987,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 11511
+    },
+    {
+      "epoch": 0.3176414186530177,
+      "grad_norm": 0.0023247981444001198,
+      "learning_rate": 0.001,
+      "loss": 0.3985,
+      "step": 11512
+    },
+    {
+      "epoch": 0.3176690108540821,
+      "grad_norm": 0.0024109010118991137,
+      "learning_rate": 0.001,
+      "loss": 0.3925,
+      "step": 11513
+    },
+    {
+      "epoch": 0.31769660305514646,
+      "grad_norm": 0.004136643372476101,
+      "learning_rate": 0.001,
+      "loss": 0.4308,
+      "step": 11514
+    },
+    {
+      "epoch": 0.3177241952562108,
+      "grad_norm": 0.002965535270050168,
+      "learning_rate": 0.001,
+      "loss": 0.3983,
+      "step": 11515
+    },
+    {
+      "epoch": 0.3177517874572752,
+      "grad_norm": 0.0028476836159825325,
+      "learning_rate": 0.001,
+      "loss": 0.4078,
+      "step": 11516
+    },
+    {
+      "epoch": 0.3177793796583396,
+      "grad_norm": 0.002511124825105071,
+      "learning_rate": 0.001,
+      "loss": 0.3867,
+      "step": 11517
+    },
+    {
+      "epoch": 0.31780697185940393,
+      "grad_norm": 0.002977583557367325,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 11518
+    },
+    {
+      "epoch": 0.3178345640604683,
+      "grad_norm": 0.0023715696297585964,
+      "learning_rate": 0.001,
+      "loss": 0.3795,
+      "step": 11519
+    },
+    {
+      "epoch": 0.3178621562615327,
+      "grad_norm": 0.007563414517790079,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 11520
+    },
+    {
+      "epoch": 0.31788974846259704,
+      "grad_norm": 0.004035325720906258,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 11521
+    },
+    {
+      "epoch": 0.3179173406636614,
+      "grad_norm": 0.00366159132681787,
+      "learning_rate": 0.001,
+      "loss": 0.4218,
+      "step": 11522
+    },
+    {
+      "epoch": 0.3179449328647258,
+      "grad_norm": 0.0030379300005733967,
+      "learning_rate": 0.001,
+      "loss": 0.4373,
+      "step": 11523
+    },
+    {
+      "epoch": 0.31797252506579016,
+      "grad_norm": 0.002500406000763178,
+      "learning_rate": 0.001,
+      "loss": 0.4019,
+      "step": 11524
+    },
+    {
+      "epoch": 0.3180001172668545,
+      "grad_norm": 0.0029437800403684378,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 11525
+    },
+    {
+      "epoch": 0.3180277094679189,
+      "grad_norm": 0.003492504358291626,
+      "learning_rate": 0.001,
+      "loss": 0.3619,
+      "step": 11526
+    },
+    {
+      "epoch": 0.31805530166898327,
+      "grad_norm": 0.002901204163208604,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 11527
+    },
+    {
+      "epoch": 0.3180828938700476,
+      "grad_norm": 0.003703712485730648,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 11528
+    },
+    {
+      "epoch": 0.318110486071112,
+      "grad_norm": 0.003048625076189637,
+      "learning_rate": 0.001,
+      "loss": 0.3834,
+      "step": 11529
+    },
+    {
+      "epoch": 0.3181380782721764,
+      "grad_norm": 0.0022534143645316362,
+      "learning_rate": 0.001,
+      "loss": 0.4213,
+      "step": 11530
+    },
+    {
+      "epoch": 0.31816567047324074,
+      "grad_norm": 0.0026876600459218025,
+      "learning_rate": 0.001,
+      "loss": 0.3785,
+      "step": 11531
+    },
+    {
+      "epoch": 0.3181932626743051,
+      "grad_norm": 0.00236628670245409,
+      "learning_rate": 0.001,
+      "loss": 0.4376,
+      "step": 11532
+    },
+    {
+      "epoch": 0.3182208548753695,
+      "grad_norm": 0.004099288955330849,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 11533
+    },
+    {
+      "epoch": 0.31824844707643385,
+      "grad_norm": 0.0041442555375397205,
+      "learning_rate": 0.001,
+      "loss": 0.358,
+      "step": 11534
+    },
+    {
+      "epoch": 0.3182760392774982,
+      "grad_norm": 0.003338357200846076,
+      "learning_rate": 0.001,
+      "loss": 0.4307,
+      "step": 11535
+    },
+    {
+      "epoch": 0.3183036314785626,
+      "grad_norm": 0.002358140889555216,
+      "learning_rate": 0.001,
+      "loss": 0.4438,
+      "step": 11536
+    },
+    {
+      "epoch": 0.31833122367962696,
+      "grad_norm": 0.007097299210727215,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 11537
+    },
+    {
+      "epoch": 0.3183588158806913,
+      "grad_norm": 0.003606355981901288,
+      "learning_rate": 0.001,
+      "loss": 0.4283,
+      "step": 11538
+    },
+    {
+      "epoch": 0.31838640808175567,
+      "grad_norm": 0.0027023248840123415,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 11539
+    },
+    {
+      "epoch": 0.3184140002828201,
+      "grad_norm": 0.0069968136958777905,
+      "learning_rate": 0.001,
+      "loss": 0.3748,
+      "step": 11540
+    },
+    {
+      "epoch": 0.31844159248388443,
+      "grad_norm": 0.0028446821961551905,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 11541
+    },
+    {
+      "epoch": 0.3184691846849488,
+      "grad_norm": 0.003783198306336999,
+      "learning_rate": 0.001,
+      "loss": 0.3773,
+      "step": 11542
+    },
+    {
+      "epoch": 0.3184967768860132,
+      "grad_norm": 0.0042613414116203785,
+      "learning_rate": 0.001,
+      "loss": 0.4125,
+      "step": 11543
+    },
+    {
+      "epoch": 0.31852436908707754,
+      "grad_norm": 0.023959210142493248,
+      "learning_rate": 0.001,
+      "loss": 0.4045,
+      "step": 11544
+    },
+    {
+      "epoch": 0.3185519612881419,
+      "grad_norm": 0.02399979531764984,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 11545
+    },
+    {
+      "epoch": 0.3185795534892063,
+      "grad_norm": 0.003035551868379116,
+      "learning_rate": 0.001,
+      "loss": 0.427,
+      "step": 11546
+    },
+    {
+      "epoch": 0.31860714569027065,
+      "grad_norm": 0.0028156766202300787,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 11547
+    },
+    {
+      "epoch": 0.318634737891335,
+      "grad_norm": 0.0029186957981437445,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 11548
+    },
+    {
+      "epoch": 0.31866233009239936,
+      "grad_norm": 0.0037356752436608076,
+      "learning_rate": 0.001,
+      "loss": 0.4317,
+      "step": 11549
+    },
+    {
+      "epoch": 0.31868992229346377,
+      "grad_norm": 0.0047831544652581215,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 11550
+    },
+    {
+      "epoch": 0.3187175144945281,
+      "grad_norm": 0.0023849967401474714,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 11551
+    },
+    {
+      "epoch": 0.3187451066955925,
+      "grad_norm": 0.004315240308642387,
+      "learning_rate": 0.001,
+      "loss": 0.3765,
+      "step": 11552
+    },
+    {
+      "epoch": 0.3187726988966569,
+      "grad_norm": 0.003803263884037733,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 11553
+    },
+    {
+      "epoch": 0.31880029109772123,
+      "grad_norm": 0.004018161911517382,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 11554
+    },
+    {
+      "epoch": 0.3188278832987856,
+      "grad_norm": 0.0036311408039182425,
+      "learning_rate": 0.001,
+      "loss": 0.3995,
+      "step": 11555
+    },
+    {
+      "epoch": 0.31885547549985,
+      "grad_norm": 0.004630459472537041,
+      "learning_rate": 0.001,
+      "loss": 0.4245,
+      "step": 11556
+    },
+    {
+      "epoch": 0.31888306770091435,
+      "grad_norm": 0.0048345946706831455,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 11557
+    },
+    {
+      "epoch": 0.3189106599019787,
+      "grad_norm": 0.005161342676728964,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 11558
+    },
+    {
+      "epoch": 0.31893825210304305,
+      "grad_norm": 0.0023576943203806877,
+      "learning_rate": 0.001,
+      "loss": 0.4402,
+      "step": 11559
+    },
+    {
+      "epoch": 0.31896584430410746,
+      "grad_norm": 0.004130503162741661,
+      "learning_rate": 0.001,
+      "loss": 0.3545,
+      "step": 11560
+    },
+    {
+      "epoch": 0.3189934365051718,
+      "grad_norm": 0.005308313295245171,
+      "learning_rate": 0.001,
+      "loss": 0.3788,
+      "step": 11561
+    },
+    {
+      "epoch": 0.31902102870623616,
+      "grad_norm": 0.0037142925430089235,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 11562
+    },
+    {
+      "epoch": 0.3190486209073006,
+      "grad_norm": 0.0037768762558698654,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 11563
+    },
+    {
+      "epoch": 0.3190762131083649,
+      "grad_norm": 0.004423100501298904,
+      "learning_rate": 0.001,
+      "loss": 0.3734,
+      "step": 11564
+    },
+    {
+      "epoch": 0.3191038053094293,
+      "grad_norm": 0.0066397832706570625,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 11565
+    },
+    {
+      "epoch": 0.3191313975104937,
+      "grad_norm": 0.003907995298504829,
+      "learning_rate": 0.001,
+      "loss": 0.3999,
+      "step": 11566
+    },
+    {
+      "epoch": 0.31915898971155804,
+      "grad_norm": 0.003112133825197816,
+      "learning_rate": 0.001,
+      "loss": 0.4219,
+      "step": 11567
+    },
+    {
+      "epoch": 0.3191865819126224,
+      "grad_norm": 0.002988254651427269,
+      "learning_rate": 0.001,
+      "loss": 0.3865,
+      "step": 11568
+    },
+    {
+      "epoch": 0.31921417411368674,
+      "grad_norm": 0.023827673867344856,
+      "learning_rate": 0.001,
+      "loss": 0.4147,
+      "step": 11569
+    },
+    {
+      "epoch": 0.31924176631475115,
+      "grad_norm": 0.0068152598105371,
+      "learning_rate": 0.001,
+      "loss": 0.433,
+      "step": 11570
+    },
+    {
+      "epoch": 0.3192693585158155,
+      "grad_norm": 0.004217714536935091,
+      "learning_rate": 0.001,
+      "loss": 0.3661,
+      "step": 11571
+    },
+    {
+      "epoch": 0.31929695071687986,
+      "grad_norm": 0.002245552372187376,
+      "learning_rate": 0.001,
+      "loss": 0.3537,
+      "step": 11572
+    },
+    {
+      "epoch": 0.31932454291794427,
+      "grad_norm": 0.0026237708516418934,
+      "learning_rate": 0.001,
+      "loss": 0.3694,
+      "step": 11573
+    },
+    {
+      "epoch": 0.3193521351190086,
+      "grad_norm": 0.0031141149811446667,
+      "learning_rate": 0.001,
+      "loss": 0.4304,
+      "step": 11574
+    },
+    {
+      "epoch": 0.31937972732007297,
+      "grad_norm": 0.004059195052832365,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 11575
+    },
+    {
+      "epoch": 0.3194073195211373,
+      "grad_norm": 0.002976267132908106,
+      "learning_rate": 0.001,
+      "loss": 0.4212,
+      "step": 11576
+    },
+    {
+      "epoch": 0.31943491172220173,
+      "grad_norm": 0.0048133903183043,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 11577
+    },
+    {
+      "epoch": 0.3194625039232661,
+      "grad_norm": 0.002898376202210784,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 11578
+    },
+    {
+      "epoch": 0.31949009612433044,
+      "grad_norm": 0.0033135826233774424,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 11579
+    },
+    {
+      "epoch": 0.31951768832539484,
+      "grad_norm": 0.005857154726982117,
+      "learning_rate": 0.001,
+      "loss": 0.3859,
+      "step": 11580
+    },
+    {
+      "epoch": 0.3195452805264592,
+      "grad_norm": 0.0028846007771790028,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 11581
+    },
+    {
+      "epoch": 0.31957287272752355,
+      "grad_norm": 0.002914070850238204,
+      "learning_rate": 0.001,
+      "loss": 0.3684,
+      "step": 11582
+    },
+    {
+      "epoch": 0.31960046492858796,
+      "grad_norm": 0.002994397422298789,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 11583
+    },
+    {
+      "epoch": 0.3196280571296523,
+      "grad_norm": 0.0033264674711972475,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 11584
+    },
+    {
+      "epoch": 0.31965564933071666,
+      "grad_norm": 0.002203144831582904,
+      "learning_rate": 0.001,
+      "loss": 0.4649,
+      "step": 11585
+    },
+    {
+      "epoch": 0.319683241531781,
+      "grad_norm": 0.004528568591922522,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 11586
+    },
+    {
+      "epoch": 0.3197108337328454,
+      "grad_norm": 0.003971675876528025,
+      "learning_rate": 0.001,
+      "loss": 0.387,
+      "step": 11587
+    },
+    {
+      "epoch": 0.3197384259339098,
+      "grad_norm": 0.003969928715378046,
+      "learning_rate": 0.001,
+      "loss": 0.3971,
+      "step": 11588
+    },
+    {
+      "epoch": 0.31976601813497413,
+      "grad_norm": 0.0036615943536162376,
+      "learning_rate": 0.001,
+      "loss": 0.3822,
+      "step": 11589
+    },
+    {
+      "epoch": 0.31979361033603854,
+      "grad_norm": 0.0023915197234600782,
+      "learning_rate": 0.001,
+      "loss": 0.4089,
+      "step": 11590
+    },
+    {
+      "epoch": 0.3198212025371029,
+      "grad_norm": 0.0024923242162913084,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 11591
+    },
+    {
+      "epoch": 0.31984879473816724,
+      "grad_norm": 0.002602427499368787,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 11592
+    },
+    {
+      "epoch": 0.31987638693923165,
+      "grad_norm": 0.005880540236830711,
+      "learning_rate": 0.001,
+      "loss": 0.3874,
+      "step": 11593
+    },
+    {
+      "epoch": 0.319903979140296,
+      "grad_norm": 0.00876854732632637,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 11594
+    },
+    {
+      "epoch": 0.31993157134136035,
+      "grad_norm": 0.006111782975494862,
+      "learning_rate": 0.001,
+      "loss": 0.4055,
+      "step": 11595
+    },
+    {
+      "epoch": 0.3199591635424247,
+      "grad_norm": 0.006039989646524191,
+      "learning_rate": 0.001,
+      "loss": 0.4061,
+      "step": 11596
+    },
+    {
+      "epoch": 0.3199867557434891,
+      "grad_norm": 0.008590095676481724,
+      "learning_rate": 0.001,
+      "loss": 0.3448,
+      "step": 11597
+    },
+    {
+      "epoch": 0.32001434794455347,
+      "grad_norm": 0.015925157815217972,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 11598
+    },
+    {
+      "epoch": 0.3200419401456178,
+      "grad_norm": 0.003673270810395479,
+      "learning_rate": 0.001,
+      "loss": 0.3928,
+      "step": 11599
+    },
+    {
+      "epoch": 0.32006953234668223,
+      "grad_norm": 0.0026890907902270555,
+      "learning_rate": 0.001,
+      "loss": 0.4251,
+      "step": 11600
+    },
+    {
+      "epoch": 0.3200971245477466,
+      "grad_norm": 0.0037518092431128025,
+      "learning_rate": 0.001,
+      "loss": 0.4057,
+      "step": 11601
+    },
+    {
+      "epoch": 0.32012471674881093,
+      "grad_norm": 0.003528765868395567,
+      "learning_rate": 0.001,
+      "loss": 0.4144,
+      "step": 11602
+    },
+    {
+      "epoch": 0.32015230894987534,
+      "grad_norm": 0.0036971168592572212,
+      "learning_rate": 0.001,
+      "loss": 0.3964,
+      "step": 11603
+    },
+    {
+      "epoch": 0.3201799011509397,
+      "grad_norm": 0.003124906914308667,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 11604
+    },
+    {
+      "epoch": 0.32020749335200405,
+      "grad_norm": 0.0033891245257109404,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 11605
+    },
+    {
+      "epoch": 0.3202350855530684,
+      "grad_norm": 0.002698018215596676,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 11606
+    },
+    {
+      "epoch": 0.3202626777541328,
+      "grad_norm": 0.0034322156570851803,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 11607
+    },
+    {
+      "epoch": 0.32029026995519716,
+      "grad_norm": 0.0033172587864100933,
+      "learning_rate": 0.001,
+      "loss": 0.3796,
+      "step": 11608
+    },
+    {
+      "epoch": 0.3203178621562615,
+      "grad_norm": 0.0030292505398392677,
+      "learning_rate": 0.001,
+      "loss": 0.3811,
+      "step": 11609
+    },
+    {
+      "epoch": 0.3203454543573259,
+      "grad_norm": 0.0033198799937963486,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 11610
+    },
+    {
+      "epoch": 0.3203730465583903,
+      "grad_norm": 0.003341773757711053,
+      "learning_rate": 0.001,
+      "loss": 0.3965,
+      "step": 11611
+    },
+    {
+      "epoch": 0.3204006387594546,
+      "grad_norm": 0.004839817062020302,
+      "learning_rate": 0.001,
+      "loss": 0.3685,
+      "step": 11612
+    },
+    {
+      "epoch": 0.32042823096051903,
+      "grad_norm": 0.0031633905600756407,
+      "learning_rate": 0.001,
+      "loss": 0.3595,
+      "step": 11613
+    },
+    {
+      "epoch": 0.3204558231615834,
+      "grad_norm": 0.011446718126535416,
+      "learning_rate": 0.001,
+      "loss": 0.4126,
+      "step": 11614
+    },
+    {
+      "epoch": 0.32048341536264774,
+      "grad_norm": 0.011381502263247967,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 11615
+    },
+    {
+      "epoch": 0.3205110075637121,
+      "grad_norm": 0.013346023857593536,
+      "learning_rate": 0.001,
+      "loss": 0.4054,
+      "step": 11616
+    },
+    {
+      "epoch": 0.3205385997647765,
+      "grad_norm": 0.004878256935626268,
+      "learning_rate": 0.001,
+      "loss": 0.4247,
+      "step": 11617
+    },
+    {
+      "epoch": 0.32056619196584085,
+      "grad_norm": 0.005075674969702959,
+      "learning_rate": 0.001,
+      "loss": 0.406,
+      "step": 11618
+    },
+    {
+      "epoch": 0.3205937841669052,
+      "grad_norm": 0.005384589079767466,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 11619
+    },
+    {
+      "epoch": 0.3206213763679696,
+      "grad_norm": 0.004261939786374569,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 11620
+    },
+    {
+      "epoch": 0.32064896856903397,
+      "grad_norm": 0.008900157175958157,
+      "learning_rate": 0.001,
+      "loss": 0.4182,
+      "step": 11621
+    },
+    {
+      "epoch": 0.3206765607700983,
+      "grad_norm": 0.0024166591465473175,
+      "learning_rate": 0.001,
+      "loss": 0.4527,
+      "step": 11622
+    },
+    {
+      "epoch": 0.3207041529711627,
+      "grad_norm": 0.0027629456017166376,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 11623
+    },
+    {
+      "epoch": 0.3207317451722271,
+      "grad_norm": 0.002562351291999221,
+      "learning_rate": 0.001,
+      "loss": 0.3683,
+      "step": 11624
+    },
+    {
+      "epoch": 0.32075933737329143,
+      "grad_norm": 0.0035787278320640326,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 11625
+    },
+    {
+      "epoch": 0.3207869295743558,
+      "grad_norm": 0.0025681303814053535,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 11626
+    },
+    {
+      "epoch": 0.3208145217754202,
+      "grad_norm": 0.003248112043365836,
+      "learning_rate": 0.001,
+      "loss": 0.4581,
+      "step": 11627
+    },
+    {
+      "epoch": 0.32084211397648454,
+      "grad_norm": 0.001800362253561616,
+      "learning_rate": 0.001,
+      "loss": 0.4295,
+      "step": 11628
+    },
+    {
+      "epoch": 0.3208697061775489,
+      "grad_norm": 0.002697241958230734,
+      "learning_rate": 0.001,
+      "loss": 0.3712,
+      "step": 11629
+    },
+    {
+      "epoch": 0.3208972983786133,
+      "grad_norm": 0.0029813311994075775,
+      "learning_rate": 0.001,
+      "loss": 0.3826,
+      "step": 11630
+    },
+    {
+      "epoch": 0.32092489057967766,
+      "grad_norm": 0.006515763234347105,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 11631
+    },
+    {
+      "epoch": 0.320952482780742,
+      "grad_norm": 0.002813748549669981,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 11632
+    },
+    {
+      "epoch": 0.3209800749818064,
+      "grad_norm": 0.00456245755776763,
+      "learning_rate": 0.001,
+      "loss": 0.4173,
+      "step": 11633
+    },
+    {
+      "epoch": 0.32100766718287077,
+      "grad_norm": 0.0027176521252840757,
+      "learning_rate": 0.001,
+      "loss": 0.4196,
+      "step": 11634
+    },
+    {
+      "epoch": 0.3210352593839351,
+      "grad_norm": 0.003955816384404898,
+      "learning_rate": 0.001,
+      "loss": 0.4418,
+      "step": 11635
+    },
+    {
+      "epoch": 0.3210628515849995,
+      "grad_norm": 0.005285539198666811,
+      "learning_rate": 0.001,
+      "loss": 0.3808,
+      "step": 11636
+    },
+    {
+      "epoch": 0.3210904437860639,
+      "grad_norm": 0.0025900506880134344,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 11637
+    },
+    {
+      "epoch": 0.32111803598712824,
+      "grad_norm": 0.006391063332557678,
+      "learning_rate": 0.001,
+      "loss": 0.4104,
+      "step": 11638
+    },
+    {
+      "epoch": 0.3211456281881926,
+      "grad_norm": 0.0032550417818129063,
+      "learning_rate": 0.001,
+      "loss": 0.3613,
+      "step": 11639
+    },
+    {
+      "epoch": 0.321173220389257,
+      "grad_norm": 0.0030542064923793077,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 11640
+    },
+    {
+      "epoch": 0.32120081259032135,
+      "grad_norm": 0.0037448366638273,
+      "learning_rate": 0.001,
+      "loss": 0.3858,
+      "step": 11641
+    },
+    {
+      "epoch": 0.3212284047913857,
+      "grad_norm": 0.003444848582148552,
+      "learning_rate": 0.001,
+      "loss": 0.4203,
+      "step": 11642
+    },
+    {
+      "epoch": 0.3212559969924501,
+      "grad_norm": 0.004159749951213598,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 11643
+    },
+    {
+      "epoch": 0.32128358919351446,
+      "grad_norm": 0.002784531796351075,
+      "learning_rate": 0.001,
+      "loss": 0.4541,
+      "step": 11644
+    },
+    {
+      "epoch": 0.3213111813945788,
+      "grad_norm": 0.004061343614012003,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 11645
+    },
+    {
+      "epoch": 0.32133877359564317,
+      "grad_norm": 0.003336769063025713,
+      "learning_rate": 0.001,
+      "loss": 0.3717,
+      "step": 11646
+    },
+    {
+      "epoch": 0.3213663657967076,
+      "grad_norm": 0.002995783928781748,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 11647
+    },
+    {
+      "epoch": 0.32139395799777193,
+      "grad_norm": 0.0039530228823423386,
+      "learning_rate": 0.001,
+      "loss": 0.3861,
+      "step": 11648
+    },
+    {
+      "epoch": 0.3214215501988363,
+      "grad_norm": 0.0023765622172504663,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 11649
+    },
+    {
+      "epoch": 0.3214491423999007,
+      "grad_norm": 0.006312607321888208,
+      "learning_rate": 0.001,
+      "loss": 0.3821,
+      "step": 11650
+    },
+    {
+      "epoch": 0.32147673460096504,
+      "grad_norm": 0.013422048650681973,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 11651
+    },
+    {
+      "epoch": 0.3215043268020294,
+      "grad_norm": 0.0030129451770335436,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 11652
+    },
+    {
+      "epoch": 0.3215319190030938,
+      "grad_norm": 0.003410701872780919,
+      "learning_rate": 0.001,
+      "loss": 0.3984,
+      "step": 11653
+    },
+    {
+      "epoch": 0.32155951120415815,
+      "grad_norm": 0.004033817909657955,
+      "learning_rate": 0.001,
+      "loss": 0.4192,
+      "step": 11654
+    },
+    {
+      "epoch": 0.3215871034052225,
+      "grad_norm": 0.01334238052368164,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 11655
+    },
+    {
+      "epoch": 0.32161469560628686,
+      "grad_norm": 0.0067011150531470776,
+      "learning_rate": 0.001,
+      "loss": 0.3896,
+      "step": 11656
+    },
+    {
+      "epoch": 0.32164228780735127,
+      "grad_norm": 0.022604364901781082,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 11657
+    },
+    {
+      "epoch": 0.3216698800084156,
+      "grad_norm": 0.00880459789186716,
+      "learning_rate": 0.001,
+      "loss": 0.4127,
+      "step": 11658
+    },
+    {
+      "epoch": 0.32169747220948,
+      "grad_norm": 0.00856684148311615,
+      "learning_rate": 0.001,
+      "loss": 0.4005,
+      "step": 11659
+    },
+    {
+      "epoch": 0.3217250644105444,
+      "grad_norm": 0.009844634681940079,
+      "learning_rate": 0.001,
+      "loss": 0.4195,
+      "step": 11660
+    },
+    {
+      "epoch": 0.32175265661160873,
+      "grad_norm": 0.002705506980419159,
+      "learning_rate": 0.001,
+      "loss": 0.4083,
+      "step": 11661
+    },
+    {
+      "epoch": 0.3217802488126731,
+      "grad_norm": 0.004265344236046076,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 11662
+    },
+    {
+      "epoch": 0.3218078410137375,
+      "grad_norm": 0.0042647975496947765,
+      "learning_rate": 0.001,
+      "loss": 0.3847,
+      "step": 11663
+    },
+    {
+      "epoch": 0.32183543321480185,
+      "grad_norm": 0.0029369243420660496,
+      "learning_rate": 0.001,
+      "loss": 0.4092,
+      "step": 11664
+    },
+    {
+      "epoch": 0.3218630254158662,
+      "grad_norm": 0.00970220472663641,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 11665
+    },
+    {
+      "epoch": 0.32189061761693055,
+      "grad_norm": 0.006463578902184963,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 11666
+    },
+    {
+      "epoch": 0.32191820981799496,
+      "grad_norm": 0.0018640676280483603,
+      "learning_rate": 0.001,
+      "loss": 0.4162,
+      "step": 11667
+    },
+    {
+      "epoch": 0.3219458020190593,
+      "grad_norm": 0.002777427202090621,
+      "learning_rate": 0.001,
+      "loss": 0.4171,
+      "step": 11668
+    },
+    {
+      "epoch": 0.32197339422012367,
+      "grad_norm": 0.004610841162502766,
+      "learning_rate": 0.001,
+      "loss": 0.3625,
+      "step": 11669
+    },
+    {
+      "epoch": 0.3220009864211881,
+      "grad_norm": 0.006540005095303059,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 11670
+    },
+    {
+      "epoch": 0.3220285786222524,
+      "grad_norm": 0.005402629263699055,
+      "learning_rate": 0.001,
+      "loss": 0.4004,
+      "step": 11671
+    },
+    {
+      "epoch": 0.3220561708233168,
+      "grad_norm": 0.00336782680824399,
+      "learning_rate": 0.001,
+      "loss": 0.4244,
+      "step": 11672
+    },
+    {
+      "epoch": 0.32208376302438113,
+      "grad_norm": 0.005407973658293486,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 11673
+    },
+    {
+      "epoch": 0.32211135522544554,
+      "grad_norm": 0.002696128562092781,
+      "learning_rate": 0.001,
+      "loss": 0.3825,
+      "step": 11674
+    },
+    {
+      "epoch": 0.3221389474265099,
+      "grad_norm": 0.0032647987827658653,
+      "learning_rate": 0.001,
+      "loss": 0.3783,
+      "step": 11675
+    },
+    {
+      "epoch": 0.32216653962757424,
+      "grad_norm": 0.005423584952950478,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 11676
+    },
+    {
+      "epoch": 0.32219413182863865,
+      "grad_norm": 0.003348333528265357,
+      "learning_rate": 0.001,
+      "loss": 0.3715,
+      "step": 11677
+    },
+    {
+      "epoch": 0.322221724029703,
+      "grad_norm": 0.002469028811901808,
+      "learning_rate": 0.001,
+      "loss": 0.3968,
+      "step": 11678
+    },
+    {
+      "epoch": 0.32224931623076736,
+      "grad_norm": 0.0023266442585736513,
+      "learning_rate": 0.001,
+      "loss": 0.398,
+      "step": 11679
+    },
+    {
+      "epoch": 0.32227690843183177,
+      "grad_norm": 0.003974389284849167,
+      "learning_rate": 0.001,
+      "loss": 0.379,
+      "step": 11680
+    },
+    {
+      "epoch": 0.3223045006328961,
+      "grad_norm": 0.002209985861554742,
+      "learning_rate": 0.001,
+      "loss": 0.4158,
+      "step": 11681
+    },
+    {
+      "epoch": 0.32233209283396047,
+      "grad_norm": 0.002755221212282777,
+      "learning_rate": 0.001,
+      "loss": 0.4107,
+      "step": 11682
+    },
+    {
+      "epoch": 0.3223596850350248,
+      "grad_norm": 0.003273082198575139,
+      "learning_rate": 0.001,
+      "loss": 0.3741,
+      "step": 11683
+    },
+    {
+      "epoch": 0.32238727723608923,
+      "grad_norm": 0.003454297548159957,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 11684
+    },
+    {
+      "epoch": 0.3224148694371536,
+      "grad_norm": 0.0033135886769741774,
+      "learning_rate": 0.001,
+      "loss": 0.4422,
+      "step": 11685
+    },
+    {
+      "epoch": 0.32244246163821794,
+      "grad_norm": 0.00300540286116302,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 11686
+    },
+    {
+      "epoch": 0.32247005383928234,
+      "grad_norm": 0.0025454771239310503,
+      "learning_rate": 0.001,
+      "loss": 0.4337,
+      "step": 11687
+    },
+    {
+      "epoch": 0.3224976460403467,
+      "grad_norm": 0.0030585303902626038,
+      "learning_rate": 0.001,
+      "loss": 0.4475,
+      "step": 11688
+    },
+    {
+      "epoch": 0.32252523824141105,
+      "grad_norm": 0.0024806379806250334,
+      "learning_rate": 0.001,
+      "loss": 0.3628,
+      "step": 11689
+    },
+    {
+      "epoch": 0.32255283044247546,
+      "grad_norm": 0.002650330774486065,
+      "learning_rate": 0.001,
+      "loss": 0.4029,
+      "step": 11690
+    },
+    {
+      "epoch": 0.3225804226435398,
+      "grad_norm": 0.003308363724499941,
+      "learning_rate": 0.001,
+      "loss": 0.4157,
+      "step": 11691
+    },
+    {
+      "epoch": 0.32260801484460416,
+      "grad_norm": 0.0027979854494333267,
+      "learning_rate": 0.001,
+      "loss": 0.424,
+      "step": 11692
+    },
+    {
+      "epoch": 0.3226356070456685,
+      "grad_norm": 0.0035791087429970503,
+      "learning_rate": 0.001,
+      "loss": 0.3991,
+      "step": 11693
+    },
+    {
+      "epoch": 0.3226631992467329,
+      "grad_norm": 0.004163493402302265,
+      "learning_rate": 0.001,
+      "loss": 0.3944,
+      "step": 11694
+    },
+    {
+      "epoch": 0.3226907914477973,
+      "grad_norm": 0.0028729864861816168,
+      "learning_rate": 0.001,
+      "loss": 0.4305,
+      "step": 11695
+    },
+    {
+      "epoch": 0.32271838364886163,
+      "grad_norm": 0.018418997526168823,
+      "learning_rate": 0.001,
+      "loss": 0.4063,
+      "step": 11696
+    },
+    {
+      "epoch": 0.32274597584992604,
+      "grad_norm": 0.0038654280360788107,
+      "learning_rate": 0.001,
+      "loss": 0.3831,
+      "step": 11697
+    },
+    {
+      "epoch": 0.3227735680509904,
+      "grad_norm": 0.002807804848998785,
+      "learning_rate": 0.001,
+      "loss": 0.4042,
+      "step": 11698
+    },
+    {
+      "epoch": 0.32280116025205474,
+      "grad_norm": 0.003091311315074563,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 11699
+    },
+    {
+      "epoch": 0.32282875245311915,
+      "grad_norm": 0.002582644810900092,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 11700
+    },
+    {
+      "epoch": 0.3228563446541835,
+      "grad_norm": 0.0029077474027872086,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 11701
+    },
+    {
+      "epoch": 0.32288393685524786,
+      "grad_norm": 0.004372213501483202,
+      "learning_rate": 0.001,
+      "loss": 0.4026,
+      "step": 11702
+    },
+    {
+      "epoch": 0.3229115290563122,
+      "grad_norm": 0.005009527318179607,
+      "learning_rate": 0.001,
+      "loss": 0.4202,
+      "step": 11703
+    },
+    {
+      "epoch": 0.3229391212573766,
+      "grad_norm": 0.002494605490937829,
+      "learning_rate": 0.001,
+      "loss": 0.4044,
+      "step": 11704
+    },
+    {
+      "epoch": 0.32296671345844097,
+      "grad_norm": 0.0030128166545182467,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 11705
+    },
+    {
+      "epoch": 0.3229943056595053,
+      "grad_norm": 0.005615463946014643,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 11706
+    },
+    {
+      "epoch": 0.32302189786056973,
+      "grad_norm": 0.0026556167285889387,
+      "learning_rate": 0.001,
+      "loss": 0.4277,
+      "step": 11707
+    },
+    {
+      "epoch": 0.3230494900616341,
+      "grad_norm": 0.004147225525230169,
+      "learning_rate": 0.001,
+      "loss": 0.4191,
+      "step": 11708
+    },
+    {
+      "epoch": 0.32307708226269843,
+      "grad_norm": 0.004111787304282188,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 11709
+    },
+    {
+      "epoch": 0.32310467446376284,
+      "grad_norm": 0.0030196192674338818,
+      "learning_rate": 0.001,
+      "loss": 0.4388,
+      "step": 11710
+    },
+    {
+      "epoch": 0.3231322666648272,
+      "grad_norm": 0.0029588905163109303,
+      "learning_rate": 0.001,
+      "loss": 0.4076,
+      "step": 11711
+    },
+    {
+      "epoch": 0.32315985886589155,
+      "grad_norm": 0.0030484474264085293,
+      "learning_rate": 0.001,
+      "loss": 0.4257,
+      "step": 11712
+    },
+    {
+      "epoch": 0.3231874510669559,
+      "grad_norm": 0.0031253311317414045,
+      "learning_rate": 0.001,
+      "loss": 0.3743,
+      "step": 11713
+    },
+    {
+      "epoch": 0.3232150432680203,
+      "grad_norm": 0.004489220213145018,
+      "learning_rate": 0.001,
+      "loss": 0.3902,
+      "step": 11714
+    },
+    {
+      "epoch": 0.32324263546908466,
+      "grad_norm": 0.0034049181267619133,
+      "learning_rate": 0.001,
+      "loss": 0.4251,
+      "step": 11715
+    },
+    {
+      "epoch": 0.323270227670149,
+      "grad_norm": 0.0045754867605865,
+      "learning_rate": 0.001,
+      "loss": 0.3577,
+      "step": 11716
+    },
+    {
+      "epoch": 0.3232978198712134,
+      "grad_norm": 0.0035644634626805782,
+      "learning_rate": 0.001,
+      "loss": 0.4277,
+      "step": 11717
+    },
+    {
+      "epoch": 0.3233254120722778,
+      "grad_norm": 0.0033633566927164793,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 11718
+    },
+    {
+      "epoch": 0.3233530042733421,
+      "grad_norm": 0.003968440927565098,
+      "learning_rate": 0.001,
+      "loss": 0.3516,
+      "step": 11719
+    },
+    {
+      "epoch": 0.32338059647440653,
+      "grad_norm": 0.003708978882059455,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 11720
+    },
+    {
+      "epoch": 0.3234081886754709,
+      "grad_norm": 0.002603841945528984,
+      "learning_rate": 0.001,
+      "loss": 0.4278,
+      "step": 11721
+    },
+    {
+      "epoch": 0.32343578087653524,
+      "grad_norm": 0.0024553535040467978,
+      "learning_rate": 0.001,
+      "loss": 0.3641,
+      "step": 11722
+    },
+    {
+      "epoch": 0.3234633730775996,
+      "grad_norm": 0.0033427192829549313,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 11723
+    },
+    {
+      "epoch": 0.323490965278664,
+      "grad_norm": 0.008662039414048195,
+      "learning_rate": 0.001,
+      "loss": 0.4312,
+      "step": 11724
+    },
+    {
+      "epoch": 0.32351855747972835,
+      "grad_norm": 0.002312082564458251,
+      "learning_rate": 0.001,
+      "loss": 0.4505,
+      "step": 11725
+    },
+    {
+      "epoch": 0.3235461496807927,
+      "grad_norm": 0.0026776569429785013,
+      "learning_rate": 0.001,
+      "loss": 0.3886,
+      "step": 11726
+    },
+    {
+      "epoch": 0.3235737418818571,
+      "grad_norm": 0.003623252734541893,
+      "learning_rate": 0.001,
+      "loss": 0.4441,
+      "step": 11727
+    },
+    {
+      "epoch": 0.32360133408292147,
+      "grad_norm": 0.004855128470808268,
+      "learning_rate": 0.001,
+      "loss": 0.4372,
+      "step": 11728
+    },
+    {
+      "epoch": 0.3236289262839858,
+      "grad_norm": 0.0022675027139484882,
+      "learning_rate": 0.001,
+      "loss": 0.4128,
+      "step": 11729
+    },
+    {
+      "epoch": 0.3236565184850502,
+      "grad_norm": 0.003087605582550168,
+      "learning_rate": 0.001,
+      "loss": 0.3768,
+      "step": 11730
+    },
+    {
+      "epoch": 0.3236841106861146,
+      "grad_norm": 0.003196093952283263,
+      "learning_rate": 0.001,
+      "loss": 0.411,
+      "step": 11731
+    },
+    {
+      "epoch": 0.32371170288717893,
+      "grad_norm": 0.002098439959809184,
+      "learning_rate": 0.001,
+      "loss": 0.4549,
+      "step": 11732
+    },
+    {
+      "epoch": 0.3237392950882433,
+      "grad_norm": 0.0027286570984870195,
+      "learning_rate": 0.001,
+      "loss": 0.4001,
+      "step": 11733
+    },
+    {
+      "epoch": 0.3237668872893077,
+      "grad_norm": 0.0031044287607073784,
+      "learning_rate": 0.001,
+      "loss": 0.4143,
+      "step": 11734
+    },
+    {
+      "epoch": 0.32379447949037204,
+      "grad_norm": 0.002601184183731675,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 11735
+    },
+    {
+      "epoch": 0.3238220716914364,
+      "grad_norm": 0.003185172798112035,
+      "learning_rate": 0.001,
+      "loss": 0.37,
+      "step": 11736
+    },
+    {
+      "epoch": 0.3238496638925008,
+      "grad_norm": 0.0032646963372826576,
+      "learning_rate": 0.001,
+      "loss": 0.4073,
+      "step": 11737
+    },
+    {
+      "epoch": 0.32387725609356516,
+      "grad_norm": 0.004382995422929525,
+      "learning_rate": 0.001,
+      "loss": 0.3969,
+      "step": 11738
+    },
+    {
+      "epoch": 0.3239048482946295,
+      "grad_norm": 0.004330406431108713,
+      "learning_rate": 0.001,
+      "loss": 0.3801,
+      "step": 11739
+    },
+    {
+      "epoch": 0.3239324404956939,
+      "grad_norm": 0.01158521044999361,
+      "learning_rate": 0.001,
+      "loss": 0.3747,
+      "step": 11740
+    },
+    {
+      "epoch": 0.32396003269675827,
+      "grad_norm": 0.002962679835036397,
+      "learning_rate": 0.001,
+      "loss": 0.4056,
+      "step": 11741
+    },
+    {
+      "epoch": 0.3239876248978226,
+      "grad_norm": 0.0029328917153179646,
+      "learning_rate": 0.001,
+      "loss": 0.4242,
+      "step": 11742
+    },
+    {
+      "epoch": 0.324015217098887,
+      "grad_norm": 0.004508418962359428,
+      "learning_rate": 0.001,
+      "loss": 0.4069,
+      "step": 11743
+    },
+    {
+      "epoch": 0.3240428092999514,
+      "grad_norm": 0.0027705549728125334,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 11744
+    },
+    {
+      "epoch": 0.32407040150101574,
+      "grad_norm": 0.0022474832367151976,
+      "learning_rate": 0.001,
+      "loss": 0.4098,
+      "step": 11745
+    },
+    {
+      "epoch": 0.3240979937020801,
+      "grad_norm": 0.003890387015417218,
+      "learning_rate": 0.001,
+      "loss": 0.3878,
+      "step": 11746
+    },
+    {
+      "epoch": 0.3241255859031445,
+      "grad_norm": 0.0027710788417607546,
+      "learning_rate": 0.001,
+      "loss": 0.3838,
+      "step": 11747
+    },
+    {
+      "epoch": 0.32415317810420885,
+      "grad_norm": 0.002632656367495656,
+      "learning_rate": 0.001,
+      "loss": 0.3931,
+      "step": 11748
+    },
+    {
+      "epoch": 0.3241807703052732,
+      "grad_norm": 0.0026942237745970488,
+      "learning_rate": 0.001,
+      "loss": 0.3633,
+      "step": 11749
+    },
+    {
+      "epoch": 0.3242083625063376,
+      "grad_norm": 0.0025526436511427164,
+      "learning_rate": 0.001,
+      "loss": 0.404,
+      "step": 11750
+    },
+    {
+      "epoch": 0.32423595470740196,
+      "grad_norm": 0.002429589629173279,
+      "learning_rate": 0.001,
+      "loss": 0.3975,
+      "step": 11751
+    },
+    {
+      "epoch": 0.3242635469084663,
+      "grad_norm": 0.003830046160146594,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 11752
+    },
+    {
+      "epoch": 0.32429113910953067,
+      "grad_norm": 0.003458908060565591,
+      "learning_rate": 0.001,
+      "loss": 0.4292,
+      "step": 11753
+    },
+    {
+      "epoch": 0.3243187313105951,
+      "grad_norm": 0.002460696967318654,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 11754
+    },
+    {
+      "epoch": 0.32434632351165943,
+      "grad_norm": 0.004534050356596708,
+      "learning_rate": 0.001,
+      "loss": 0.4216,
+      "step": 11755
+    },
+    {
+      "epoch": 0.3243739157127238,
+      "grad_norm": 0.0023591818753629923,
+      "learning_rate": 0.001,
+      "loss": 0.3782,
+      "step": 11756
+    },
+    {
+      "epoch": 0.3244015079137882,
+      "grad_norm": 0.0029876313637942076,
+      "learning_rate": 0.001,
+      "loss": 0.3675,
+      "step": 11757
+    },
+    {
+      "epoch": 0.32442910011485254,
+      "grad_norm": 0.003414266975596547,
+      "learning_rate": 0.001,
+      "loss": 0.3941,
+      "step": 11758
+    },
+    {
+      "epoch": 0.3244566923159169,
+      "grad_norm": 0.004012831952422857,
+      "learning_rate": 0.001,
+      "loss": 0.3652,
+      "step": 11759
+    },
+    {
+      "epoch": 0.3244842845169813,
+      "grad_norm": 0.0034918745514005423,
+      "learning_rate": 0.001,
+      "loss": 0.419,
+      "step": 11760
+    },
+    {
+      "epoch": 0.32451187671804566,
+      "grad_norm": 0.002764191012829542,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 11761
+    },
+    {
+      "epoch": 0.32453946891911,
+      "grad_norm": 0.003841676749289036,
+      "learning_rate": 0.001,
+      "loss": 0.3758,
+      "step": 11762
+    },
+    {
+      "epoch": 0.32456706112017436,
+      "grad_norm": 0.004220995120704174,
+      "learning_rate": 0.001,
+      "loss": 0.422,
+      "step": 11763
+    },
+    {
+      "epoch": 0.32459465332123877,
+      "grad_norm": 0.003612626576796174,
+      "learning_rate": 0.001,
+      "loss": 0.4017,
+      "step": 11764
+    },
+    {
+      "epoch": 0.3246222455223031,
+      "grad_norm": 0.0025354884564876556,
+      "learning_rate": 0.001,
+      "loss": 0.3927,
+      "step": 11765
+    },
+    {
+      "epoch": 0.3246498377233675,
+      "grad_norm": 0.003477931022644043,
+      "learning_rate": 0.001,
+      "loss": 0.3919,
+      "step": 11766
+    },
+    {
+      "epoch": 0.3246774299244319,
+      "grad_norm": 0.004952993709594011,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 11767
+    },
+    {
+      "epoch": 0.32470502212549623,
+      "grad_norm": 0.0031629884615540504,
+      "learning_rate": 0.001,
+      "loss": 0.358,
+      "step": 11768
+    },
+    {
+      "epoch": 0.3247326143265606,
+      "grad_norm": 0.002113747876137495,
+      "learning_rate": 0.001,
+      "loss": 0.4527,
+      "step": 11769
+    },
+    {
+      "epoch": 0.32476020652762494,
+      "grad_norm": 0.002285837195813656,
+      "learning_rate": 0.001,
+      "loss": 0.4368,
+      "step": 11770
+    },
+    {
+      "epoch": 0.32478779872868935,
+      "grad_norm": 0.003418036038056016,
+      "learning_rate": 0.001,
+      "loss": 0.4313,
+      "step": 11771
+    },
+    {
+      "epoch": 0.3248153909297537,
+      "grad_norm": 0.00575110362842679,
+      "learning_rate": 0.001,
+      "loss": 0.399,
+      "step": 11772
+    },
+    {
+      "epoch": 0.32484298313081805,
+      "grad_norm": 0.0031514798756688833,
+      "learning_rate": 0.001,
+      "loss": 0.4403,
+      "step": 11773
+    },
+    {
+      "epoch": 0.32487057533188246,
+      "grad_norm": 0.0029524280689656734,
+      "learning_rate": 0.001,
+      "loss": 0.3897,
+      "step": 11774
+    },
+    {
+      "epoch": 0.3248981675329468,
+      "grad_norm": 0.0038776015862822533,
+      "learning_rate": 0.001,
+      "loss": 0.4326,
+      "step": 11775
+    },
+    {
+      "epoch": 0.32492575973401117,
+      "grad_norm": 0.006274119019508362,
+      "learning_rate": 0.001,
+      "loss": 0.3959,
+      "step": 11776
+    },
+    {
+      "epoch": 0.3249533519350756,
+      "grad_norm": 0.00265169283375144,
+      "learning_rate": 0.001,
+      "loss": 0.395,
+      "step": 11777
+    },
+    {
+      "epoch": 0.3249809441361399,
+      "grad_norm": 0.0040013487450778484,
+      "learning_rate": 0.001,
+      "loss": 0.3982,
+      "step": 11778
+    },
+    {
+      "epoch": 0.3250085363372043,
+      "grad_norm": 0.004476572852581739,
+      "learning_rate": 0.001,
+      "loss": 0.426,
+      "step": 11779
+    },
+    {
+      "epoch": 0.32503612853826863,
+      "grad_norm": 0.00346595561131835,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 11780
+    },
+    {
+      "epoch": 0.32506372073933304,
+      "grad_norm": 0.004196068271994591,
+      "learning_rate": 0.001,
+      "loss": 0.3683,
+      "step": 11781
+    },
+    {
+      "epoch": 0.3250913129403974,
+      "grad_norm": 0.0036759015638381243,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 11782
+    },
+    {
+      "epoch": 0.32511890514146174,
+      "grad_norm": 0.0028389294166117907,
+      "learning_rate": 0.001,
+      "loss": 0.3608,
+      "step": 11783
+    },
+    {
+      "epoch": 0.32514649734252615,
+      "grad_norm": 0.00459603127092123,
+      "learning_rate": 0.001,
+      "loss": 0.4131,
+      "step": 11784
+    },
+    {
+      "epoch": 0.3251740895435905,
+      "grad_norm": 0.005619920324534178,
+      "learning_rate": 0.001,
+      "loss": 0.3996,
+      "step": 11785
+    },
+    {
+      "epoch": 0.32520168174465486,
+      "grad_norm": 0.009859694167971611,
+      "learning_rate": 0.001,
+      "loss": 0.3829,
+      "step": 11786
+    },
+    {
+      "epoch": 0.32522927394571927,
+      "grad_norm": 0.006388423964381218,
+      "learning_rate": 0.001,
+      "loss": 0.4,
+      "step": 11787
+    },
+    {
+      "epoch": 0.3252568661467836,
+      "grad_norm": 0.006722534541040659,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 11788
+    },
+    {
+      "epoch": 0.32528445834784797,
+      "grad_norm": 0.008275543339550495,
+      "learning_rate": 0.001,
+      "loss": 0.3877,
+      "step": 11789
+    },
+    {
+      "epoch": 0.3253120505489123,
+      "grad_norm": 0.004540057387202978,
+      "learning_rate": 0.001,
+      "loss": 0.43,
+      "step": 11790
+    },
+    {
+      "epoch": 0.32533964274997673,
+      "grad_norm": 0.0027223837096244097,
+      "learning_rate": 0.001,
+      "loss": 0.427,
+      "step": 11791
+    },
+    {
+      "epoch": 0.3253672349510411,
+      "grad_norm": 0.003166953567415476,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 11792
+    },
+    {
+      "epoch": 0.32539482715210544,
+      "grad_norm": 0.004159968346357346,
+      "learning_rate": 0.001,
+      "loss": 0.3904,
+      "step": 11793
+    },
+    {
+      "epoch": 0.32542241935316985,
+      "grad_norm": 0.002168782986700535,
+      "learning_rate": 0.001,
+      "loss": 0.4455,
+      "step": 11794
+    },
+    {
+      "epoch": 0.3254500115542342,
+      "grad_norm": 0.0049192942678928375,
+      "learning_rate": 0.001,
+      "loss": 0.3977,
+      "step": 11795
+    },
+    {
+      "epoch": 0.32547760375529855,
+      "grad_norm": 0.0027885152958333492,
+      "learning_rate": 0.001,
+      "loss": 0.3801,
+      "step": 11796
+    },
+    {
+      "epoch": 0.32550519595636296,
+      "grad_norm": 0.005841918755322695,
+      "learning_rate": 0.001,
+      "loss": 0.3976,
+      "step": 11797
+    },
+    {
+      "epoch": 0.3255327881574273,
+      "grad_norm": 0.00430692546069622,
+      "learning_rate": 0.001,
+      "loss": 0.4142,
+      "step": 11798
+    },
+    {
+      "epoch": 0.32556038035849166,
+      "grad_norm": 0.003325339872390032,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 11799
+    },
+    {
+      "epoch": 0.325587972559556,
+      "grad_norm": 0.0032596606761217117,
+      "learning_rate": 0.001,
+      "loss": 0.3914,
+      "step": 11800
+    },
+    {
+      "epoch": 0.3256155647606204,
+      "grad_norm": 0.004303377121686935,
+      "learning_rate": 0.001,
+      "loss": 0.3912,
+      "step": 11801
+    },
+    {
+      "epoch": 0.3256431569616848,
+      "grad_norm": 0.002758364425972104,
+      "learning_rate": 0.001,
+      "loss": 0.4049,
+      "step": 11802
+    },
+    {
+      "epoch": 0.32567074916274913,
+      "grad_norm": 0.00543051864951849,
+      "learning_rate": 0.001,
+      "loss": 0.3508,
+      "step": 11803
+    },
+    {
+      "epoch": 0.32569834136381354,
+      "grad_norm": 0.003291371511295438,
+      "learning_rate": 0.001,
+      "loss": 0.4341,
+      "step": 11804
+    },
+    {
+      "epoch": 0.3257259335648779,
+      "grad_norm": 0.0028754028026014566,
+      "learning_rate": 0.001,
+      "loss": 0.3812,
+      "step": 11805
+    },
+    {
+      "epoch": 0.32575352576594224,
+      "grad_norm": 0.002680160803720355,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 11806
+    },
+    {
+      "epoch": 0.32578111796700665,
+      "grad_norm": 0.0026779999025166035,
+      "learning_rate": 0.001,
+      "loss": 0.3638,
+      "step": 11807
+    },
+    {
+      "epoch": 0.325808710168071,
+      "grad_norm": 0.0029692722018808126,
+      "learning_rate": 0.001,
+      "loss": 0.3997,
+      "step": 11808
+    },
+    {
+      "epoch": 0.32583630236913536,
+      "grad_norm": 0.00280319363810122,
+      "learning_rate": 0.001,
+      "loss": 0.4106,
+      "step": 11809
+    },
+    {
+      "epoch": 0.3258638945701997,
+      "grad_norm": 0.002986667212098837,
+      "learning_rate": 0.001,
+      "loss": 0.4094,
+      "step": 11810
+    },
+    {
+      "epoch": 0.3258914867712641,
+      "grad_norm": 0.003013553563505411,
+      "learning_rate": 0.001,
+      "loss": 0.4467,
+      "step": 11811
+    },
+    {
+      "epoch": 0.32591907897232847,
+      "grad_norm": 0.002967800246551633,
+      "learning_rate": 0.001,
+      "loss": 0.4084,
+      "step": 11812
+    },
+    {
+      "epoch": 0.3259466711733928,
+      "grad_norm": 0.0031372224912047386,
+      "learning_rate": 0.001,
+      "loss": 0.3709,
+      "step": 11813
+    },
+    {
+      "epoch": 0.32597426337445723,
+      "grad_norm": 0.0031039847526699305,
+      "learning_rate": 0.001,
+      "loss": 0.3604,
+      "step": 11814
+    },
+    {
+      "epoch": 0.3260018555755216,
+      "grad_norm": 0.003049139864742756,
+      "learning_rate": 0.001,
+      "loss": 0.377,
+      "step": 11815
+    },
+    {
+      "epoch": 0.32602944777658593,
+      "grad_norm": 0.0029599741101264954,
+      "learning_rate": 0.001,
+      "loss": 0.4062,
+      "step": 11816
+    },
+    {
+      "epoch": 0.32605703997765034,
+      "grad_norm": 0.003724336624145508,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 11817
+    },
+    {
+      "epoch": 0.3260846321787147,
+      "grad_norm": 0.0028906487859785557,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 11818
+    },
+    {
+      "epoch": 0.32611222437977905,
+      "grad_norm": 0.003816339885815978,
+      "learning_rate": 0.001,
+      "loss": 0.3857,
+      "step": 11819
+    },
+    {
+      "epoch": 0.3261398165808434,
+      "grad_norm": 0.005886279512196779,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 11820
+    },
+    {
+      "epoch": 0.3261674087819078,
+      "grad_norm": 0.0029236190021038055,
+      "learning_rate": 0.001,
+      "loss": 0.4003,
+      "step": 11821
+    },
+    {
+      "epoch": 0.32619500098297216,
+      "grad_norm": 0.005116250831633806,
+      "learning_rate": 0.001,
+      "loss": 0.3837,
+      "step": 11822
+    },
+    {
+      "epoch": 0.3262225931840365,
+      "grad_norm": 0.0027175480499863625,
+      "learning_rate": 0.001,
+      "loss": 0.3772,
+      "step": 11823
+    },
+    {
+      "epoch": 0.3262501853851009,
+      "grad_norm": 0.005795364733785391,
+      "learning_rate": 0.001,
+      "loss": 0.3854,
+      "step": 11824
+    },
+    {
+      "epoch": 0.3262777775861653,
+      "grad_norm": 0.0028781124856323004,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 11825
+    },
+    {
+      "epoch": 0.3263053697872296,
+      "grad_norm": 0.008585303090512753,
+      "learning_rate": 0.001,
+      "loss": 0.3939,
+      "step": 11826
+    },
+    {
+      "epoch": 0.32633296198829403,
+      "grad_norm": 0.0027091645170003176,
+      "learning_rate": 0.001,
+      "loss": 0.3958,
+      "step": 11827
+    },
+    {
+      "epoch": 0.3263605541893584,
+      "grad_norm": 0.0028620229568332434,
+      "learning_rate": 0.001,
+      "loss": 0.4246,
+      "step": 11828
+    },
+    {
+      "epoch": 0.32638814639042274,
+      "grad_norm": 0.0028989813290536404,
+      "learning_rate": 0.001,
+      "loss": 0.3798,
+      "step": 11829
+    },
+    {
+      "epoch": 0.3264157385914871,
+      "grad_norm": 0.005880692508071661,
+      "learning_rate": 0.001,
+      "loss": 0.3887,
+      "step": 11830
+    },
+    {
+      "epoch": 0.3264433307925515,
+      "grad_norm": 0.005375871900469065,
+      "learning_rate": 0.001,
+      "loss": 0.3917,
+      "step": 11831
+    },
+    {
+      "epoch": 0.32647092299361585,
+      "grad_norm": 0.00993763655424118,
+      "learning_rate": 0.001,
+      "loss": 0.3602,
+      "step": 11832
+    },
+    {
+      "epoch": 0.3264985151946802,
+      "grad_norm": 0.004657833371311426,
+      "learning_rate": 0.001,
+      "loss": 0.408,
+      "step": 11833
+    },
+    {
+      "epoch": 0.3265261073957446,
+      "grad_norm": 0.0023800579365342855,
+      "learning_rate": 0.001,
+      "loss": 0.3682,
+      "step": 11834
+    },
+    {
+      "epoch": 0.32655369959680897,
+      "grad_norm": 0.003295092610642314,
+      "learning_rate": 0.001,
+      "loss": 0.405,
+      "step": 11835
+    },
+    {
+      "epoch": 0.3265812917978733,
+      "grad_norm": 0.0033699970226734877,
+      "learning_rate": 0.001,
+      "loss": 0.4249,
+      "step": 11836
+    },
+    {
+      "epoch": 0.3266088839989377,
+      "grad_norm": 0.0023069006856530905,
+      "learning_rate": 0.001,
+      "loss": 0.381,
+      "step": 11837
+    },
+    {
+      "epoch": 0.3266364762000021,
+      "grad_norm": 0.002622458152472973,
+      "learning_rate": 0.001,
+      "loss": 0.4079,
+      "step": 11838
+    },
+    {
+      "epoch": 0.32666406840106643,
+      "grad_norm": 0.0072345067746937275,
+      "learning_rate": 0.001,
+      "loss": 0.3669,
+      "step": 11839
+    },
+    {
+      "epoch": 0.3266916606021308,
+      "grad_norm": 0.002343923319131136,
+      "learning_rate": 0.001,
+      "loss": 0.4473,
+      "step": 11840
+    },
+    {
+      "epoch": 0.3267192528031952,
+      "grad_norm": 0.003922601230442524,
+      "learning_rate": 0.001,
+      "loss": 0.3833,
+      "step": 11841
+    },
+    {
+      "epoch": 0.32674684500425955,
+      "grad_norm": 0.002412164816632867,
+      "learning_rate": 0.001,
+      "loss": 0.43,
+      "step": 11842
+    },
+    {
+      "epoch": 0.3267744372053239,
+      "grad_norm": 0.004316416569054127,
+      "learning_rate": 0.001,
+      "loss": 0.4081,
+      "step": 11843
+    },
+    {
+      "epoch": 0.3268020294063883,
+      "grad_norm": 0.0025885479990392923,
+      "learning_rate": 0.001,
+      "loss": 0.3644,
+      "step": 11844
+    },
+    {
+      "epoch": 0.32682962160745266,
+      "grad_norm": 0.008287443779408932,
+      "learning_rate": 0.001,
+      "loss": 0.3932,
+      "step": 11845
+    },
+    {
+      "epoch": 0.326857213808517,
+      "grad_norm": 0.0027159906458109617,
+      "learning_rate": 0.001,
+      "loss": 0.4268,
+      "step": 11846
+    },
+    {
+      "epoch": 0.3268848060095814,
+      "grad_norm": 0.002340937964618206,
+      "learning_rate": 0.001,
+      "loss": 0.3868,
+      "step": 11847
+    },
+    {
+      "epoch": 0.32691239821064577,
+      "grad_norm": 0.0027317411731928587,
+      "learning_rate": 0.001,
+      "loss": 0.3732,
+      "step": 11848
+    },
+    {
+      "epoch": 0.3269399904117101,
+      "grad_norm": 0.0028033903799951077,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 11849
+    },
+    {
+      "epoch": 0.3269675826127745,
+      "grad_norm": 0.004709617234766483,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 11850
+    },
+    {
+      "epoch": 0.3269951748138389,
+      "grad_norm": 0.005631160456687212,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 11851
+    },
+    {
+      "epoch": 0.32702276701490324,
+      "grad_norm": 0.016221268102526665,
+      "learning_rate": 0.001,
+      "loss": 0.4047,
+      "step": 11852
+    },
+    {
+      "epoch": 0.3270503592159676,
+      "grad_norm": 0.0028514659497886896,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 11853
+    },
+    {
+      "epoch": 0.327077951417032,
+      "grad_norm": 0.00452115572988987,
+      "learning_rate": 0.001,
+      "loss": 0.3871,
+      "step": 11854
+    },
+    {
+      "epoch": 0.32710554361809635,
+      "grad_norm": 0.00457549886777997,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 11855
+    },
+    {
+      "epoch": 0.3271331358191607,
+      "grad_norm": 0.0027922464068979025,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 11856
+    },
+    {
+      "epoch": 0.3271607280202251,
+      "grad_norm": 0.005448077339679003,
+      "learning_rate": 0.001,
+      "loss": 0.3526,
+      "step": 11857
+    },
+    {
+      "epoch": 0.32718832022128946,
+      "grad_norm": 0.007846580818295479,
+      "learning_rate": 0.001,
+      "loss": 0.4052,
+      "step": 11858
+    },
+    {
+      "epoch": 0.3272159124223538,
+      "grad_norm": 0.0035153308417648077,
+      "learning_rate": 0.001,
+      "loss": 0.3719,
+      "step": 11859
+    },
+    {
+      "epoch": 0.32724350462341817,
+      "grad_norm": 0.0028227800503373146,
+      "learning_rate": 0.001,
+      "loss": 0.3836,
+      "step": 11860
+    },
+    {
+      "epoch": 0.3272710968244826,
+      "grad_norm": 0.004079067148268223,
+      "learning_rate": 0.001,
+      "loss": 0.3951,
+      "step": 11861
+    },
+    {
+      "epoch": 0.32729868902554693,
+      "grad_norm": 0.007391565479338169,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 11862
+    },
+    {
+      "epoch": 0.3273262812266113,
+      "grad_norm": 0.0028963603544980288,
+      "learning_rate": 0.001,
+      "loss": 0.4337,
+      "step": 11863
+    },
+    {
+      "epoch": 0.3273538734276757,
+      "grad_norm": 0.002462986158207059,
+      "learning_rate": 0.001,
+      "loss": 0.3778,
+      "step": 11864
+    },
+    {
+      "epoch": 0.32738146562874004,
+      "grad_norm": 0.0026601895224303007,
+      "learning_rate": 0.001,
+      "loss": 0.4071,
+      "step": 11865
+    },
+    {
+      "epoch": 0.3274090578298044,
+      "grad_norm": 0.0028002276085317135,
+      "learning_rate": 0.001,
+      "loss": 0.3891,
+      "step": 11866
+    },
+    {
+      "epoch": 0.32743665003086875,
+      "grad_norm": 0.0025977008044719696,
+      "learning_rate": 0.001,
+      "loss": 0.4384,
+      "step": 11867
+    },
+    {
+      "epoch": 0.32746424223193316,
+      "grad_norm": 0.0031972804572433233,
+      "learning_rate": 0.001,
+      "loss": 0.4012,
+      "step": 11868
+    },
+    {
+      "epoch": 0.3274918344329975,
+      "grad_norm": 0.0031719047110527754,
+      "learning_rate": 0.001,
+      "loss": 0.386,
+      "step": 11869
+    },
+    {
+      "epoch": 0.32751942663406186,
+      "grad_norm": 0.021741317585110664,
+      "learning_rate": 0.001,
+      "loss": 0.3954,
+      "step": 11870
+    },
+    {
+      "epoch": 0.32754701883512627,
+      "grad_norm": 0.005185617599636316,
+      "learning_rate": 0.001,
+      "loss": 0.3913,
+      "step": 11871
+    },
+    {
+      "epoch": 0.3275746110361906,
+      "grad_norm": 0.004073983523994684,
+      "learning_rate": 0.001,
+      "loss": 0.3319,
+      "step": 11872
+    },
+    {
+      "epoch": 0.327602203237255,
+      "grad_norm": 0.003672317136079073,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 11873
+    },
+    {
+      "epoch": 0.3276297954383194,
+      "grad_norm": 0.003211831208318472,
+      "learning_rate": 0.001,
+      "loss": 0.3866,
+      "step": 11874
+    },
+    {
+      "epoch": 0.32765738763938373,
+      "grad_norm": 0.005349930375814438,
+      "learning_rate": 0.001,
+      "loss": 0.3879,
+      "step": 11875
+    },
+    {
+      "epoch": 0.3276849798404481,
+      "grad_norm": 0.001996625680476427,
+      "learning_rate": 0.001,
+      "loss": 0.4215,
+      "step": 11876
+    },
+    {
+      "epoch": 0.32771257204151244,
+      "grad_norm": 0.0034636626951396465,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 11877
+    },
+    {
+      "epoch": 0.32774016424257685,
+      "grad_norm": 0.006974721793085337,
+      "learning_rate": 0.001,
+      "loss": 0.3961,
+      "step": 11878
+    },
+    {
+      "epoch": 0.3277677564436412,
+      "grad_norm": 0.004640425555408001,
+      "learning_rate": 0.001,
+      "loss": 0.4197,
+      "step": 11879
+    },
+    {
+      "epoch": 0.32779534864470555,
+      "grad_norm": 0.0025745509192347527,
+      "learning_rate": 0.001,
+      "loss": 0.3604,
+      "step": 11880
+    },
+    {
+      "epoch": 0.32782294084576996,
+      "grad_norm": 0.0033829696476459503,
+      "learning_rate": 0.001,
+      "loss": 0.3901,
+      "step": 11881
+    },
+    {
+      "epoch": 0.3278505330468343,
+      "grad_norm": 0.003960746806114912,
+      "learning_rate": 0.001,
+      "loss": 0.3708,
+      "step": 11882
+    },
+    {
+      "epoch": 0.32787812524789867,
+      "grad_norm": 0.006613335572183132,
+      "learning_rate": 0.001,
+      "loss": 0.3652,
+      "step": 11883
+    },
+    {
+      "epoch": 0.3279057174489631,
+      "grad_norm": 0.003319015959277749,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 11884
+    },
+    {
+      "epoch": 0.3279333096500274,
+      "grad_norm": 0.0034291057381778955,
+      "learning_rate": 0.001,
+      "loss": 0.393,
+      "step": 11885
+    },
+    {
+      "epoch": 0.3279609018510918,
+      "grad_norm": 0.0027653754223138094,
+      "learning_rate": 0.001,
+      "loss": 0.3873,
+      "step": 11886
+    },
+    {
+      "epoch": 0.32798849405215613,
+      "grad_norm": 0.00384811544790864,
+      "learning_rate": 0.001,
+      "loss": 0.3987,
+      "step": 11887
+    },
+    {
+      "epoch": 0.32801608625322054,
+      "grad_norm": 0.0028639482334256172,
+      "learning_rate": 0.001,
+      "loss": 0.3981,
+      "step": 11888
+    },
+    {
+      "epoch": 0.3280436784542849,
+      "grad_norm": 0.0026863133534789085,
+      "learning_rate": 0.001,
+      "loss": 0.4167,
+      "step": 11889
+    },
+    {
+      "epoch": 0.32807127065534925,
+      "grad_norm": 0.0028144351672381163,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 11890
+    },
+    {
+      "epoch": 0.32809886285641365,
+      "grad_norm": 0.010337802581489086,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 11891
+    },
+    {
+      "epoch": 0.328126455057478,
+      "grad_norm": 0.01729881763458252,
+      "learning_rate": 0.001,
+      "loss": 0.4123,
+      "step": 11892
+    },
+    {
+      "epoch": 0.32815404725854236,
+      "grad_norm": 0.01979982666671276,
+      "learning_rate": 0.001,
+      "loss": 0.3943,
+      "step": 11893
+    },
+    {
+      "epoch": 0.32818163945960677,
+      "grad_norm": 0.003908053506165743,
+      "learning_rate": 0.001,
+      "loss": 0.3937,
+      "step": 11894
+    },
+    {
+      "epoch": 0.3282092316606711,
+      "grad_norm": 0.0044098771177232265,
+      "learning_rate": 0.001,
+      "loss": 0.4198,
+      "step": 11895
+    },
+    {
+      "epoch": 0.32823682386173547,
+      "grad_norm": 0.0028675422072410583,
+      "learning_rate": 0.001,
+      "loss": 0.4231,
+      "step": 11896
+    },
+    {
+      "epoch": 0.3282644160627998,
+      "grad_norm": 0.006255595479160547,
+      "learning_rate": 0.001,
+      "loss": 0.3649,
+      "step": 11897
+    },
+    {
+      "epoch": 0.32829200826386423,
+      "grad_norm": 0.0025888034142553806,
+      "learning_rate": 0.001,
+      "loss": 0.4051,
+      "step": 11898
+    },
+    {
+      "epoch": 0.3283196004649286,
+      "grad_norm": 0.0021037342958152294,
+      "learning_rate": 0.001,
+      "loss": 0.4037,
+      "step": 11899
+    },
+    {
+      "epoch": 0.32834719266599294,
+      "grad_norm": 0.003093705978244543,
+      "learning_rate": 0.001,
+      "loss": 0.3568,
+      "step": 11900
+    },
+    {
+      "epoch": 0.32837478486705735,
+      "grad_norm": 0.0024948539212346077,
+      "learning_rate": 0.001,
+      "loss": 0.4112,
+      "step": 11901
+    },
+    {
+      "epoch": 0.3284023770681217,
+      "grad_norm": 0.0028005533386021852,
+      "learning_rate": 0.001,
+      "loss": 0.3761,
+      "step": 11902
+    },
+    {
+      "epoch": 0.32842996926918605,
+      "grad_norm": 0.006543453317135572,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 11903
+    },
+    {
+      "epoch": 0.32845756147025046,
+      "grad_norm": 0.0032540878746658564,
+      "learning_rate": 0.001,
+      "loss": 0.4036,
+      "step": 11904
+    },
+    {
+      "epoch": 0.3284851536713148,
+      "grad_norm": 0.017330944538116455,
+      "learning_rate": 0.001,
+      "loss": 0.392,
+      "step": 11905
+    },
+    {
+      "epoch": 0.32851274587237916,
+      "grad_norm": 0.015614539384841919,
+      "learning_rate": 0.001,
+      "loss": 0.3827,
+      "step": 11906
+    },
+    {
+      "epoch": 0.3285403380734435,
+      "grad_norm": 0.04733549430966377,
+      "learning_rate": 0.001,
+      "loss": 0.4134,
+      "step": 11907
+    },
+    {
+      "epoch": 0.3285679302745079,
+      "grad_norm": 0.062019579112529755,
+      "learning_rate": 0.001,
+      "loss": 0.371,
+      "step": 11908
+    },
+    {
+      "epoch": 0.3285955224755723,
+      "grad_norm": 0.008806095458567142,
+      "learning_rate": 0.001,
+      "loss": 0.4204,
+      "step": 11909
+    },
+    {
+      "epoch": 0.32862311467663663,
+      "grad_norm": 0.052441034466028214,
+      "learning_rate": 0.001,
+      "loss": 0.3895,
+      "step": 11910
+    },
+    {
+      "epoch": 0.32865070687770104,
+      "grad_norm": 0.004011763259768486,
+      "learning_rate": 0.001,
+      "loss": 0.3924,
+      "step": 11911
+    },
+    {
+      "epoch": 0.3286782990787654,
+      "grad_norm": 0.004210361745208502,
+      "learning_rate": 0.001,
+      "loss": 0.3974,
+      "step": 11912
+    },
+    {
+      "epoch": 0.32870589127982974,
+      "grad_norm": 0.009359506890177727,
+      "learning_rate": 0.001,
+      "loss": 0.352,
+      "step": 11913
+    },
+    {
+      "epoch": 0.32873348348089415,
+      "grad_norm": 0.007341593038290739,
+      "learning_rate": 0.001,
+      "loss": 0.3809,
+      "step": 11914
+    },
+    {
+      "epoch": 0.3287610756819585,
+      "grad_norm": 0.005077636335045099,
+      "learning_rate": 0.001,
+      "loss": 0.3935,
+      "step": 11915
+    },
+    {
+      "epoch": 0.32878866788302286,
+      "grad_norm": 0.01000258419662714,
+      "learning_rate": 0.001,
+      "loss": 0.407,
+      "step": 11916
+    },
+    {
+      "epoch": 0.3288162600840872,
+      "grad_norm": 0.030418379232287407,
+      "learning_rate": 0.001,
+      "loss": 0.3816,
+      "step": 11917
+    },
+    {
+      "epoch": 0.3288438522851516,
+      "grad_norm": 0.003933771047741175,
+      "learning_rate": 0.001,
+      "loss": 0.4188,
+      "step": 11918
+    },
+    {
+      "epoch": 0.32887144448621597,
+      "grad_norm": 0.002575295278802514,
+      "learning_rate": 0.001,
+      "loss": 0.4108,
+      "step": 11919
+    },
+    {
+      "epoch": 0.3288990366872803,
+      "grad_norm": 0.002374958945438266,
+      "learning_rate": 0.001,
+      "loss": 0.429,
+      "step": 11920
+    },
+    {
+      "epoch": 0.32892662888834473,
+      "grad_norm": 0.003249475732445717,
+      "learning_rate": 0.001,
+      "loss": 0.3692,
+      "step": 11921
+    },
+    {
+      "epoch": 0.3289542210894091,
+      "grad_norm": 0.002772397128865123,
+      "learning_rate": 0.001,
+      "loss": 0.3916,
+      "step": 11922
+    },
+    {
+      "epoch": 0.32898181329047343,
+      "grad_norm": 0.0042383186519145966,
+      "learning_rate": 0.001,
+      "loss": 0.3551,
+      "step": 11923
+    },
+    {
+      "epoch": 0.32900940549153784,
+      "grad_norm": 0.003252743510529399,
+      "learning_rate": 0.001,
+      "loss": 0.4146,
+      "step": 11924
+    },
+    {
+      "epoch": 0.3290369976926022,
+      "grad_norm": 0.0025390556547790766,
+      "learning_rate": 0.001,
+      "loss": 0.3918,
+      "step": 11925
+    },
+    {
+      "epoch": 0.32906458989366655,
+      "grad_norm": 0.002343092579394579,
+      "learning_rate": 0.001,
+      "loss": 0.4287,
+      "step": 11926
+    },
+    {
+      "epoch": 0.3290921820947309,
+      "grad_norm": 0.0033002211712300777,
+      "learning_rate": 0.001,
+      "loss": 0.3726,
+      "step": 11927
+    },
+    {
+      "epoch": 0.3291197742957953,
+      "grad_norm": 0.003043625270947814,
+      "learning_rate": 0.001,
+      "loss": 0.4038,
+      "step": 11928
+    },
+    {
+      "epoch": 0.32914736649685966,
+      "grad_norm": 0.006221500225365162,
+      "learning_rate": 0.001,
+      "loss": 0.4217,
+      "step": 11929
+    },
+    {
+      "epoch": 0.329174958697924,
+      "grad_norm": 0.004655203316360712,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 11930
+    },
+    {
+      "epoch": 0.3292025508989884,
+      "grad_norm": 0.0021311945747584105,
+      "learning_rate": 0.001,
+      "loss": 0.3988,
+      "step": 11931
+    },
+    {
+      "epoch": 0.3292301431000528,
+      "grad_norm": 0.0027006971649825573,
+      "learning_rate": 0.001,
+      "loss": 0.4346,
+      "step": 11932
+    },
+    {
+      "epoch": 0.3292577353011171,
+      "grad_norm": 0.002889286493882537,
+      "learning_rate": 0.001,
+      "loss": 0.3541,
+      "step": 11933
+    },
+    {
+      "epoch": 0.32928532750218154,
+      "grad_norm": 0.007051974534988403,
+      "learning_rate": 0.001,
+      "loss": 0.4284,
+      "step": 11934
+    },
+    {
+      "epoch": 0.3293129197032459,
+      "grad_norm": 0.002629111986607313,
+      "learning_rate": 0.001,
+      "loss": 0.409,
+      "step": 11935
+    },
+    {
+      "epoch": 0.32934051190431024,
+      "grad_norm": 0.004683708306401968,
+      "learning_rate": 0.001,
+      "loss": 0.3419,
+      "step": 11936
+    },
+    {
+      "epoch": 0.3293681041053746,
+      "grad_norm": 0.003245966276153922,
+      "learning_rate": 0.001,
+      "loss": 0.394,
+      "step": 11937
+    },
+    {
+      "epoch": 0.329395696306439,
+      "grad_norm": 0.0032094584312289953,
+      "learning_rate": 0.001,
+      "loss": 0.3986,
+      "step": 11938
+    },
+    {
+      "epoch": 0.32942328850750335,
+      "grad_norm": 0.0030683856457471848,
+      "learning_rate": 0.001,
+      "loss": 0.3863,
+      "step": 11939
+    },
+    {
+      "epoch": 0.3294508807085677,
+      "grad_norm": 0.0035426607355475426,
+      "learning_rate": 0.001,
+      "loss": 0.4418,
+      "step": 11940
+    },
+    {
+      "epoch": 0.3294784729096321,
+      "grad_norm": 0.003129177028313279,
+      "learning_rate": 0.001,
+      "loss": 0.3972,
+      "step": 11941
+    },
+    {
+      "epoch": 0.32950606511069647,
+      "grad_norm": 0.003153977682814002,
+      "learning_rate": 0.001,
+      "loss": 0.3795,
+      "step": 11942
+    },
+    {
+      "epoch": 0.3295336573117608,
+      "grad_norm": 0.0056450688280165195,
+      "learning_rate": 0.001,
+      "loss": 0.3759,
+      "step": 11943
+    },
+    {
+      "epoch": 0.3295612495128252,
+      "grad_norm": 0.0025312695652246475,
+      "learning_rate": 0.001,
+      "loss": 0.3729,
+      "step": 11944
+    },
+    {
+      "epoch": 0.3295888417138896,
+      "grad_norm": 0.0025766692124307156,
+      "learning_rate": 0.001,
+      "loss": 0.4299,
+      "step": 11945
+    },
+    {
+      "epoch": 0.32961643391495393,
+      "grad_norm": 0.005080906208604574,
+      "learning_rate": 0.001,
+      "loss": 0.4023,
+      "step": 11946
+    },
+    {
+      "epoch": 0.3296440261160183,
+      "grad_norm": 0.0058040316216647625,
+      "learning_rate": 0.001,
+      "loss": 0.3528,
+      "step": 11947
+    },
+    {
+      "epoch": 0.3296716183170827,
+      "grad_norm": 0.0035558012314140797,
+      "learning_rate": 0.001,
+      "loss": 0.3893,
+      "step": 11948
+    },
+    {
+      "epoch": 0.32969921051814705,
+      "grad_norm": 0.003042223397642374,
+      "learning_rate": 0.001,
+      "loss": 0.3936,
+      "step": 11949
+    },
+    {
+      "epoch": 0.3297268027192114,
+      "grad_norm": 0.0033221363555639982,
+      "learning_rate": 0.001,
+      "loss": 0.419,
+      "step": 11950
+    },
+    {
+      "epoch": 0.3297543949202758,
+      "grad_norm": 0.0032754812855273485,
+      "learning_rate": 0.001,
+      "loss": 0.3948,
+      "step": 11951
+    },
+    {
+      "epoch": 0.32978198712134016,
+      "grad_norm": 0.002569601172581315,
+      "learning_rate": 0.001,
+      "loss": 0.3701,
+      "step": 11952
+    },
+    {
+      "epoch": 0.3298095793224045,
+      "grad_norm": 0.003200158243998885,
+      "learning_rate": 0.001,
+      "loss": 0.4138,
+      "step": 11953
+    },
+    {
+      "epoch": 0.32983717152346886,
+      "grad_norm": 0.003082757582888007,
+      "learning_rate": 0.001,
+      "loss": 0.369,
+      "step": 11954
+    },
+    {
+      "epoch": 0.32986476372453327,
+      "grad_norm": 0.002875220263376832,
+      "learning_rate": 0.001,
+      "loss": 0.3803,
+      "step": 11955
+    },
+    {
+      "epoch": 0.3298923559255976,
+      "grad_norm": 0.002740328898653388,
+      "learning_rate": 0.001,
+      "loss": 0.3595,
+      "step": 11956
+    },
+    {
+      "epoch": 0.329919948126662,
+      "grad_norm": 0.004781671334058046,
+      "learning_rate": 0.001,
+      "loss": 0.4139,
+      "step": 11957
+    },
+    {
+      "epoch": 0.3299475403277264,
+      "grad_norm": 0.002997961826622486,
+      "learning_rate": 0.001,
+      "loss": 0.401,
+      "step": 11958
+    },
+    {
+      "epoch": 0.32997513252879074,
+      "grad_norm": 0.002518662018701434,
+      "learning_rate": 0.001,
+      "loss": 0.3957,
+      "step": 11959
+    },
+    {
+      "epoch": 0.3300027247298551,
+      "grad_norm": 0.0048488592728972435,
+      "learning_rate": 0.001,
+      "loss": 0.4122,
+      "step": 11960
+    },
+    {
+      "epoch": 0.3300303169309195,
+      "grad_norm": 0.005046727601438761,
+      "learning_rate": 0.001,
+      "loss": 0.388,
+      "step": 11961
+    },
+    {
+      "epoch": 0.33005790913198385,
+      "grad_norm": 0.0023717847652733326,
+      "learning_rate": 0.001,
+      "loss": 0.4206,
+      "step": 11962
+    },
+    {
+      "epoch": 0.3300855013330482,
+      "grad_norm": 0.002914504613727331,
+      "learning_rate": 0.001,
+      "loss": 0.4109,
+      "step": 11963
+    },
+    {
+      "epoch": 0.33011309353411256,
+      "grad_norm": 0.006090945564210415,
+      "learning_rate": 0.001,
+      "loss": 0.3835,
+      "step": 11964
+    },
+    {
+      "epoch": 0.33014068573517696,
+      "grad_norm": 0.0030452022328972816,
+      "learning_rate": 0.001,
+      "loss": 0.3956,
+      "step": 11965
+    },
+    {
+      "epoch": 0.3301682779362413,
+      "grad_norm": 0.0026935841888189316,
+      "learning_rate": 0.001,
+      "loss": 0.414,
+      "step": 11966
+    },
+    {
+      "epoch": 0.33019587013730567,
+      "grad_norm": 0.003028253326192498,
+      "learning_rate": 0.001,
+      "loss": 0.3754,
+      "step": 11967
+    },
+    {
+      "epoch": 0.3302234623383701,
+      "grad_norm": 0.004835574887692928,
+      "learning_rate": 0.001,
+      "loss": 0.3862,
+      "step": 11968
+    },
+    {
+      "epoch": 0.33025105453943443,
+      "grad_norm": 0.0029693404212594032,
+      "learning_rate": 0.001,
+      "loss": 0.403,
+      "step": 11969
+    },
+    {
+      "epoch": 0.3302786467404988,
+      "grad_norm": 0.0037660168018192053,
+      "learning_rate": 0.001,
+      "loss": 0.4376,
+      "step": 11970
+    },
+    {
+      "epoch": 0.3303062389415632,
+      "grad_norm": 0.0026343679055571556,
+      "learning_rate": 0.001,
+      "loss": 0.4401,
+      "step": 11971
+    },
+    {
+      "epoch": 0.33033383114262754,
+      "grad_norm": 0.002500650705769658,
+      "learning_rate": 0.001,
+      "loss": 0.4046,
+      "step": 11972
+    },
+    {
+      "epoch": 0.3303614233436919,
+      "grad_norm": 0.0031541388016194105,
+      "learning_rate": 0.001,
+      "loss": 0.4058,
+      "step": 11973
+    },
+    {
+      "epoch": 0.33038901554475625,
+      "grad_norm": 0.0035852715373039246,
+      "learning_rate": 0.001,
+      "loss": 0.3842,
+      "step": 11974
+    },
+    {
+      "epoch": 0.33041660774582066,
+      "grad_norm": 0.0040337685495615005,
+      "learning_rate": 0.001,
+      "loss": 0.4281,
+      "step": 11975
+    },
+    {
+      "epoch": 0.330444199946885,
+      "grad_norm": 0.0031562617514282465,
+      "learning_rate": 0.001,
+      "loss": 0.3547,
+      "step": 11976
+    },
+    {
+      "epoch": 0.33047179214794936,
+      "grad_norm": 0.004418304655700922,
+      "learning_rate": 0.001,
+      "loss": 0.3938,
+      "step": 11977
+    },
+    {
+      "epoch": 0.33049938434901377,
+      "grad_norm": 0.0028235798235982656,
+      "learning_rate": 0.001,
+      "loss": 0.3749,
+      "step": 11978
+    },
+    {
+      "epoch": 0.3305269765500781,
+      "grad_norm": 0.0024365021381527185,
+      "learning_rate": 0.001,
+      "loss": 0.4025,
+      "step": 11979
+    },
+    {
+      "epoch": 0.3305545687511425,
+      "grad_norm": 0.003930834122002125,
+      "learning_rate": 0.001,
+      "loss": 0.3923,
+      "step": 11980
+    },
+    {
+      "epoch": 0.3305821609522069,
+      "grad_norm": 0.002820041496306658,
+      "learning_rate": 0.001,
+      "loss": 0.4124,
+      "step": 11981
+    },
+    {
+      "epoch": 0.33060975315327124,
+      "grad_norm": 0.0039013477507978678,
+      "learning_rate": 0.001,
+      "loss": 0.4256,
+      "step": 11982
+    },
+    {
+      "epoch": 0.3306373453543356,
+      "grad_norm": 0.002920904429629445,
+      "learning_rate": 0.001,
+      "loss": 0.389,
+      "step": 11983
+    },
+    {
+      "epoch": 0.33066493755539994,
+      "grad_norm": 0.0036882515996694565,
+      "learning_rate": 0.001,
+      "loss": 0.3485,
+      "step": 11984
+    },
+    {
+      "epoch": 0.33069252975646435,
+      "grad_norm": 0.0034662606194615364,
+      "learning_rate": 0.001,
+      "loss": 0.4022,
+      "step": 11985
+    },
+    {
+      "epoch": 0.3307201219575287,
+      "grad_norm": 0.003996389918029308,
+      "learning_rate": 0.001,
+      "loss": 0.4232,
+      "step": 11986
+    },
+    {
+      "epoch": 0.33074771415859305,
+      "grad_norm": 0.002916965400800109,
+      "learning_rate": 0.001,
+      "loss": 0.4265,
+      "step": 11987
+    },
+    {
+      "epoch": 0.33077530635965746,
+      "grad_norm": 0.0024744670372456312,
+      "learning_rate": 0.001,
+      "loss": 0.4377,
+      "step": 11988
+    },
+    {
+      "epoch": 0.3308028985607218,
+      "grad_norm": 0.005105322692543268,
+      "learning_rate": 0.001,
+      "loss": 0.412,
+      "step": 11989
+    },
+    {
+      "epoch": 0.33083049076178617,
+      "grad_norm": 0.0023484451230615377,
+      "learning_rate": 0.001,
+      "loss": 0.4235,
+      "step": 11990
+    },
+    {
+      "epoch": 0.3308580829628506,
+      "grad_norm": 0.0028626341372728348,
+      "learning_rate": 0.001,
+      "loss": 0.3979,
+      "step": 11991
+    },
+    {
+      "epoch": 0.3308856751639149,
+      "grad_norm": 0.0024867048487067223,
+      "learning_rate": 0.001,
+      "loss": 0.4314,
+      "step": 11992
+    },
+    {
+      "epoch": 0.3309132673649793,
+      "grad_norm": 0.0038186991587281227,
+      "learning_rate": 0.001,
+      "loss": 0.3815,
+      "step": 11993
+    },
+    {
+      "epoch": 0.33094085956604363,
+      "grad_norm": 0.002811636310070753,
+      "learning_rate": 0.001,
+      "loss": 0.4059,
+      "step": 11994
+    },
+    {
+      "epoch": 0.33096845176710804,
+      "grad_norm": 0.0019408464431762695,
+      "learning_rate": 0.001,
+      "loss": 0.4154,
+      "step": 11995
+    },
+    {
+      "epoch": 0.3309960439681724,
+      "grad_norm": 0.0032817174214869738,
+      "learning_rate": 0.001,
+      "loss": 0.39,
+      "step": 11996
+    },
+    {
+      "epoch": 0.33102363616923675,
+      "grad_norm": 0.005393838509917259,
+      "learning_rate": 0.001,
+      "loss": 0.3443,
+      "step": 11997
+    },
+    {
+      "epoch": 0.33105122837030115,
+      "grad_norm": 0.002987619722262025,
+      "learning_rate": 0.001,
+      "loss": 0.4412,
+      "step": 11998
+    },
+    {
+      "epoch": 0.3310788205713655,
+      "grad_norm": 0.004077446181327105,
+      "learning_rate": 0.001,
+      "loss": 0.3921,
+      "step": 11999
+    },
+    {
+      "epoch": 0.33110641277242986,
+      "grad_norm": 0.0028092057909816504,
+      "learning_rate": 0.001,
+      "loss": 0.3888,
+      "step": 12000
+    },
+    {
+      "epoch": 0.33110641277242986,
+      "eval_runtime": 23.4032,
+      "eval_samples_per_second": 1.367,
+      "eval_steps_per_second": 0.171,
+      "step": 12000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 36242,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 2000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.647879841911721e+19,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}